diff --git a/README.html b/README.html index e53bfa6..8859d94 100644 --- a/README.html +++ b/README.html @@ -52,7 +52,8 @@ $ ./dumppdf.py foo.pdf

Extract the text:

-$ ./pdf2txt.py foo.pdf > foo.xml
+$ ./pdf2txt.py samples/naacl06-shinyama.pdf
+$ ./pdf2txt.py -c euc-jp samples/jo.pdf
 

diff --git a/conv_cmap.py b/conv_cmap.py index d1e5385..ef9830d 100755 --- a/conv_cmap.py +++ b/conv_cmap.py @@ -23,7 +23,7 @@ def dumpcdb(cmap, cdbfile, verbose=1): def convert_cmap(files, cmapdir, cdbcmapdir, force=False): from cmap import CMapDB CMapDB.initialize(cmapdir) - for fname in fiels: + for fname in files: cmapname = os.path.basename(fname) cdbname = os.path.join(cdbcmapdir, cmapname+'.cmap.cdb') if not force and os.path.exists(cdbname): diff --git a/samples/jo.pdf b/samples/jo.pdf new file mode 100644 index 0000000..53b1b1e Binary files /dev/null and b/samples/jo.pdf differ diff --git a/samples/naacl06-shinyama.pdf b/samples/naacl06-shinyama.pdf new file mode 100644 index 0000000..e14c8f9 Binary files /dev/null and b/samples/naacl06-shinyama.pdf differ diff --git a/utils.py b/utils.py index 5ceb333..9d9eeef 100644 --- a/utils.py +++ b/utils.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from struct import pack, unpack ## Utilities ##