diff --git a/Makefile b/Makefile index 0e793c5..4e20d5d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Makefile for pdfminer PACKAGE=pdfminer -VERSION=20080429 +VERSION=20080629 GNUTAR=tar SVN=svn PYTHON=python @@ -10,14 +10,21 @@ WORKDIR=/tmp DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTFILE=$(DISTNAME).tar.gz +CONV_CMAP=$(PYTHON) -m tools.conv_cmap + all: cdbcmap: CMap -mkdir CDBCMap - $(PYTHON) conv_cmap.py CMap/* + $(CONV_CMAP) CMap/* + +samples: + cd samples && make clean: - -rm *.pyc *.pyo *~ + cd pdfminer && make clean + cd tools && make clean + cd samples && make clean # Maintainance: diff --git a/README.html b/README.html index 24669d3..5f9cec9 100644 --- a/README.html +++ b/README.html @@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
pdf2txt.py
extracts text contents from a PDF file.
It extracts all the texts that are to be rendered programatically.
It also extracts the corresponding locations, font names,
-and font sizes for each text portion in an SGML-like format.
-It cannot extract texts embedded within images
+and font sizes for each text portion. However,
+it cannot extract texts embedded within images
(i.e. it does not do optical character recognition).
You can provide a password for protected PDF documents
whose access is limited.
@@ -129,7 +129,8 @@ Unicode Standard.
Examples:
-$ ./pdf2txt.py samples/naacl06-shinyama.pdf +$ ./pdf2txt.py -H -o output.html samples/naacl06-shinyama.pdf +(extract text as an HTML file whose filename is output.html) $ ./pdf2txt.py -c euc-jp samples/jo.pdf (extract Japanese texts in vertical writing, CMap is required) @@ -145,15 +146,17 @@ Options:Speficies the output file name. By default, it prints the extracted contents to stdout. -
-p pageno
-Speficies the page number to be extracted. -Note that page numbers start from zero. -Multiple -p
options are allowed. +-p pageno[,pageno,...]
+Specifies the comma-separated list of the page numbers to be extracted. +Page numbers start from zero. By default, it extracts texts from all the pages.
-c codec
Speficies the output codec for non-ASCII texts. +
-H
+Specifies that the output should be an HTML file. +
-P password
Provides the user password to open the PDF file. diff --git a/TODO b/TODO index db6c611..9b12f11 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,6 @@ TODOs: - Documentation. - Error handling for invalid type. - - Simple viewer application using pygame. - Outlines. - Named Objects. (pages) diff --git a/pdfminer/Makefile b/pdfminer/Makefile new file mode 100644 index 0000000..efee7fc --- /dev/null +++ b/pdfminer/Makefile @@ -0,0 +1,6 @@ +# Makefile for pdfminer + +all: + +clean: + -rm *.pyc *.pyo diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/arcfour.py b/pdfminer/arcfour.py similarity index 100% rename from arcfour.py rename to pdfminer/arcfour.py diff --git a/cmap.py b/pdfminer/cmap.py similarity index 100% rename from cmap.py rename to pdfminer/cmap.py diff --git a/fontmetrics.py b/pdfminer/fontmetrics.py similarity index 100% rename from fontmetrics.py rename to pdfminer/fontmetrics.py diff --git a/glyphlist.py b/pdfminer/glyphlist.py similarity index 100% rename from glyphlist.py rename to pdfminer/glyphlist.py diff --git a/latin_enc.py b/pdfminer/latin_enc.py similarity index 100% rename from latin_enc.py rename to pdfminer/latin_enc.py diff --git a/lzw.py b/pdfminer/lzw.py similarity index 100% rename from lzw.py rename to pdfminer/lzw.py diff --git a/pdfinterp.py b/pdfminer/pdfinterp.py similarity index 100% rename from pdfinterp.py rename to pdfminer/pdfinterp.py diff --git a/pdfparser.py b/pdfminer/pdfparser.py similarity index 100% rename from pdfparser.py rename to pdfminer/pdfparser.py diff --git a/psparser.py b/pdfminer/psparser.py similarity index 100% rename from psparser.py rename to pdfminer/psparser.py diff --git a/pycdb.py b/pdfminer/pycdb.py similarity index 100% rename from pycdb.py rename to pdfminer/pycdb.py diff --git a/utils.py b/pdfminer/utils.py similarity index 100% rename from utils.py rename to pdfminer/utils.py diff --git a/samples/Makefile b/samples/Makefile new file mode 
100644 index 0000000..bde7570 --- /dev/null +++ b/samples/Makefile @@ -0,0 +1,24 @@ +# GNUMakefile for test + +PYTHON=python +PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt + +HTMLS= \ + simple1.html \ + simple2.html \ + dmca.html \ + f1040nr.html \ + i1040nr.html \ + jo.html \ + kampo.html \ + naacl06-shinyama.html \ + nlp2004slides.html + +all: $(HTMLS) + +clean: + -rm $(HTMLS) + +.SUFFIXES: .pdf .html +.pdf.html: + $(PDF2TXT) -H -o $@ $< diff --git a/tools/Makefile b/tools/Makefile new file mode 100644 index 0000000..8299398 --- /dev/null +++ b/tools/Makefile @@ -0,0 +1,6 @@ +# Makefile for tools + +all: + +clean: + -rm *.pyc *.pyo diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/conv_afm.py b/tools/conv_afm.py similarity index 100% rename from conv_afm.py rename to tools/conv_afm.py diff --git a/conv_cmap.py b/tools/conv_cmap.py similarity index 96% rename from conv_cmap.py rename to tools/conv_cmap.py index d8a8022..e5fc4ec 100755 --- a/conv_cmap.py +++ b/tools/conv_cmap.py @@ -7,7 +7,7 @@ def dumpcdb(cmap, cdbfile, verbose=1): try: import cdb except ImportError: - import pycdb as cdb + import pdfminer.pycdb as cdb m = cdb.cdbmake(cdbfile, cdbfile+'.tmp') if verbose: print >>stderr, 'Writing: %r...' 
% cdbfile @@ -21,7 +21,7 @@ def dumpcdb(cmap, cdbfile, verbose=1): return def convert_cmap(files, cmapdir, cdbcmapdir, force=False): - from cmap import CMapDB + from pdfminer.cmap import CMapDB CMapDB.initialize(cmapdir) for fname in files: if fname.endswith('.upr'): continue diff --git a/dumppdf.py b/tools/dumppdf.py similarity index 98% rename from dumppdf.py rename to tools/dumppdf.py index c86cf3a..0568d1a 100755 --- a/dumppdf.py +++ b/tools/dumppdf.py @@ -7,7 +7,7 @@ # -i objid : object id # import sys, re -from pdfparser import PDFDocument, PDFParser, PDFStream, \ +from pdfminer.pdfparser import PDFDocument, PDFParser, PDFStream, \ PDFObjRef, PSKeyword, PSLiteral stdout = sys.stdout stderr = sys.stderr diff --git a/extent.py b/tools/extent.py similarity index 100% rename from extent.py rename to tools/extent.py diff --git a/pdf2txt.py b/tools/pdf2txt.py similarity index 97% rename from pdf2txt.py rename to tools/pdf2txt.py index 8b2736f..071e8e3 100755 --- a/pdf2txt.py +++ b/tools/pdf2txt.py @@ -2,11 +2,11 @@ import sys stdout = sys.stdout stderr = sys.stderr -from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect -from pdfinterp import PDFDevice, PDFResourceManager, \ +from pdfminer.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect +from pdfminer.pdfinterp import PDFDevice, PDFResourceManager, \ PDFPageInterpreter, PDFUnicodeNotDefined, \ mult_matrix, apply_matrix -from cmap import CMapDB +from pdfminer.cmap import CMapDB def enc(x, codec): diff --git a/sgml.py b/tools/sgml.py similarity index 100% rename from sgml.py rename to tools/sgml.py diff --git a/viewpdf.py b/tools/viewpdf.py similarity index 98% rename from viewpdf.py rename to tools/viewpdf.py index ec25d12..6721036 100755 --- a/viewpdf.py +++ b/tools/viewpdf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python import sys -from sgml import PDFSGMLParser, Document +from pdfminer.sgml import PDFSGMLParser, Document stdout = sys.stdout stderr = sys.stderr try: