diff --git a/Makefile b/Makefile index e4cbd27..1024671 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ # Makefile for pdfminer PACKAGE=pdfminer -VERSION=20090330 -GNUTAR=tar -SVN=svn -PYTHON=python -WORKDIR=/tmp +SVN=svn +GNUTAR=tar +PYTHON=python +TMPDIR=/tmp +VERSION=`$(PYTHON) $(PACKAGE)/__init__.py` DISTNAME=$(PACKAGE)-dist-$(VERSION) DISTFILE=$(DISTNAME).tar.gz @@ -14,34 +14,33 @@ CONV_CMAP=$(PYTHON) -m tools.conv_cmap all: +clean: + -rm -rf build + -cd $(PACKAGE) && make clean + -cd tools && make clean + -cd samples && make clean + +test: + cd samples && make test + cdbcmap: CMap -mkdir CDBCMap $(CONV_CMAP) CMap/* -test: - cd samples && make - -clean: - -cd pdflib && make clean - -cd tools && make clean - -cd samples && make clean - -rm -rf build - # Maintainance: - -pack: clean - $(SVN) cleanup - $(SVN) export . $(WORKDIR)/$(DISTNAME) - $(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner - -rm -rf $(WORKDIR)/$(DISTNAME) - -check: - -pychecker --limit=0 *.py - commit: clean $(SVN) commit +check: + cd $(PACKAGE) && make check + +dist: clean + $(SVN) cleanup + $(SVN) export . $(TMPDIR)/$(DISTNAME) + $(GNUTAR) c -z -C$(TMPDIR) -f $(TMPDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner + -rm -rf $(TMPDIR)/$(DISTNAME) + WEBDIR=$$HOME/Site/unixuser.org/python/pdfminer -publish: pack - cp $(WORKDIR)/$(DISTFILE) $(WEBDIR) +publish: dist + cp $(TMPDIR)/$(DISTFILE) $(WEBDIR) cp README.html $(WEBDIR)/index.html diff --git a/pdfminer/Makefile b/pdfminer/Makefile index 19814b4..2298cc1 100644 --- a/pdfminer/Makefile +++ b/pdfminer/Makefile @@ -1,32 +1,11 @@ # Makefile for pdfminer -DESTDIR=/usr/local/src/pdflib +PYCHECKER=pychecker --limit=0 -PDFLIB = ${DESTDIR}/__init__.py \ - ${DESTDIR}/arcfour.py \ - ${DESTDIR}/ascii85.py \ - ${DESTDIR}/cmap.py \ - ${DESTDIR}/fontmetrics.py \ - ${DESTDIR}/glyphlist.py \ - ${DESTDIR}/latin_enc.py \ - ${DESTDIR}/lzw.py \ - ${DESTDIR}/pdf2txt.py \ - ${DESTDIR}/pdfcolor.py \ - ${DESTDIR}/pdfdevice.py \ - ${DESTDIR}/pdffont.py \ - ${DESTDIR}/pdfinterp.py \ - ${DESTDIR}/pdfparser.py \ - ${DESTDIR}/pdftypes.py \ - ${DESTDIR}/psparser.py \ - ${DESTDIR}/pycdb.py \ - ${DESTDIR}/rijndael.py \ - ${DESTDIR}/utils.py \ - -${DESTDIR}/%: % - cp $? $@ - chmod 755 $@ - -all: ${PDFLIB} +all: clean: -rm *.pyc *.pyo + +check: + $(PYCHECKER) *.py diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index e69de29..dc5fe9d 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python +__version__ = '20090330' + +if __name__ == '__main__': print __version__ diff --git a/pdfminer/converter.py b/pdfminer/converter.py index aae324a..3776699 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -45,7 +45,7 @@ class PDFPageAggregator(PDFDevice): def handle_undefined_char(self, cidcoding, cid): if self.debug: - print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) + print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) return self.undefined_char def paint_path(self, gstate, stroke, fill, evenodd, path): @@ -149,7 +149,7 @@ class TagExtractor(PDFDevice): text += char except PDFUnicodeNotDefined: pass - self.write(text) + self.outfp.write(enc(text, self.codec)) return def begin_page(self, page): @@ -306,18 +306,17 @@ class TextConverter(PDFConverter): def end_page(self, page): def render(item): if isinstance(item, LTText): - self.outfp.write(obj.text.encode(self.codec, 'replace')) - self.outfp.write('\n') + self.write(item.text+'\n') elif isinstance(item, LTTextBox): for line in item.get_lines(self.word_margin): - self.outfp.write(line.encode(self.codec, 'replace')+'\n') - self.outfp.write('\n') + self.write(line+'\n') + self.write('\n') elif isinstance(item, LayoutContainer): for child in item: render(child) page = PDFConverter.end_page(self, page) if self.showpageno: - self.outfp.write('Page %d\n' % page.id) + self.write('Page %d\n' % page.id) render(page) - self.outfp.write('\f') + self.write('\f') return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 2705e5b..aec5d03 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -362,7 +362,7 @@ class LTFigure(LayoutContainer): return def __repr__(self): - return ('
' % (self.id, self.get_bbox(), self.ctm)) + return ('
' % (self.id, self.get_bbox(), self.matrix)) ## LTTextBox diff --git a/pdfminer/lzw.py b/pdfminer/lzw.py index 5013dfd..5dfcee1 100644 --- a/pdfminer/lzw.py +++ b/pdfminer/lzw.py @@ -86,12 +86,12 @@ class LZWDecoder(object): def main(argv): import StringIO - input = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01' - fp = StringIO.StringIO(input) + data = '\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01' + fp = StringIO.StringIO(data) expected = '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' LZWDecoder.debug = 1 output = ''.join(LZWDecoder(fp).run()) - print (input, expected, output) + print (data, expected, output) print output == expected return 0 diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index f53482d..a3de399 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -243,7 +243,7 @@ class TrueTypeFont(object): self.tables = {} fonttype = fp.read(4) (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - for i in xrange(ntables): + for _ in xrange(ntables): (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) self.tables[name] = (offset, length) return diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 611bd93..a39e354 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -152,7 +152,7 @@ class PDFResourceManager(object): else: if STRICT: raise PDFFontError('Invalid Font spec: %r' % spec) - font = PDFType1Font(spec) # this is so wrong! + font = PDFType1Font(self, spec) # this is so wrong! if objid: self.fonts[objid] = font return font diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index 60b3985..9167cb3 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -55,7 +55,6 @@ class XRefObjRange(object): class PDFBaseXRef(object): def __init__(self): self.objid_ranges = None - self.objid_list = None return def objids(self): @@ -63,11 +62,9 @@ class PDFBaseXRef(object): for objid_range in self.objid_ranges: for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1): yield objid - else: - for objid in self.offsets: - yield objid return + ## PDFXRef ## class PDFXRef(PDFBaseXRef): @@ -154,7 +151,7 @@ class PDFXRefStream(PDFBaseXRef): return def __repr__(self): - return '' % (self.objid_first, self.objid_last) + return '' % (self.fl1, self.fl2, self.fl3) def load(self, parser, debug=0): (_,objid) = parser.nexttoken() # ignored diff --git a/samples/Makefile b/samples/Makefile index 08be431..fa5c728 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -16,11 +16,13 @@ HTMLS= \ naacl06-shinyama.html \ nlp2004slides.html -all: $(HTMLS) +all: clean: -rm $(HTMLS) +test: $(HTMLS) + .SUFFIXES: .pdf .html .sgml .txt .pdf.html: $(PDF2TXT) -t html -o $@ $<