From cb020514816e53b315a49cea2e445b33a5e8c7b9 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Thu, 3 Jul 2008 15:51:44 +0000 Subject: [PATCH] several bugfixes. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@41 1aa58f4a-7d42-0410-adbc-911cccaed67c --- README.html | 6 +++--- pdflib/pdfinterp.py | 2 +- pdflib/pdfparser.py | 11 ++++++----- tools/pdf2txt.py | 9 ++++++--- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/README.html b/README.html index 40957e2..8772b56 100644 --- a/README.html +++ b/README.html @@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }

PDFMiner

-Last Modified: Sun Jun 29 23:57:07 JST 2008 +Last Modified: Tue Jul 1 00:02:48 JST 2008
@@ -54,8 +54,8 @@ http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080629.tar.gz

Svn repository:
- -http://code.google.com/p/pdfminerr/source/browse + +http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer

diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 01fde03..d045769 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -856,7 +856,7 @@ class PDFPageInterpreter: return # setleading def do_TL(self, leading): - self.textstate.leading = leading + self.textstate.leading = -leading return # selectfont def do_Tf(self, fontid, fontsize): diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 494b6db..8a4be0f 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -362,7 +362,7 @@ class PDFXRefStream(object): (_,genno) = parser.nexttoken() # ignored (_,kwd) = parser.nexttoken() (_,stream) = parser.nextobject() - if stream.dic['Type'] != LITERAL_XREF: + if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF: raise PDFNoValidXRef('invalid stream spec.') size = stream.dic['Size'] (start, nobjs) = stream.dic.get('Index', (0,size)) @@ -450,7 +450,7 @@ class PDFDocument: def set_root(self, root): self.root = root self.catalog = dict_value(self.root) - if self.catalog['Type'] != LITERAL_CATALOG: + if self.catalog.get('Type') != LITERAL_CATALOG: if STRICT: raise PDFValueError('Catalog not found!') self.outline = self.catalog.get('Outline') @@ -504,7 +504,7 @@ class PDFDocument: hash.update(docid[0]) # 3 x = Arcfour(key).process(hash.digest()[:16]) # 4 for i in xrange(1,19+1): - k = ''.join( chr(c ^ i) for c in key ) + k = ''.join( chr(ord(c) ^ i) for c in key ) x = Arcfour(k).process(x) u1 = x+x # 32bytes total if R == 2: @@ -599,16 +599,17 @@ class PDFDocument: for (k,v) in parent.iteritems(): if k in self.INHERITABLE_ATTRS and k not in tree: tree[k] = v - if tree['Type'] == LITERAL_PAGES: + if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree: if 1 <= debug: print >>stderr, 'Pages: Kids=%r' % tree['Kids'] for c in tree['Kids']: for x in search(c, tree): yield x - elif tree['Type'] == LITERAL_PAGE: + elif tree.get('Type') == LITERAL_PAGE: if 1 <= debug: print >>stderr, 'Page: %r' % tree yield tree + if 'Pages' not in self.catalog: return for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)): yield PDFPage(self, i, tree) return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index a802806..33d652b 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -205,7 +205,7 @@ class TextConverter(PDFDevice): # pdf2txt class TextExtractionNotAllowed(RuntimeError): pass -def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=10, html=False, password='', debug=0): +def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0): device = TextConverter(rsrc, debug=debug) doc = PDFDocument(debug=debug) fp = file(fname, 'rb') @@ -238,7 +238,7 @@ def main(argv): print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:') except getopt.GetoptError: return usage() if not args: return usage() @@ -247,6 +247,7 @@ def main(argv): cdbcmapdir = 'CDBCMap' codec = 'ascii' pages = set() + maxpages = 0 html = False password = '' outfp = stdout @@ -255,6 +256,7 @@ def main(argv): elif k == '-p': pages.update( int(x)-1 for x in v.split(',') ) elif k == '-P': password = v elif k == '-c': codec = v + elif k == '-m': maxpages = int(v) elif k == '-C': cmapdir = v elif k == '-D': cdbcmapdir = v elif k == '-H': html = True @@ -263,7 +265,8 @@ def main(argv): CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) rsrc = PDFResourceManager(debug=debug) for fname in args: - pdf2txt(outfp, rsrc, fname, pages, codec, html=html, password=password, debug=debug) + pdf2txt(outfp, rsrc, fname, pages, codec, + maxpages=maxpages, html=html, password=password, debug=debug) return if __name__ == '__main__': sys.exit(main(sys.argv))