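Several bugfixes: negate the text leading in do_TL, reject non-stream objects in PDFXRefStream, tolerate missing 'Type', 'Kids' and 'Pages' keys in PDFDocument, apply ord() to key characters when computing the user password, and add a -m (maxpages) option to pdf2txt.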

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@41 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-07-03 15:51:44 +00:00
parent 434cb894a8
commit cb02051481
4 changed files with 16 additions and 12 deletions

View File

@@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
<h1>PDFMiner</h1> <h1>PDFMiner</h1>
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Jun 29 23:57:07 JST 2008 Last Modified: Tue Jul 1 00:02:48 JST 2008
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@@ -54,8 +54,8 @@ http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080629.tar.gz
<P> <P>
<strong>Svn repository:</strong><br> <strong>Svn repository:</strong><br>
<a href="http://code.google.com/p/pdfminerr/source/browse"> <a href="http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer">
http://code.google.com/p/pdfminerr/source/browse http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer
</a> </a>
<P> <P>

View File

@@ -856,7 +856,7 @@ class PDFPageInterpreter:
return return
# setleading # setleading
def do_TL(self, leading): def do_TL(self, leading):
self.textstate.leading = leading self.textstate.leading = -leading
return return
# selectfont # selectfont
def do_Tf(self, fontid, fontsize): def do_Tf(self, fontid, fontsize):

View File

@@ -362,7 +362,7 @@ class PDFXRefStream(object):
(_,genno) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken() (_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject() (_,stream) = parser.nextobject()
if stream.dic['Type'] != LITERAL_XREF: if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
raise PDFNoValidXRef('invalid stream spec.') raise PDFNoValidXRef('invalid stream spec.')
size = stream.dic['Size'] size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size)) (start, nobjs) = stream.dic.get('Index', (0,size))
@@ -450,7 +450,7 @@ class PDFDocument:
def set_root(self, root): def set_root(self, root):
self.root = root self.root = root
self.catalog = dict_value(self.root) self.catalog = dict_value(self.root)
if self.catalog['Type'] != LITERAL_CATALOG: if self.catalog.get('Type') != LITERAL_CATALOG:
if STRICT: if STRICT:
raise PDFValueError('Catalog not found!') raise PDFValueError('Catalog not found!')
self.outline = self.catalog.get('Outline') self.outline = self.catalog.get('Outline')
@@ -504,7 +504,7 @@ class PDFDocument:
hash.update(docid[0]) # 3 hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4 x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1): for i in xrange(1,19+1):
k = ''.join( chr(c ^ i) for c in key ) k = ''.join( chr(ord(c) ^ i) for c in key )
x = Arcfour(k).process(x) x = Arcfour(k).process(x)
u1 = x+x # 32bytes total u1 = x+x # 32bytes total
if R == 2: if R == 2:
@@ -599,16 +599,17 @@ class PDFDocument:
for (k,v) in parent.iteritems(): for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree: if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree['Type'] == LITERAL_PAGES: if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
if 1 <= debug: if 1 <= debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids'] print >>stderr, 'Pages: Kids=%r' % tree['Kids']
for c in tree['Kids']: for c in tree['Kids']:
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree['Type'] == LITERAL_PAGE: elif tree.get('Type') == LITERAL_PAGE:
if 1 <= debug: if 1 <= debug:
print >>stderr, 'Page: %r' % tree print >>stderr, 'Page: %r' % tree
yield tree yield tree
if 'Pages' not in self.catalog: return
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)): for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
yield PDFPage(self, i, tree) yield PDFPage(self, i, tree)
return return

View File

@@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
# pdf2txt # pdf2txt
class TextExtractionNotAllowed(RuntimeError): pass class TextExtractionNotAllowed(RuntimeError): pass
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=10, html=False, password='', debug=0): def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0):
device = TextConverter(rsrc, debug=debug) device = TextConverter(rsrc, debug=debug)
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname, 'rb') fp = file(fname, 'rb')
@@ -238,7 +238,7 @@ def main(argv):
print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:') (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@@ -247,6 +247,7 @@ def main(argv):
cdbcmapdir = 'CDBCMap' cdbcmapdir = 'CDBCMap'
codec = 'ascii' codec = 'ascii'
pages = set() pages = set()
maxpages = 0
html = False html = False
password = '' password = ''
outfp = stdout outfp = stdout
@@ -255,6 +256,7 @@ def main(argv):
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-m': maxpages = int(v)
elif k == '-C': cmapdir = v elif k == '-C': cmapdir = v
elif k == '-D': cdbcmapdir = v elif k == '-D': cdbcmapdir = v
elif k == '-H': html = True elif k == '-H': html = True
@@ -263,7 +265,8 @@ def main(argv):
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
rsrc = PDFResourceManager(debug=debug) rsrc = PDFResourceManager(debug=debug)
for fname in args: for fname in args:
pdf2txt(outfp, rsrc, fname, pages, codec, html=html, password=password, debug=debug) pdf2txt(outfp, rsrc, fname, pages, codec,
maxpages=maxpages, html=html, password=password, debug=debug)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))