several bugfixes.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@41 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
434cb894a8
commit
cb02051481
|
@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
|
||||||
<h1>PDFMiner</h1>
|
<h1>PDFMiner</h1>
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Jun 29 23:57:07 JST 2008
|
Last Modified: Tue Jul 1 00:02:48 JST 2008
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -54,8 +54,8 @@ http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080629.tar.gz
|
||||||
|
|
||||||
<P>
|
<P>
|
||||||
<strong>Svn repository:</strong><br>
|
<strong>Svn repository:</strong><br>
|
||||||
<a href="http://code.google.com/p/pdfminerr/source/browse">
|
<a href="http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer">
|
||||||
http://code.google.com/p/pdfminerr/source/browse
|
http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
<P>
|
<P>
|
||||||
|
|
|
@ -856,7 +856,7 @@ class PDFPageInterpreter:
|
||||||
return
|
return
|
||||||
# setleading
|
# setleading
|
||||||
def do_TL(self, leading):
|
def do_TL(self, leading):
|
||||||
self.textstate.leading = leading
|
self.textstate.leading = -leading
|
||||||
return
|
return
|
||||||
# selectfont
|
# selectfont
|
||||||
def do_Tf(self, fontid, fontsize):
|
def do_Tf(self, fontid, fontsize):
|
||||||
|
|
|
@ -362,7 +362,7 @@ class PDFXRefStream(object):
|
||||||
(_,genno) = parser.nexttoken() # ignored
|
(_,genno) = parser.nexttoken() # ignored
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
(_,stream) = parser.nextobject()
|
(_,stream) = parser.nextobject()
|
||||||
if stream.dic['Type'] != LITERAL_XREF:
|
if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
|
||||||
raise PDFNoValidXRef('invalid stream spec.')
|
raise PDFNoValidXRef('invalid stream spec.')
|
||||||
size = stream.dic['Size']
|
size = stream.dic['Size']
|
||||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||||
|
@ -450,7 +450,7 @@ class PDFDocument:
|
||||||
def set_root(self, root):
|
def set_root(self, root):
|
||||||
self.root = root
|
self.root = root
|
||||||
self.catalog = dict_value(self.root)
|
self.catalog = dict_value(self.root)
|
||||||
if self.catalog['Type'] != LITERAL_CATALOG:
|
if self.catalog.get('Type') != LITERAL_CATALOG:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('Catalog not found!')
|
raise PDFValueError('Catalog not found!')
|
||||||
self.outline = self.catalog.get('Outline')
|
self.outline = self.catalog.get('Outline')
|
||||||
|
@ -504,7 +504,7 @@ class PDFDocument:
|
||||||
hash.update(docid[0]) # 3
|
hash.update(docid[0]) # 3
|
||||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||||
for i in xrange(1,19+1):
|
for i in xrange(1,19+1):
|
||||||
k = ''.join( chr(c ^ i) for c in key )
|
k = ''.join( chr(ord(c) ^ i) for c in key )
|
||||||
x = Arcfour(k).process(x)
|
x = Arcfour(k).process(x)
|
||||||
u1 = x+x # 32bytes total
|
u1 = x+x # 32bytes total
|
||||||
if R == 2:
|
if R == 2:
|
||||||
|
@ -599,16 +599,17 @@ class PDFDocument:
|
||||||
for (k,v) in parent.iteritems():
|
for (k,v) in parent.iteritems():
|
||||||
if k in self.INHERITABLE_ATTRS and k not in tree:
|
if k in self.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree['Type'] == LITERAL_PAGES:
|
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
||||||
for c in tree['Kids']:
|
for c in tree['Kids']:
|
||||||
for x in search(c, tree):
|
for x in search(c, tree):
|
||||||
yield x
|
yield x
|
||||||
elif tree['Type'] == LITERAL_PAGE:
|
elif tree.get('Type') == LITERAL_PAGE:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
print >>stderr, 'Page: %r' % tree
|
print >>stderr, 'Page: %r' % tree
|
||||||
yield tree
|
yield tree
|
||||||
|
if 'Pages' not in self.catalog: return
|
||||||
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
|
||||||
yield PDFPage(self, i, tree)
|
yield PDFPage(self, i, tree)
|
||||||
return
|
return
|
||||||
|
|
|
@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
class TextExtractionNotAllowed(RuntimeError): pass
|
class TextExtractionNotAllowed(RuntimeError): pass
|
||||||
|
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=10, html=False, password='', debug=0):
|
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0):
|
||||||
device = TextConverter(rsrc, debug=debug)
|
device = TextConverter(rsrc, debug=debug)
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
|
@ -238,7 +238,7 @@ def main(argv):
|
||||||
print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
|
print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -247,6 +247,7 @@ def main(argv):
|
||||||
cdbcmapdir = 'CDBCMap'
|
cdbcmapdir = 'CDBCMap'
|
||||||
codec = 'ascii'
|
codec = 'ascii'
|
||||||
pages = set()
|
pages = set()
|
||||||
|
maxpages = 0
|
||||||
html = False
|
html = False
|
||||||
password = ''
|
password = ''
|
||||||
outfp = stdout
|
outfp = stdout
|
||||||
|
@ -255,6 +256,7 @@ def main(argv):
|
||||||
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
|
elif k == '-m': maxpages = int(v)
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
elif k == '-D': cdbcmapdir = v
|
elif k == '-D': cdbcmapdir = v
|
||||||
elif k == '-H': html = True
|
elif k == '-H': html = True
|
||||||
|
@ -263,7 +265,8 @@ def main(argv):
|
||||||
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
||||||
rsrc = PDFResourceManager(debug=debug)
|
rsrc = PDFResourceManager(debug=debug)
|
||||||
for fname in args:
|
for fname in args:
|
||||||
pdf2txt(outfp, rsrc, fname, pages, codec, html=html, password=password, debug=debug)
|
pdf2txt(outfp, rsrc, fname, pages, codec,
|
||||||
|
maxpages=maxpages, html=html, password=password, debug=debug)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
Loading…
Reference in New Issue