Python 3.4 support added and tested
parent
846cd18186
commit
a6475b61b4
|
@ -9,6 +9,8 @@ This code is in the public domain.
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
|
|
||||||
# ascii85decode(data)
|
# ascii85decode(data)
|
||||||
def ascii85decode(data):
|
def ascii85decode(data):
|
||||||
|
@ -31,7 +33,8 @@ def ascii85decode(data):
|
||||||
"""
|
"""
|
||||||
n = b = 0
|
n = b = 0
|
||||||
out = b''
|
out = b''
|
||||||
for c in data:
|
for i in six.iterbytes(data):
|
||||||
|
c=six.int2byte(i)
|
||||||
if b'!' <= c and c <= b'u':
|
if b'!' <= c and c <= b'u':
|
||||||
n += 1
|
n += 1
|
||||||
b = b*85+(ord(c)-33)
|
b = b*85+(ord(c)-33)
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
|
|
||||||
class CorruptDataError(Exception):
|
class CorruptDataError(Exception):
|
||||||
pass
|
pass
|
||||||
|
@ -47,7 +49,7 @@ class LZWDecoder(object):
|
||||||
def feed(self, code):
|
def feed(self, code):
|
||||||
x = b''
|
x = b''
|
||||||
if code == 256:
|
if code == 256:
|
||||||
self.table = [chr(c) for c in xrange(256)] # 0-255
|
self.table = [six.int2byte(c) for c in range(256)] # 0-255
|
||||||
self.table.append(None) # 256
|
self.table.append(None) # 256
|
||||||
self.table.append(None) # 257
|
self.table.append(None) # 257
|
||||||
self.prevbuf = b''
|
self.prevbuf = b''
|
||||||
|
@ -99,7 +101,8 @@ def lzwdecode(data):
|
||||||
'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
|
||||||
"""
|
"""
|
||||||
fp = BytesIO(data)
|
fp = BytesIO(data)
|
||||||
return b''.join(LZWDecoder(fp).run())
|
s=LZWDecoder(fp).run()
|
||||||
|
return b''.join(s)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import doctest
|
import doctest
|
||||||
|
|
|
@ -180,7 +180,7 @@ class PDFXRefFallback(PDFXRef):
|
||||||
logging.info('trailer: %r' % self.get_trailer())
|
logging.info('trailer: %r' % self.get_trailer())
|
||||||
break
|
break
|
||||||
if six.PY3:
|
if six.PY3:
|
||||||
line=line.decode('utf-8')
|
line=line.decode('latin-1') #default pdf encoding
|
||||||
m = self.PDFOBJ_CUE.match(line)
|
m = self.PDFOBJ_CUE.match(line)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -12,6 +12,7 @@ from .psparser import STRICT
|
||||||
from .utils import apply_png_predictor
|
from .utils import apply_png_predictor
|
||||||
from .utils import isnumber
|
from .utils import isnumber
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
LITERAL_CRYPT = LIT('Crypt')
|
LITERAL_CRYPT = LIT('Crypt')
|
||||||
|
|
||||||
|
@ -105,7 +106,7 @@ def decipher_all(decipher, objid, genno, x):
|
||||||
if isinstance(x, list):
|
if isinstance(x, list):
|
||||||
x = [decipher_all(decipher, objid, genno, v) for v in x]
|
x = [decipher_all(decipher, objid, genno, v) for v in x]
|
||||||
elif isinstance(x, dict):
|
elif isinstance(x, dict):
|
||||||
for (k, v) in x.iteritems():
|
for (k, v) in six.iteritems(x):
|
||||||
x[k] = decipher_all(decipher, objid, genno, v)
|
x[k] = decipher_all(decipher, objid, genno, v)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
@ -140,7 +141,7 @@ def num_value(x):
|
||||||
|
|
||||||
def str_value(x):
|
def str_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, str):
|
if not isinstance(x, six.binary_type):
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFTypeError('String required: %r' % x)
|
raise PDFTypeError('String required: %r' % x)
|
||||||
return ''
|
return ''
|
||||||
|
|
|
@ -17,7 +17,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
|
||||||
i = 0
|
i = 0
|
||||||
buf = b''
|
buf = b''
|
||||||
line0 = b'\x00' * columns
|
line0 = b'\x00' * columns
|
||||||
for i in xrange(0, len(data), nbytes+1):
|
for i in range(0, len(data), nbytes+1):
|
||||||
ft = data[i]
|
ft = data[i]
|
||||||
i += 1
|
i += 1
|
||||||
line1 = data[i:i+nbytes]
|
line1 = data[i:i+nbytes]
|
||||||
|
@ -90,7 +90,7 @@ def apply_matrix_norm(m, v):
|
||||||
|
|
||||||
# isnumber
|
# isnumber
|
||||||
def isnumber(x):
|
def isnumber(x):
|
||||||
return isinstance(x, (int, long, float))
|
return isinstance(x, (six.integer_types, float))
|
||||||
|
|
||||||
# uniq
|
# uniq
|
||||||
def uniq(objs):
|
def uniq(objs):
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -31,6 +31,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
|
||||||
],
|
],
|
||||||
keywords=['pdf parser', 'pdf converter', 'layout analysis', 'text mining'],
|
keywords=['pdf parser', 'pdf converter', 'layout analysis', 'text mining'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
'Programming Language :: Python',
|
||||||
|
'Programming Language :: Python :: 2.7',
|
||||||
|
'Programming Language :: Python :: 3.4',
|
||||||
'Development Status :: 4 - Beta',
|
'Development Status :: 4 - Beta',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'Intended Audience :: Developers',
|
'Intended Audience :: Developers',
|
||||||
|
|
|
@ -0,0 +1,107 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from nose.tools import assert_equal, assert_true, assert_false
|
||||||
|
from nose import SkipTest
|
||||||
|
import nose
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from pdfminer.psparser import *
|
||||||
|
|
||||||
|
## Simplistic Test cases
|
||||||
|
##
|
||||||
|
class TestPSBaseParser:
|
||||||
|
|
||||||
|
TESTDATA = br'''%!PS
|
||||||
|
begin end
|
||||||
|
" @ #
|
||||||
|
/a/BCD /Some_Name /foo#5f#xbaa
|
||||||
|
0 +1 -2 .5 1.234
|
||||||
|
(abc) () (abc ( def ) ghi)
|
||||||
|
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
|
||||||
|
(this % is not a comment.)
|
||||||
|
(foo
|
||||||
|
baa)
|
||||||
|
(foo\
|
||||||
|
baa)
|
||||||
|
<> <20> < 40 4020 >
|
||||||
|
<abcd00
|
||||||
|
12345>
|
||||||
|
func/a/b{(c)do*}def
|
||||||
|
[ 1 (z) ! ]
|
||||||
|
<< /foo (bar) >>
|
||||||
|
'''
|
||||||
|
|
||||||
|
TOKENS = [
|
||||||
|
(5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
|
||||||
|
(21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||||
|
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||||
|
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
||||||
|
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||||
|
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
|
||||||
|
(191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
|
||||||
|
(226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
|
||||||
|
(234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
|
||||||
|
(242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
|
||||||
|
(256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
|
||||||
|
(272, KWD(b'>>'))
|
||||||
|
]
|
||||||
|
|
||||||
|
OBJS = [
|
||||||
|
(23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
|
||||||
|
(41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
|
||||||
|
(65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
|
||||||
|
(98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
|
||||||
|
(143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
|
||||||
|
(191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
|
||||||
|
(230, LIT('a')), (232, LIT('b')), (234, [b'c']), (246, [1, b'z']),
|
||||||
|
(258, {'foo': b'bar'}),
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_tokens(self, s):
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
class MyParser(PSBaseParser):
|
||||||
|
def flush(self):
|
||||||
|
self.add_results(*self.popall())
|
||||||
|
parser = MyParser(BytesIO(s))
|
||||||
|
r = []
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
r.append(parser.nexttoken())
|
||||||
|
except PSEOF:
|
||||||
|
pass
|
||||||
|
return r
|
||||||
|
|
||||||
|
def get_objects(self, s):
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
class MyParser(PSStackParser):
|
||||||
|
def flush(self):
|
||||||
|
self.add_results(*self.popall())
|
||||||
|
parser = MyParser(BytesIO(s))
|
||||||
|
r = []
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
r.append(parser.nextobject())
|
||||||
|
except PSEOF:
|
||||||
|
pass
|
||||||
|
return r
|
||||||
|
|
||||||
|
def test_1(self):
|
||||||
|
tokens = self.get_tokens(self.TESTDATA)
|
||||||
|
logging.info(tokens)
|
||||||
|
assert_equal(tokens, self.TOKENS)
|
||||||
|
return
|
||||||
|
|
||||||
|
def test_2(self):
|
||||||
|
objs = self.get_objects(self.TESTDATA)
|
||||||
|
logging.info(objs)
|
||||||
|
assert_equal(objs, self.OBJS)
|
||||||
|
return
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import logging,sys,os,six
|
||||||
|
logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
|
||||||
|
nose.runmodule()
|
|
@ -0,0 +1,46 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest, logging, os
|
||||||
|
|
||||||
|
import tools.dumppdf as dumppdf
|
||||||
|
|
||||||
|
path=os.path.dirname(os.path.abspath(__file__))+'/'
|
||||||
|
|
||||||
|
def run(datapath,filename,options=None):
|
||||||
|
i=path+datapath+filename+'.pdf'
|
||||||
|
o=path+filename+'.xml'
|
||||||
|
if options:
|
||||||
|
s='dumppdf -o%s %s %s'%(o,options,i)
|
||||||
|
else:
|
||||||
|
s='dumppdf -o%s %s'%(o,i)
|
||||||
|
dumppdf.main(s.split(' '))
|
||||||
|
|
||||||
|
class TestDumpPDF(unittest.TestCase):
|
||||||
|
|
||||||
|
|
||||||
|
def test_1(self):
|
||||||
|
run('../samples/','jo','-t -a')
|
||||||
|
run('../samples/','simple1','-t -a')
|
||||||
|
run('../samples/','simple2','-t -a')
|
||||||
|
run('../samples/','simple3','-t -a')
|
||||||
|
|
||||||
|
def test_2(self):
|
||||||
|
run('../samples/nonfree/','dmca','-t -a')
|
||||||
|
|
||||||
|
def test_3(self):
|
||||||
|
run('../samples/nonfree/','f1040nr')
|
||||||
|
|
||||||
|
def test_4(self):
|
||||||
|
run('../samples/nonfree/','i1040nr')
|
||||||
|
|
||||||
|
def test_5(self):
|
||||||
|
run('../samples/nonfree/','kampo','-t -a')
|
||||||
|
|
||||||
|
def test_6(self):
|
||||||
|
run('../samples/nonfree/','naacl06-shinyama','-t -a')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import logging,sys,os,six
|
||||||
|
logging.basicConfig(level=logging.DEBUG, filename='%s_%d.%d.log'%(os.path.basename(__file__),sys.version_info[0],sys.version_info[1]))
|
||||||
|
unittest.main()
|
|
@ -6,7 +6,7 @@
|
||||||
# options:
|
# options:
|
||||||
# -i objid : object id
|
# -i objid : object id
|
||||||
#
|
#
|
||||||
import sys, os.path, re
|
import sys, os.path, re, logging
|
||||||
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
from pdfminer.psparser import PSKeyword, PSLiteral, LIT
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
|
||||||
|
@ -18,8 +18,12 @@ from pdfminer.utils import isnumber
|
||||||
|
|
||||||
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
|
||||||
def e(s):
|
def e(s):
|
||||||
|
if isinstance(s,six.binary_type):
|
||||||
|
s=str(s,'latin-1')
|
||||||
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
|
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
|
|
||||||
# dumpxml
|
# dumpxml
|
||||||
def dumpxml(out, obj, codec=None):
|
def dumpxml(out, obj, codec=None):
|
||||||
|
@ -29,7 +33,7 @@ def dumpxml(out, obj, codec=None):
|
||||||
|
|
||||||
if isinstance(obj, dict):
|
if isinstance(obj, dict):
|
||||||
out.write('<dict size="%d">\n' % len(obj))
|
out.write('<dict size="%d">\n' % len(obj))
|
||||||
for (k,v) in obj.iteritems():
|
for (k,v) in six.iteritems(obj):
|
||||||
out.write('<key>%s</key>\n' % k)
|
out.write('<key>%s</key>\n' % k)
|
||||||
out.write('<value>')
|
out.write('<value>')
|
||||||
dumpxml(out, v)
|
dumpxml(out, v)
|
||||||
|
@ -45,7 +49,7 @@ def dumpxml(out, obj, codec=None):
|
||||||
out.write('</list>')
|
out.write('</list>')
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, str):
|
if isinstance(obj, (six.string_types, six.binary_type)):
|
||||||
out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
|
out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -113,7 +117,7 @@ def dumpallobjs(out, doc, codec=None):
|
||||||
# dumpoutline
|
# dumpoutline
|
||||||
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
def dumpoutline(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None, extractdir=None):
|
dumpall=False, codec=None, extractdir=None):
|
||||||
fp = file(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
pages = dict( (page.pageid, pageno) for (pageno,page)
|
pages = dict( (page.pageid, pageno) for (pageno,page)
|
||||||
|
@ -183,7 +187,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||||
out.close()
|
out.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
fp = file(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
|
@ -191,12 +195,13 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
|
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
|
||||||
extract1(obj)
|
extract1(obj)
|
||||||
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
# dumppdf
|
# dumppdf
|
||||||
def dumppdf(outfp, fname, objids, pagenos, password='',
|
def dumppdf(outfp, fname, objids, pagenos, password='',
|
||||||
dumpall=False, codec=None, extractdir=None):
|
dumpall=False, codec=None, extractdir=None):
|
||||||
fp = file(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
parser = PDFParser(fp)
|
parser = PDFParser(fp)
|
||||||
doc = PDFDocument(parser, password)
|
doc = PDFDocument(parser, password)
|
||||||
if objids:
|
if objids:
|
||||||
|
@ -229,22 +234,21 @@ def main(argv):
|
||||||
print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
|
print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:')
|
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
debug = 0
|
|
||||||
objids = []
|
objids = []
|
||||||
pagenos = set()
|
pagenos = set()
|
||||||
codec = None
|
codec = None
|
||||||
password = ''
|
password = b''
|
||||||
dumpall = False
|
dumpall = False
|
||||||
proc = dumppdf
|
proc = dumppdf
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
extractdir = None
|
extractdir = None
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': logging.getLogger().setlevel(logging.DEBUG)
|
||||||
elif k == '-o': outfp = file(v, 'wb')
|
elif k == '-o': outfp = open(v, 'w')
|
||||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
|
@ -256,13 +260,10 @@ def main(argv):
|
||||||
elif k == '-E':
|
elif k == '-E':
|
||||||
extractdir = v
|
extractdir = v
|
||||||
proc = extractembedded
|
proc = extractembedded
|
||||||
#
|
|
||||||
PDFDocument.debug = debug
|
|
||||||
PDFParser.debug = debug
|
|
||||||
#
|
|
||||||
for fname in args:
|
for fname in args:
|
||||||
proc(outfp, fname, objids, pagenos, password=password,
|
proc(outfp, fname, objids, pagenos, password=password,
|
||||||
dumpall=dumpall, codec=codec, extractdir=extractdir)
|
dumpall=dumpall, codec=codec, extractdir=extractdir)
|
||||||
return
|
outfp.close()
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
Loading…
Reference in New Issue