Python 3.4 support added and tested

pull/1/head
unknown 2014-09-03 13:17:41 +02:00
parent 846cd18186
commit a6475b61b4
9 changed files with 188 additions and 24 deletions

View File

@ -9,6 +9,8 @@ This code is in the public domain.
import re import re
import struct import struct
import six #Python 2+3 compatibility
# ascii85decode(data) # ascii85decode(data)
def ascii85decode(data): def ascii85decode(data):
@ -31,7 +33,8 @@ def ascii85decode(data):
""" """
n = b = 0 n = b = 0
out = b'' out = b''
for c in data: for i in six.iterbytes(data):
c=six.int2byte(i)
if b'!' <= c and c <= b'u': if b'!' <= c and c <= b'u':
n += 1 n += 1
b = b*85+(ord(c)-33) b = b*85+(ord(c)-33)

View File

@ -1,6 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
from io import BytesIO from io import BytesIO
import six #Python 2+3 compatibility
class CorruptDataError(Exception): class CorruptDataError(Exception):
pass pass
@ -47,7 +49,7 @@ class LZWDecoder(object):
def feed(self, code): def feed(self, code):
x = b'' x = b''
if code == 256: if code == 256:
self.table = [chr(c) for c in xrange(256)] # 0-255 self.table = [six.int2byte(c) for c in range(256)] # 0-255
self.table.append(None) # 256 self.table.append(None) # 256
self.table.append(None) # 257 self.table.append(None) # 257
self.prevbuf = b'' self.prevbuf = b''
@ -99,7 +101,8 @@ def lzwdecode(data):
'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42' '\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
""" """
fp = BytesIO(data) fp = BytesIO(data)
return b''.join(LZWDecoder(fp).run()) s=LZWDecoder(fp).run()
return b''.join(s)
if __name__ == '__main__': if __name__ == '__main__':
import doctest import doctest

View File

@ -180,7 +180,7 @@ class PDFXRefFallback(PDFXRef):
logging.info('trailer: %r' % self.get_trailer()) logging.info('trailer: %r' % self.get_trailer())
break break
if six.PY3: if six.PY3:
line=line.decode('utf-8') line=line.decode('latin-1') #default pdf encoding
m = self.PDFOBJ_CUE.match(line) m = self.PDFOBJ_CUE.match(line)
if not m: if not m:
continue continue

View File

@ -12,6 +12,7 @@ from .psparser import STRICT
from .utils import apply_png_predictor from .utils import apply_png_predictor
from .utils import isnumber from .utils import isnumber
import six #Python 2+3 compatibility
LITERAL_CRYPT = LIT('Crypt') LITERAL_CRYPT = LIT('Crypt')
@ -105,7 +106,7 @@ def decipher_all(decipher, objid, genno, x):
if isinstance(x, list): if isinstance(x, list):
x = [decipher_all(decipher, objid, genno, v) for v in x] x = [decipher_all(decipher, objid, genno, v) for v in x]
elif isinstance(x, dict): elif isinstance(x, dict):
for (k, v) in x.iteritems(): for (k, v) in six.iteritems(x):
x[k] = decipher_all(decipher, objid, genno, v) x[k] = decipher_all(decipher, objid, genno, v)
return x return x
@ -140,7 +141,7 @@ def num_value(x):
def str_value(x): def str_value(x):
x = resolve1(x) x = resolve1(x)
if not isinstance(x, str): if not isinstance(x, six.binary_type):
if STRICT: if STRICT:
raise PDFTypeError('String required: %r' % x) raise PDFTypeError('String required: %r' % x)
return '' return ''

View File

@ -17,7 +17,7 @@ def apply_png_predictor(pred, colors, columns, bitspercomponent, data):
i = 0 i = 0
buf = b'' buf = b''
line0 = b'\x00' * columns line0 = b'\x00' * columns
for i in xrange(0, len(data), nbytes+1): for i in range(0, len(data), nbytes+1):
ft = data[i] ft = data[i]
i += 1 i += 1
line1 = data[i:i+nbytes] line1 = data[i:i+nbytes]
@ -90,7 +90,7 @@ def apply_matrix_norm(m, v):
# isnumber # isnumber
def isnumber(x): def isnumber(x):
return isinstance(x, (int, long, float)) return isinstance(x, (six.integer_types, float))
# uniq # uniq
def uniq(objs): def uniq(objs):

View File

@ -31,6 +31,9 @@ PDF parser that can be used for other purposes instead of text analysis.''',
], ],
keywords=['pdf parser', 'pdf converter', 'layout analysis', 'text mining'], keywords=['pdf parser', 'pdf converter', 'layout analysis', 'text mining'],
classifiers=[ classifiers=[
'Programming Language :: Python',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.4',
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',
'Environment :: Console', 'Environment :: Console',
'Intended Audience :: Developers', 'Intended Audience :: Developers',

View File

@ -0,0 +1,107 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nose.tools import assert_equal, assert_true, assert_false
from nose import SkipTest
import nose
import logging
from pdfminer.psparser import *
## Simplistic Test cases
##
class TestPSBaseParser:
    """Compare PSBaseParser / PSStackParser output with fixed expectations.

    TESTDATA is run through a PSBaseParser subclass (test_1) and a
    PSStackParser subclass (test_2); the collected results must equal
    TOKENS and OBJS respectively.
    """

    # Sample PostScript input shared by both tests.
    # NOTE(review): the integers in TOKENS/OBJS look like byte offsets into
    # this literal, so its exact whitespace presumably matters -- confirm
    # before reformatting it.
    TESTDATA = br'''%!PS
begin end
" @ #
/a/BCD /Some_Name /foo#5f#xbaa
0 +1 -2 .5 1.234
(abc) () (abc ( def ) ghi)
(def\040\0\0404ghi) (bach\\slask) (foo\nbaa)
(this % is not a comment.)
(foo
baa)
(foo\
baa)
<> <20> < 40 4020 >
<abcd00
12345>
func/a/b{(c)do*}def
[ 1 (z) ! ]
<< /foo (bar) >>
'''

    # Expected results of repeated nexttoken() calls on TESTDATA.
    TOKENS = [
        (5, KWD(b'begin')), (11, KWD(b'end')), (16, KWD(b'"')), (19, KWD(b'@')),
        (21, KWD(b'#')), (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
        (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
        (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
        (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
        (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
        (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
        (226, KWD(b'func')), (230, LIT('a')), (232, LIT('b')),
        (234, KWD(b'{')), (235, b'c'), (238, KWD(b'do*')), (241, KWD(b'}')),
        (242, KWD(b'def')), (246, KWD(b'[')), (248, 1), (250, b'z'), (254, KWD(b'!')),
        (256, KWD(b']')), (258, KWD(b'<<')), (261, LIT('foo')), (266, b'bar'),
        (272, KWD(b'>>'))
    ]

    # Expected results of repeated nextobject() calls on TESTDATA.
    OBJS = [
        (23, LIT('a')), (25, LIT('BCD')), (30, LIT('Some_Name')),
        (41, LIT('foo_xbaa')), (54, 0), (56, 1), (59, -2), (62, 0.5),
        (65, 1.234), (71, b'abc'), (77, b''), (80, b'abc ( def ) ghi'),
        (98, b'def \x00 4ghi'), (118, b'bach\\slask'), (132, b'foo\nbaa'),
        (143, b'this % is not a comment.'), (170, b'foo\nbaa'), (180, b'foobaa'),
        (191, b''), (194, b' '), (199, b'@@ '), (211, b'\xab\xcd\x00\x124\x05'),
        (230, LIT('a')), (232, LIT('b')), (234, [b'c']), (246, [1, b'z']),
        (258, {'foo': b'bar'}),
    ]

    @staticmethod
    def _drain(fetch):
        """Call *fetch* repeatedly and return the results gathered before PSEOF."""
        collected = []
        try:
            while True:
                collected.append(fetch())
        except PSEOF:
            # End of input is the normal way out of the loop.
            pass
        return collected

    def get_tokens(self, s):
        """Return the list of nexttoken() results for the bytes *s*."""
        from io import BytesIO

        class MyParser(PSBaseParser):
            def flush(self):
                self.add_results(*self.popall())

        return self._drain(MyParser(BytesIO(s)).nexttoken)

    def get_objects(self, s):
        """Return the list of nextobject() results for the bytes *s*."""
        from io import BytesIO

        class MyParser(PSStackParser):
            def flush(self):
                self.add_results(*self.popall())

        return self._drain(MyParser(BytesIO(s)).nextobject)

    # Token stream of TESTDATA must equal TOKENS.
    def test_1(self):
        actual = self.get_tokens(self.TESTDATA)
        logging.info(actual)
        assert_equal(actual, self.TOKENS)

    # Object stream of TESTDATA must equal OBJS.
    def test_2(self):
        actual = self.get_objects(self.TESTDATA)
        logging.info(actual)
        assert_equal(actual, self.OBJS)
if __name__ == '__main__':
    # Script entry point: log at DEBUG level to a per-interpreter file, then
    # let nose run the tests in this module.
    import logging
    import sys
    import os
    import six
    # Log file name is "<this script>_<major>.<minor>.log".
    logfile_name = '%s_%d.%d.log' % (
        os.path.basename(__file__),
        sys.version_info[0],
        sys.version_info[1],
    )
    logging.basicConfig(level=logging.DEBUG, filename=logfile_name)
    nose.runmodule()

View File

@ -0,0 +1,46 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest, logging, os
import tools.dumppdf as dumppdf
path=os.path.dirname(os.path.abspath(__file__))+'/'
def run(datapath, filename, options=None):
    """Run dumppdf.main() on one sample PDF, writing XML beside this file.

    datapath -- directory of the sample, appended to the module-level
                ``path`` (so it is expected to end with a separator).
    filename -- sample base name; '.pdf' is appended for the input and
                '.xml' for the output.
    options  -- optional string of extra command-line flags separated by
                whitespace, e.g. '-t -a'.
    """
    i = path + datapath + filename + '.pdf'
    o = path + filename + '.xml'
    # Build argv as a list instead of formatting one string and splitting it
    # on ' ': a directory name containing a space would otherwise be broken
    # into several bogus arguments.
    argv = ['dumppdf', '-o%s' % o]
    if options:
        # split() without a separator also tolerates repeated spaces.
        argv.extend(options.split())
    argv.append(i)
    dumppdf.main(argv)
class TestDumpPDF(unittest.TestCase):
    # Each test passes sample PDFs to run(); there are no explicit
    # assertions, so a test fails only if run() raises.

    # Samples directly under ../samples/, dumped with '-t -a'.
    def test_1(self):
        for sample in ('jo', 'simple1', 'simple2', 'simple3'):
            run('../samples/', sample, '-t -a')

    def test_2(self):
        sample_dir = '../samples/nonfree/'
        run(sample_dir, 'dmca', '-t -a')

    # No extra options for the two *1040nr samples.
    def test_3(self):
        sample_dir = '../samples/nonfree/'
        run(sample_dir, 'f1040nr')

    def test_4(self):
        sample_dir = '../samples/nonfree/'
        run(sample_dir, 'i1040nr')

    def test_5(self):
        sample_dir = '../samples/nonfree/'
        run(sample_dir, 'kampo', '-t -a')

    def test_6(self):
        sample_dir = '../samples/nonfree/'
        run(sample_dir, 'naacl06-shinyama', '-t -a')
if __name__ == '__main__':
    # Script entry point: log at DEBUG level to a per-interpreter file, then
    # hand control to unittest.
    import logging
    import sys
    import os
    import six
    # Log file name is "<this script>_<major>.<minor>.log".
    logfile_name = '%s_%d.%d.log' % (
        os.path.basename(__file__),
        sys.version_info[0],
        sys.version_info[1],
    )
    logging.basicConfig(level=logging.DEBUG, filename=logfile_name)
    unittest.main()

View File

@ -6,7 +6,7 @@
# options: # options:
# -i objid : object id # -i objid : object id
# #
import sys, os.path, re import sys, os.path, re, logging
from pdfminer.psparser import PSKeyword, PSLiteral, LIT from pdfminer.psparser import PSKeyword, PSLiteral, LIT
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
@ -18,8 +18,12 @@ from pdfminer.utils import isnumber
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]') ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
def e(s): def e(s):
if isinstance(s,six.binary_type):
s=str(s,'latin-1')
return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s) return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
import six # Python 2+3 compatibility
# dumpxml # dumpxml
def dumpxml(out, obj, codec=None): def dumpxml(out, obj, codec=None):
@ -29,7 +33,7 @@ def dumpxml(out, obj, codec=None):
if isinstance(obj, dict): if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj)) out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems(): for (k,v) in six.iteritems(obj):
out.write('<key>%s</key>\n' % k) out.write('<key>%s</key>\n' % k)
out.write('<value>') out.write('<value>')
dumpxml(out, v) dumpxml(out, v)
@ -45,7 +49,7 @@ def dumpxml(out, obj, codec=None):
out.write('</list>') out.write('</list>')
return return
if isinstance(obj, str): if isinstance(obj, (six.string_types, six.binary_type)):
out.write('<string size="%d">%s</string>' % (len(obj), e(obj))) out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
return return
@ -113,7 +117,7 @@ def dumpallobjs(out, doc, codec=None):
# dumpoutline # dumpoutline
def dumpoutline(outfp, fname, objids, pagenos, password='', def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None): dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb') fp = open(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
pages = dict( (page.pageid, pageno) for (pageno,page) pages = dict( (page.pageid, pageno) for (pageno,page)
@ -183,7 +187,7 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
out.close() out.close()
return return
fp = file(fname, 'rb') fp = open(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
for xref in doc.xrefs: for xref in doc.xrefs:
@ -191,12 +195,13 @@ def extractembedded(outfp, fname, objids, pagenos, password='',
obj = doc.getobj(objid) obj = doc.getobj(objid)
if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC: if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
extract1(obj) extract1(obj)
fp.close()
return return
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pagenos, password='', def dumppdf(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None): dumpall=False, codec=None, extractdir=None):
fp = file(fname, 'rb') fp = open(fname, 'rb')
parser = PDFParser(fp) parser = PDFParser(fp)
doc = PDFDocument(parser, password) doc = PDFDocument(parser, password)
if objids: if objids:
@ -229,22 +234,21 @@ def main(argv):
print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0]) print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:') (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
debug = 0
objids = [] objids = []
pagenos = set() pagenos = set()
codec = None codec = None
password = '' password = b''
dumpall = False dumpall = False
proc = dumppdf proc = dumppdf
outfp = sys.stdout outfp = sys.stdout
extractdir = None extractdir = None
for (k, v) in opts: for (k, v) in opts:
if k == '-d': debug += 1 if k == '-d': logging.getLogger().setlevel(logging.DEBUG)
elif k == '-o': outfp = file(v, 'wb') elif k == '-o': outfp = open(v, 'w')
elif k == '-i': objids.extend( int(x) for x in v.split(',') ) elif k == '-i': objids.extend( int(x) for x in v.split(',') )
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
elif k == '-P': password = v elif k == '-P': password = v
@ -256,13 +260,10 @@ def main(argv):
elif k == '-E': elif k == '-E':
extractdir = v extractdir = v
proc = extractembedded proc = extractembedded
#
PDFDocument.debug = debug
PDFParser.debug = debug
#
for fname in args: for fname in args:
proc(outfp, fname, objids, pagenos, password=password, proc(outfp, fname, objids, pagenos, password=password,
dumpall=dumpall, codec=codec, extractdir=extractdir) dumpall=dumpall, codec=codec, extractdir=extractdir)
return outfp.close()
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))