Python 3.4 support and tests
parent
a6475b61b4
commit
29c07ea770
|
@ -2,7 +2,10 @@ language: python
|
||||||
python:
|
python:
|
||||||
- "2.6"
|
- "2.6"
|
||||||
- "2.7"
|
- "2.7"
|
||||||
|
- "3.4"
|
||||||
install:
|
install:
|
||||||
|
- pip install six
|
||||||
- pip install pycrypto
|
- pip install pycrypto
|
||||||
script:
|
script:
|
||||||
- make test
|
- make test
|
||||||
|
- nosetests
|
||||||
|
|
|
@ -34,7 +34,7 @@ Features
|
||||||
How to Install
|
How to Install
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
* Install Python 2.6 or newer. (**Python 3 is not supported.**)
|
* Install Python 2.6 or 2.7, or Python 3.4. (These are the versions covered by the automated tests; other Python 3 releases are untested.)
|
||||||
* Download the source code.
|
* Download the source code.
|
||||||
* Unpack it.
|
* Unpack it.
|
||||||
* Run `setup.py`:
|
* Run `setup.py`:
|
||||||
|
|
|
@ -31,6 +31,7 @@ from .encodingdb import name2unicode
|
||||||
from .utils import choplist
|
from .utils import choplist
|
||||||
from .utils import nunpack
|
from .utils import nunpack
|
||||||
|
|
||||||
|
import six #Python 2+3 compatibility
|
||||||
|
|
||||||
class CMapError(Exception):
|
class CMapError(Exception):
|
||||||
pass
|
pass
|
||||||
|
@ -92,8 +93,8 @@ class CMap(CMapBase):
|
||||||
def decode(self, code):
|
def decode(self, code):
|
||||||
logging.debug('decode: %r, %r' % (self, code))
|
logging.debug('decode: %r, %r' % (self, code))
|
||||||
d = self.code2cid
|
d = self.code2cid
|
||||||
for c in code:
|
for i in six.iterbytes(code):
|
||||||
c = ord(c)
|
c = six.int2byte(i)
|
||||||
if c in d:
|
if c in d:
|
||||||
d = d[c]
|
d = d[c]
|
||||||
if isinstance(d, int):
|
if isinstance(d, int):
|
||||||
|
|
|
@ -21,6 +21,7 @@ from .utils import mult_matrix
|
||||||
from .utils import enc
|
from .utils import enc
|
||||||
from .utils import bbox2str
|
from .utils import bbox2str
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
## PDFLayoutAnalyzer
|
## PDFLayoutAnalyzer
|
||||||
##
|
##
|
||||||
|
@ -106,7 +107,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||||
try:
|
try:
|
||||||
text = font.to_unichr(cid)
|
text = font.to_unichr(cid)
|
||||||
assert isinstance(text, unicode), text
|
assert isinstance(text, six.text_type), text
|
||||||
except PDFUnicodeNotDefined:
|
except PDFUnicodeNotDefined:
|
||||||
text = self.handle_undefined_char(font, cid)
|
text = self.handle_undefined_char(font, cid)
|
||||||
textwidth = font.char_width(cid)
|
textwidth = font.char_width(cid)
|
||||||
|
@ -398,7 +399,7 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class XMLConverter(PDFConverter):
|
class XMLConverter(PDFConverter):
|
||||||
|
|
||||||
CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||||
laparams=None, imagewriter=None, stripcontrol=False):
|
laparams=None, imagewriter=None, stripcontrol=False):
|
||||||
|
|
|
@ -4,6 +4,7 @@ from .psparser import PSLiteral
|
||||||
from .glyphlist import glyphname2unicode
|
from .glyphlist import glyphname2unicode
|
||||||
from .latin_enc import ENCODING
|
from .latin_enc import ENCODING
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
STRIP_NAME = re.compile(r'[0-9]+')
|
STRIP_NAME = re.compile(r'[0-9]+')
|
||||||
|
|
||||||
|
@ -17,7 +18,7 @@ def name2unicode(name):
|
||||||
m = STRIP_NAME.search(name)
|
m = STRIP_NAME.search(name)
|
||||||
if not m:
|
if not m:
|
||||||
raise KeyError(name)
|
raise KeyError(name)
|
||||||
return unichr(int(m.group(0)))
|
return six.unichr(int(m.group(0)))
|
||||||
|
|
||||||
|
|
||||||
## EncodingDB
|
## EncodingDB
|
||||||
|
|
|
@ -9,6 +9,7 @@ from .utils import bbox2str
|
||||||
from .utils import matrix2str
|
from .utils import matrix2str
|
||||||
from .utils import apply_matrix_pt
|
from .utils import apply_matrix_pt
|
||||||
|
|
||||||
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
## IndexAssigner
|
## IndexAssigner
|
||||||
##
|
##
|
||||||
|
@ -633,9 +634,9 @@ class LTLayoutContainer(LTContainer):
|
||||||
|
|
||||||
# XXX this still takes O(n^2) :(
|
# XXX this still takes O(n^2) :(
|
||||||
dists = []
|
dists = []
|
||||||
for i in xrange(len(boxes)):
|
for i in range(len(boxes)):
|
||||||
obj1 = boxes[i]
|
obj1 = boxes[i]
|
||||||
for j in xrange(i+1, len(boxes)):
|
for j in range(i+1, len(boxes)):
|
||||||
obj2 = boxes[j]
|
obj2 = boxes[j]
|
||||||
dists.append((0, dist(obj1, obj2), obj1, obj2))
|
dists.append((0, dist(obj1, obj2), obj1, obj2))
|
||||||
# We could use dists.sort(), but it would randomize the test result.
|
# We could use dists.sort(), but it would randomize the test result.
|
||||||
|
|
|
@ -41,6 +41,13 @@ class PDFResourceError(PDFException):
|
||||||
class PDFInterpreterError(PDFException):
|
class PDFInterpreterError(PDFException):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
## Constants
|
||||||
|
##
|
||||||
|
LITERAL_PDF = LIT('PDF')
|
||||||
|
LITERAL_TEXT = LIT('Text')
|
||||||
|
LITERAL_FONT = LIT('Font')
|
||||||
|
LITERAL_FORM = LIT('Form')
|
||||||
|
LITERAL_IMAGE = LIT('Image')
|
||||||
|
|
||||||
## PDFTextState
|
## PDFTextState
|
||||||
##
|
##
|
||||||
|
@ -245,7 +252,8 @@ class PDFContentParser(PSStackParser):
|
||||||
while i <= len(target):
|
while i <= len(target):
|
||||||
self.fillbuf()
|
self.fillbuf()
|
||||||
if i:
|
if i:
|
||||||
c = self.buf[self.charpos]
|
c = six.indexbytes(self.buf,self.charpos)
|
||||||
|
c=six.int2byte(c)
|
||||||
data += c
|
data += c
|
||||||
self.charpos += 1
|
self.charpos += 1
|
||||||
if len(target) <= i and c.isspace():
|
if len(target) <= i and c.isspace():
|
||||||
|
@ -334,7 +342,7 @@ class PDFPageInterpreter(object):
|
||||||
for (k, v) in six.iteritems(dict_value(resources)):
|
for (k, v) in six.iteritems(dict_value(resources)):
|
||||||
logging.debug('Resource: %r: %r' % (k, v))
|
logging.debug('Resource: %r: %r' % (k, v))
|
||||||
if k == 'Font':
|
if k == 'Font':
|
||||||
for (fontid, spec) in dict_value(v).iteritems():
|
for (fontid, spec) in six.iteritems(dict_value(v)):
|
||||||
objid = None
|
objid = None
|
||||||
if isinstance(spec, PDFObjRef):
|
if isinstance(spec, PDFObjRef):
|
||||||
objid = spec.objid
|
objid = spec.objid
|
||||||
|
@ -346,7 +354,7 @@ class PDFPageInterpreter(object):
|
||||||
elif k == 'ProcSet':
|
elif k == 'ProcSet':
|
||||||
self.rsrcmgr.get_procset(list_value(v))
|
self.rsrcmgr.get_procset(list_value(v))
|
||||||
elif k == 'XObject':
|
elif k == 'XObject':
|
||||||
for (xobjid, xobjstrm) in dict_value(v).iteritems():
|
for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
|
||||||
self.xobjmap[xobjid] = xobjstrm
|
self.xobjmap[xobjid] = xobjstrm
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,10 @@ from .pdfdocument import PDFTextExtractionNotAllowed
|
||||||
|
|
||||||
import six # Python 2+3 compatibility
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
|
# some predefined literals and keywords.
|
||||||
|
LITERAL_PAGE = LIT('Page')
|
||||||
|
LITERAL_PAGES = LIT('Pages')
|
||||||
|
|
||||||
## PDFPage
|
## PDFPage
|
||||||
##
|
##
|
||||||
class PDFPage(object):
|
class PDFPage(object):
|
||||||
|
@ -82,12 +86,12 @@ class PDFPage(object):
|
||||||
for (k, v) in six.iteritems(parent):
|
for (k, v) in six.iteritems(parent):
|
||||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree.get('Type').name=='Pages' and 'Kids' in tree:
|
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||||
logging.info('Pages: Kids=%r' % tree['Kids'])
|
logging.info('Pages: Kids=%r' % tree['Kids'])
|
||||||
for c in list_value(tree['Kids']):
|
for c in list_value(tree['Kids']):
|
||||||
for x in search(c, tree):
|
for x in search(c, tree):
|
||||||
yield x
|
yield x
|
||||||
elif tree.get('Type').name=='Page':
|
elif tree.get('Type') is LITERAL_PAGE:
|
||||||
logging.info('Page: %r' % tree)
|
logging.info('Page: %r' % tree)
|
||||||
yield (objid, tree)
|
yield (objid, tree)
|
||||||
pages = False
|
pages = False
|
||||||
|
|
|
@ -128,7 +128,7 @@ def fsplit(pred, objs):
|
||||||
def drange(v0, v1, d):
|
def drange(v0, v1, d):
|
||||||
"""Returns a discrete range."""
|
"""Returns a discrete range."""
|
||||||
assert v0 < v1
|
assert v0 < v1
|
||||||
return xrange(int(v0)//d, int(v1+d)//d)
|
return range(int(v0)//d, int(v1+d)//d)
|
||||||
|
|
||||||
|
|
||||||
# get_bound
|
# get_bound
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import unittest, logging, os
|
||||||
|
|
||||||
|
import tools.pdf2txt as pdf2txt
|
||||||
|
|
||||||
|
path=os.path.dirname(os.path.abspath(__file__))+'/'
|
||||||
|
|
||||||
|
def run(datapath,filename,options=None):
|
||||||
|
i=path+datapath+filename+'.pdf'
|
||||||
|
o=path+filename+'.txt'
|
||||||
|
if options:
|
||||||
|
s='pdf2txt -o%s %s %s'%(o,options,i)
|
||||||
|
else:
|
||||||
|
s='pdf2txt -o%s %s'%(o,i)
|
||||||
|
pdf2txt.main(s.split(' '))
|
||||||
|
|
||||||
|
class TestDumpPDF(unittest.TestCase):
|
||||||
|
|
||||||
|
|
||||||
|
def test_1(self):
|
||||||
|
run('../samples/','jo')
|
||||||
|
run('../samples/','simple1')
|
||||||
|
run('../samples/','simple2')
|
||||||
|
run('../samples/','simple3')
|
||||||
|
|
||||||
|
def test_2(self):
|
||||||
|
run('../samples/nonfree/','dmca')
|
||||||
|
|
||||||
|
def test_3(self):
|
||||||
|
run('../samples/nonfree/','f1040nr')
|
||||||
|
|
||||||
|
def test_4(self):
|
||||||
|
run('../samples/nonfree/','i1040nr')
|
||||||
|
|
||||||
|
def test_5(self):
|
||||||
|
run('../samples/nonfree/','kampo')
|
||||||
|
|
||||||
|
def test_6(self):
|
||||||
|
run('../samples/nonfree/','naacl06-shinyama')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
|
@ -28,7 +28,7 @@ def main(argv):
|
||||||
# debug option
|
# debug option
|
||||||
debug = 0
|
debug = 0
|
||||||
# input option
|
# input option
|
||||||
password = ''
|
password = b''
|
||||||
pagenos = set()
|
pagenos = set()
|
||||||
maxpages = 0
|
maxpages = 0
|
||||||
# output option
|
# output option
|
||||||
|
@ -82,7 +82,7 @@ def main(argv):
|
||||||
elif outfile.endswith('.tag'):
|
elif outfile.endswith('.tag'):
|
||||||
outtype = 'tag'
|
outtype = 'tag'
|
||||||
if outfile:
|
if outfile:
|
||||||
outfp = file(outfile, 'w')
|
outfp = open(outfile, 'wb')
|
||||||
else:
|
else:
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outtype == 'text':
|
if outtype == 'text':
|
||||||
|
@ -101,7 +101,7 @@ def main(argv):
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
for fname in args:
|
for fname in args:
|
||||||
fp = file(fname, 'rb')
|
fp = open(fname, 'rb')
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
for page in PDFPage.get_pages(fp, pagenos,
|
for page in PDFPage.get_pages(fp, pagenos,
|
||||||
maxpages=maxpages, password=password,
|
maxpages=maxpages, password=password,
|
||||||
|
|
Loading…
Reference in New Issue