Python 3.4 support and tests
parent
a6475b61b4
commit
29c07ea770
|
@ -2,7 +2,10 @@ language: python
|
|||
python:
|
||||
- "2.6"
|
||||
- "2.7"
|
||||
- "3.4"
|
||||
install:
|
||||
- pip install six
|
||||
- pip install pycrypto
|
||||
script:
|
||||
- make test
|
||||
- nosetests
|
||||
|
|
|
@ -34,7 +34,7 @@ Features
|
|||
How to Install
|
||||
--------------
|
||||
|
||||
* Install Python 2.6 or newer. (**Python 3 is not supported.**)
|
||||
* Install Python 2.6 or newer. (Python 3.4 is supported.)
|
||||
* Download the source code.
|
||||
* Unpack it.
|
||||
* Run `setup.py`:
|
||||
|
|
|
@ -31,6 +31,7 @@ from .encodingdb import name2unicode
|
|||
from .utils import choplist
|
||||
from .utils import nunpack
|
||||
|
||||
import six #Python 2+3 compatibility
|
||||
|
||||
class CMapError(Exception):
|
||||
pass
|
||||
|
@ -92,8 +93,8 @@ class CMap(CMapBase):
|
|||
def decode(self, code):
|
||||
logging.debug('decode: %r, %r' % (self, code))
|
||||
d = self.code2cid
|
||||
for c in code:
|
||||
c = ord(c)
|
||||
for i in six.iterbytes(code):
|
||||
c = six.int2byte(i)
|
||||
if c in d:
|
||||
d = d[c]
|
||||
if isinstance(d, int):
|
||||
|
|
|
@ -21,6 +21,7 @@ from .utils import mult_matrix
|
|||
from .utils import enc
|
||||
from .utils import bbox2str
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
## PDFLayoutAnalyzer
|
||||
##
|
||||
|
@ -106,7 +107,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
|
||||
try:
|
||||
text = font.to_unichr(cid)
|
||||
assert isinstance(text, unicode), text
|
||||
assert isinstance(text, six.text_type), text
|
||||
except PDFUnicodeNotDefined:
|
||||
text = self.handle_undefined_char(font, cid)
|
||||
textwidth = font.char_width(cid)
|
||||
|
@ -398,7 +399,7 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class XMLConverter(PDFConverter):
|
||||
|
||||
CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||
CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||
laparams=None, imagewriter=None, stripcontrol=False):
|
||||
|
|
|
@ -4,6 +4,7 @@ from .psparser import PSLiteral
|
|||
from .glyphlist import glyphname2unicode
|
||||
from .latin_enc import ENCODING
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
STRIP_NAME = re.compile(r'[0-9]+')
|
||||
|
||||
|
@ -17,7 +18,7 @@ def name2unicode(name):
|
|||
m = STRIP_NAME.search(name)
|
||||
if not m:
|
||||
raise KeyError(name)
|
||||
return unichr(int(m.group(0)))
|
||||
return six.unichr(int(m.group(0)))
|
||||
|
||||
|
||||
## EncodingDB
|
||||
|
|
|
@ -9,6 +9,7 @@ from .utils import bbox2str
|
|||
from .utils import matrix2str
|
||||
from .utils import apply_matrix_pt
|
||||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
## IndexAssigner
|
||||
##
|
||||
|
@ -633,9 +634,9 @@ class LTLayoutContainer(LTContainer):
|
|||
|
||||
# XXX this still takes O(n^2) :(
|
||||
dists = []
|
||||
for i in xrange(len(boxes)):
|
||||
for i in range(len(boxes)):
|
||||
obj1 = boxes[i]
|
||||
for j in xrange(i+1, len(boxes)):
|
||||
for j in range(i+1, len(boxes)):
|
||||
obj2 = boxes[j]
|
||||
dists.append((0, dist(obj1, obj2), obj1, obj2))
|
||||
# We could use dists.sort(), but it would randomize the test result.
|
||||
|
|
|
@ -41,6 +41,13 @@ class PDFResourceError(PDFException):
|
|||
class PDFInterpreterError(PDFException):
|
||||
pass
|
||||
|
||||
## Constants
|
||||
##
|
||||
LITERAL_PDF = LIT('PDF')
|
||||
LITERAL_TEXT = LIT('Text')
|
||||
LITERAL_FONT = LIT('Font')
|
||||
LITERAL_FORM = LIT('Form')
|
||||
LITERAL_IMAGE = LIT('Image')
|
||||
|
||||
## PDFTextState
|
||||
##
|
||||
|
@ -245,7 +252,8 @@ class PDFContentParser(PSStackParser):
|
|||
while i <= len(target):
|
||||
self.fillbuf()
|
||||
if i:
|
||||
c = self.buf[self.charpos]
|
||||
c = six.indexbytes(self.buf,self.charpos)
|
||||
c=six.int2byte(c)
|
||||
data += c
|
||||
self.charpos += 1
|
||||
if len(target) <= i and c.isspace():
|
||||
|
@ -334,7 +342,7 @@ class PDFPageInterpreter(object):
|
|||
for (k, v) in six.iteritems(dict_value(resources)):
|
||||
logging.debug('Resource: %r: %r' % (k, v))
|
||||
if k == 'Font':
|
||||
for (fontid, spec) in dict_value(v).iteritems():
|
||||
for (fontid, spec) in six.iteritems(dict_value(v)):
|
||||
objid = None
|
||||
if isinstance(spec, PDFObjRef):
|
||||
objid = spec.objid
|
||||
|
@ -346,7 +354,7 @@ class PDFPageInterpreter(object):
|
|||
elif k == 'ProcSet':
|
||||
self.rsrcmgr.get_procset(list_value(v))
|
||||
elif k == 'XObject':
|
||||
for (xobjid, xobjstrm) in dict_value(v).iteritems():
|
||||
for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
|
||||
self.xobjmap[xobjid] = xobjstrm
|
||||
return
|
||||
|
||||
|
|
|
@ -12,6 +12,10 @@ from .pdfdocument import PDFTextExtractionNotAllowed
|
|||
|
||||
import six # Python 2+3 compatibility
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_PAGE = LIT('Page')
|
||||
LITERAL_PAGES = LIT('Pages')
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):
|
||||
|
@ -82,12 +86,12 @@ class PDFPage(object):
|
|||
for (k, v) in six.iteritems(parent):
|
||||
if k in klass.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
if tree.get('Type').name=='Pages' and 'Kids' in tree:
|
||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||
logging.info('Pages: Kids=%r' % tree['Kids'])
|
||||
for c in list_value(tree['Kids']):
|
||||
for x in search(c, tree):
|
||||
yield x
|
||||
elif tree.get('Type').name=='Page':
|
||||
elif tree.get('Type') is LITERAL_PAGE:
|
||||
logging.info('Page: %r' % tree)
|
||||
yield (objid, tree)
|
||||
pages = False
|
||||
|
|
|
@ -128,7 +128,7 @@ def fsplit(pred, objs):
|
|||
def drange(v0, v1, d):
    """Returns a discrete range.

    The result is ``range(int(v0)//d, int(v1+d)//d)``: ``v0`` is truncated
    to an integer and floor-divided by the step ``d`` to get the first
    index, and ``v1 + d`` is treated the same way to get the exclusive
    upper bound.

    ``range`` is used instead of ``xrange`` so the function works on both
    Python 2 (where it returns a list) and Python 3 (where it returns a
    lazy range object); callers that only iterate see no difference.

    Raises AssertionError if ``v0`` is not strictly less than ``v1``.
    """
    assert v0 < v1
    return range(int(v0)//d, int(v1+d)//d)
|
||||
|
||||
|
||||
# get_bound
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import unittest, logging, os
|
||||
|
||||
import tools.pdf2txt as pdf2txt
|
||||
|
||||
path=os.path.dirname(os.path.abspath(__file__))+'/'
|
||||
|
||||
def run(datapath, filename, options=None):
    """Run ``pdf2txt.main`` on one sample PDF.

    datapath -- directory holding the sample, appended to the module-level
                ``path`` (so it is relative to this test file and must end
                with a path separator).
    filename -- base name of the sample, without the ``.pdf`` extension.
    options  -- optional string of extra pdf2txt command-line options,
                separated by whitespace.

    The output is written to ``<filename>.txt`` next to this test file.
    """
    infile = path + datapath + filename + '.pdf'
    outfile = path + filename + '.txt'
    # Build the argument list directly rather than formatting one string
    # and splitting it on spaces: a checkout directory whose name contains
    # a space would otherwise be torn into several bogus arguments.
    argv = ['pdf2txt', '-o' + outfile]
    if options:
        argv.extend(options.split())
    argv.append(infile)
    pdf2txt.main(argv)
|
||||
|
||||
class TestDumpPDF(unittest.TestCase):
    """Smoke tests that run pdf2txt over the sample PDFs.

    Every test simply calls ``run`` on one or more documents under
    ``../samples/``; a test passes when the conversion finishes without
    raising. The generated text is not compared against anything.
    """

    def test_1(self):
        """Convert the samples that live directly in ``../samples/``."""
        for name in ('jo', 'simple1', 'simple2', 'simple3'):
            run('../samples/', name)

    def test_2(self):
        """Convert ``nonfree/dmca.pdf``."""
        run('../samples/nonfree/', 'dmca')

    def test_3(self):
        """Convert ``nonfree/f1040nr.pdf``."""
        run('../samples/nonfree/', 'f1040nr')

    def test_4(self):
        """Convert ``nonfree/i1040nr.pdf``."""
        run('../samples/nonfree/', 'i1040nr')

    def test_5(self):
        """Convert ``nonfree/kampo.pdf``."""
        run('../samples/nonfree/', 'kampo')

    def test_6(self):
        """Convert ``nonfree/naacl06-shinyama.pdf``."""
        run('../samples/nonfree/', 'naacl06-shinyama')
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -28,7 +28,7 @@ def main(argv):
|
|||
# debug option
|
||||
debug = 0
|
||||
# input option
|
||||
password = ''
|
||||
password = b''
|
||||
pagenos = set()
|
||||
maxpages = 0
|
||||
# output option
|
||||
|
@ -82,7 +82,7 @@ def main(argv):
|
|||
elif outfile.endswith('.tag'):
|
||||
outtype = 'tag'
|
||||
if outfile:
|
||||
outfp = file(outfile, 'w')
|
||||
outfp = open(outfile, 'wb')
|
||||
else:
|
||||
outfp = sys.stdout
|
||||
if outtype == 'text':
|
||||
|
@ -101,7 +101,7 @@ def main(argv):
|
|||
else:
|
||||
return usage()
|
||||
for fname in args:
|
||||
fp = file(fname, 'rb')
|
||||
fp = open(fname, 'rb')
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
for page in PDFPage.get_pages(fp, pagenos,
|
||||
maxpages=maxpages, password=password,
|
||||
|
|
Loading…
Reference in New Issue