Python 3.4 support and tests

pull/1/head
unknown 2014-09-03 15:26:08 +02:00
parent a6475b61b4
commit 29c07ea770
11 changed files with 80 additions and 17 deletions

View File

@ -2,7 +2,10 @@ language: python
python:
- "2.6"
- "2.7"
- "3.4"
install:
- pip install six
- pip install pycrypto
script:
- make test
- nosetests

View File

@ -34,7 +34,7 @@ Features
How to Install
--------------
* Install Python 2.6 or newer. (**Python 3 is not supported.**)
* Install Python 2.6 or newer. (Python 3.4 is supported.)
* Download the source code.
* Unpack it.
* Run `setup.py`:

View File

@ -31,6 +31,7 @@ from .encodingdb import name2unicode
from .utils import choplist
from .utils import nunpack
import six #Python 2+3 compatibility
class CMapError(Exception):
pass
@ -92,8 +93,8 @@ class CMap(CMapBase):
def decode(self, code):
logging.debug('decode: %r, %r' % (self, code))
d = self.code2cid
for c in code:
c = ord(c)
for i in six.iterbytes(code):
c = six.int2byte(i)
if c in d:
d = d[c]
if isinstance(d, int):

View File

@ -21,6 +21,7 @@ from .utils import mult_matrix
from .utils import enc
from .utils import bbox2str
import six # Python 2+3 compatibility
## PDFLayoutAnalyzer
##
@ -106,7 +107,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
def render_char(self, matrix, font, fontsize, scaling, rise, cid):
try:
text = font.to_unichr(cid)
assert isinstance(text, unicode), text
assert isinstance(text, six.text_type), text
except PDFUnicodeNotDefined:
text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid)
@ -398,7 +399,7 @@ class HTMLConverter(PDFConverter):
##
class XMLConverter(PDFConverter):
CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None, stripcontrol=False):

View File

@ -4,6 +4,7 @@ from .psparser import PSLiteral
from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING
import six # Python 2+3 compatibility
STRIP_NAME = re.compile(r'[0-9]+')
@ -17,7 +18,7 @@ def name2unicode(name):
m = STRIP_NAME.search(name)
if not m:
raise KeyError(name)
return unichr(int(m.group(0)))
return six.unichr(int(m.group(0)))
## EncodingDB

View File

@ -9,6 +9,7 @@ from .utils import bbox2str
from .utils import matrix2str
from .utils import apply_matrix_pt
import six # Python 2+3 compatibility
## IndexAssigner
##
@ -633,9 +634,9 @@ class LTLayoutContainer(LTContainer):
# XXX this still takes O(n^2) :(
dists = []
for i in xrange(len(boxes)):
for i in range(len(boxes)):
obj1 = boxes[i]
for j in xrange(i+1, len(boxes)):
for j in range(i+1, len(boxes)):
obj2 = boxes[j]
dists.append((0, dist(obj1, obj2), obj1, obj2))
# We could use dists.sort(), but it would randomize the test result.

View File

@ -41,6 +41,13 @@ class PDFResourceError(PDFException):
class PDFInterpreterError(PDFException):
pass
## Constants
##
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')
## PDFTextState
##
@ -245,7 +252,8 @@ class PDFContentParser(PSStackParser):
while i <= len(target):
self.fillbuf()
if i:
c = self.buf[self.charpos]
c = six.indexbytes(self.buf,self.charpos)
c=six.int2byte(c)
data += c
self.charpos += 1
if len(target) <= i and c.isspace():
@ -334,7 +342,7 @@ class PDFPageInterpreter(object):
for (k, v) in six.iteritems(dict_value(resources)):
logging.debug('Resource: %r: %r' % (k, v))
if k == 'Font':
for (fontid, spec) in dict_value(v).iteritems():
for (fontid, spec) in six.iteritems(dict_value(v)):
objid = None
if isinstance(spec, PDFObjRef):
objid = spec.objid
@ -346,7 +354,7 @@ class PDFPageInterpreter(object):
elif k == 'ProcSet':
self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject':
for (xobjid, xobjstrm) in dict_value(v).iteritems():
for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
self.xobjmap[xobjid] = xobjstrm
return

View File

@ -12,6 +12,10 @@ from .pdfdocument import PDFTextExtractionNotAllowed
import six # Python 2+3 compatibility
# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
## PDFPage
##
class PDFPage(object):
@ -82,12 +86,12 @@ class PDFPage(object):
for (k, v) in six.iteritems(parent):
if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v
if tree.get('Type').name=='Pages' and 'Kids' in tree:
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
logging.info('Pages: Kids=%r' % tree['Kids'])
for c in list_value(tree['Kids']):
for x in search(c, tree):
yield x
elif tree.get('Type').name=='Page':
elif tree.get('Type') is LITERAL_PAGE:
logging.info('Page: %r' % tree)
yield (objid, tree)
pages = False

View File

@ -128,7 +128,7 @@ def fsplit(pred, objs):
def drange(v0, v1, d):
"""Returns a discrete range."""
assert v0 < v1
return xrange(int(v0)//d, int(v1+d)//d)
return range(int(v0)//d, int(v1+d)//d)
# get_bound

View File

@ -0,0 +1,44 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest, logging, os
import tools.pdf2txt as pdf2txt
# Absolute directory of this test module, with a trailing '/' so that
# relative sample paths can be appended by plain string concatenation.
path=os.path.dirname(os.path.abspath(__file__))+'/'
def run(datapath, filename, options=None):
    """Invoke pdf2txt.main() on one sample PDF.

    datapath: location of the sample relative to this module's directory;
        it is concatenated as-is, so it must end with a path separator
        (e.g. '../samples/').
    filename: base name of the sample, without the '.pdf' extension. The
        output path passed via -o is '<this module's dir>/<filename>.txt'.
    options: optional string of extra pdf2txt command-line options,
        separated by whitespace.
    """
    infile = path + datapath + filename + '.pdf'
    outfile = path + filename + '.txt'
    # Build argv as a list rather than formatting a single string and
    # splitting it on ' ': a checkout directory containing a space would
    # otherwise be chopped into several bogus arguments, and doubled spaces
    # in `options` would produce empty arguments.
    argv = ['pdf2txt', '-o' + outfile]
    if options:
        argv.extend(options.split())
    argv.append(infile)
    pdf2txt.main(argv)
class TestDumpPDF(unittest.TestCase):
    # Each test only drives run() over sample PDFs; a test fails when run()
    # raises. Nothing is asserted about the produced output here.

    def test_1(self):
        # Freely distributable samples, processed in a fixed order.
        for sample in ('jo', 'simple1', 'simple2', 'simple3'):
            run('../samples/', sample)

    def test_2(self):
        run(datapath='../samples/nonfree/', filename='dmca')

    def test_3(self):
        run(datapath='../samples/nonfree/', filename='f1040nr')

    def test_4(self):
        run(datapath='../samples/nonfree/', filename='i1040nr')

    def test_5(self):
        run(datapath='../samples/nonfree/', filename='kampo')

    def test_6(self):
        run(datapath='../samples/nonfree/', filename='naacl06-shinyama')
# Run the test cases when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -28,7 +28,7 @@ def main(argv):
# debug option
debug = 0
# input option
password = ''
password = b''
pagenos = set()
maxpages = 0
# output option
@ -82,7 +82,7 @@ def main(argv):
elif outfile.endswith('.tag'):
outtype = 'tag'
if outfile:
outfp = file(outfile, 'w')
outfp = open(outfile, 'wb')
else:
outfp = sys.stdout
if outtype == 'text':
@ -101,7 +101,7 @@ def main(argv):
else:
return usage()
for fname in args:
fp = file(fname, 'rb')
fp = open(fname, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp, pagenos,
maxpages=maxpages, password=password,