Python 3.4 support and tests

pull/1/head
unknown 2014-09-03 15:26:08 +02:00
parent a6475b61b4
commit 29c07ea770
11 changed files with 80 additions and 17 deletions

View File

@ -2,7 +2,10 @@ language: python
python: python:
- "2.6" - "2.6"
- "2.7" - "2.7"
- "3.4"
install: install:
- pip install six
- pip install pycrypto - pip install pycrypto
script: script:
- make test - make test
- nosetests

View File

@ -34,7 +34,7 @@ Features
How to Install How to Install
-------------- --------------
* Install Python 2.6 or newer. (**Python 3 is not supported.**) * Install Python 2.6 or newer. (Python 3.4 is supported.)
* Download the source code. * Download the source code.
* Unpack it. * Unpack it.
* Run `setup.py`: * Run `setup.py`:

View File

@ -31,6 +31,7 @@ from .encodingdb import name2unicode
from .utils import choplist from .utils import choplist
from .utils import nunpack from .utils import nunpack
import six #Python 2+3 compatibility
class CMapError(Exception): class CMapError(Exception):
pass pass
@ -92,8 +93,8 @@ class CMap(CMapBase):
def decode(self, code): def decode(self, code):
logging.debug('decode: %r, %r' % (self, code)) logging.debug('decode: %r, %r' % (self, code))
d = self.code2cid d = self.code2cid
for c in code: for i in six.iterbytes(code):
c = ord(c) c = six.int2byte(i)
if c in d: if c in d:
d = d[c] d = d[c]
if isinstance(d, int): if isinstance(d, int):

View File

@ -21,6 +21,7 @@ from .utils import mult_matrix
from .utils import enc from .utils import enc
from .utils import bbox2str from .utils import bbox2str
import six # Python 2+3 compatibility
## PDFLayoutAnalyzer ## PDFLayoutAnalyzer
## ##
@ -106,7 +107,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
def render_char(self, matrix, font, fontsize, scaling, rise, cid): def render_char(self, matrix, font, fontsize, scaling, rise, cid):
try: try:
text = font.to_unichr(cid) text = font.to_unichr(cid)
assert isinstance(text, unicode), text assert isinstance(text, six.text_type), text
except PDFUnicodeNotDefined: except PDFUnicodeNotDefined:
text = self.handle_undefined_char(font, cid) text = self.handle_undefined_char(font, cid)
textwidth = font.char_width(cid) textwidth = font.char_width(cid)
@ -398,7 +399,7 @@ class HTMLConverter(PDFConverter):
## ##
class XMLConverter(PDFConverter): class XMLConverter(PDFConverter):
CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]') CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None, stripcontrol=False): laparams=None, imagewriter=None, stripcontrol=False):

View File

@ -4,6 +4,7 @@ from .psparser import PSLiteral
from .glyphlist import glyphname2unicode from .glyphlist import glyphname2unicode
from .latin_enc import ENCODING from .latin_enc import ENCODING
import six # Python 2+3 compatibility
STRIP_NAME = re.compile(r'[0-9]+') STRIP_NAME = re.compile(r'[0-9]+')
@ -17,7 +18,7 @@ def name2unicode(name):
m = STRIP_NAME.search(name) m = STRIP_NAME.search(name)
if not m: if not m:
raise KeyError(name) raise KeyError(name)
return unichr(int(m.group(0))) return six.unichr(int(m.group(0)))
## EncodingDB ## EncodingDB

View File

@ -9,6 +9,7 @@ from .utils import bbox2str
from .utils import matrix2str from .utils import matrix2str
from .utils import apply_matrix_pt from .utils import apply_matrix_pt
import six # Python 2+3 compatibility
## IndexAssigner ## IndexAssigner
## ##
@ -633,9 +634,9 @@ class LTLayoutContainer(LTContainer):
# XXX this still takes O(n^2) :( # XXX this still takes O(n^2) :(
dists = [] dists = []
for i in xrange(len(boxes)): for i in range(len(boxes)):
obj1 = boxes[i] obj1 = boxes[i]
for j in xrange(i+1, len(boxes)): for j in range(i+1, len(boxes)):
obj2 = boxes[j] obj2 = boxes[j]
dists.append((0, dist(obj1, obj2), obj1, obj2)) dists.append((0, dist(obj1, obj2), obj1, obj2))
# We could use dists.sort(), but it would randomize the test result. # We could use dists.sort(), but it would randomize the test result.

View File

@ -41,6 +41,13 @@ class PDFResourceError(PDFException):
class PDFInterpreterError(PDFException): class PDFInterpreterError(PDFException):
pass pass
## Constants
##
LITERAL_PDF = LIT('PDF')
LITERAL_TEXT = LIT('Text')
LITERAL_FONT = LIT('Font')
LITERAL_FORM = LIT('Form')
LITERAL_IMAGE = LIT('Image')
## PDFTextState ## PDFTextState
## ##
@ -245,7 +252,8 @@ class PDFContentParser(PSStackParser):
while i <= len(target): while i <= len(target):
self.fillbuf() self.fillbuf()
if i: if i:
c = self.buf[self.charpos] c = six.indexbytes(self.buf,self.charpos)
c=six.int2byte(c)
data += c data += c
self.charpos += 1 self.charpos += 1
if len(target) <= i and c.isspace(): if len(target) <= i and c.isspace():
@ -334,7 +342,7 @@ class PDFPageInterpreter(object):
for (k, v) in six.iteritems(dict_value(resources)): for (k, v) in six.iteritems(dict_value(resources)):
logging.debug('Resource: %r: %r' % (k, v)) logging.debug('Resource: %r: %r' % (k, v))
if k == 'Font': if k == 'Font':
for (fontid, spec) in dict_value(v).iteritems(): for (fontid, spec) in six.iteritems(dict_value(v)):
objid = None objid = None
if isinstance(spec, PDFObjRef): if isinstance(spec, PDFObjRef):
objid = spec.objid objid = spec.objid
@ -346,7 +354,7 @@ class PDFPageInterpreter(object):
elif k == 'ProcSet': elif k == 'ProcSet':
self.rsrcmgr.get_procset(list_value(v)) self.rsrcmgr.get_procset(list_value(v))
elif k == 'XObject': elif k == 'XObject':
for (xobjid, xobjstrm) in dict_value(v).iteritems(): for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
self.xobjmap[xobjid] = xobjstrm self.xobjmap[xobjid] = xobjstrm
return return

View File

@ -12,6 +12,10 @@ from .pdfdocument import PDFTextExtractionNotAllowed
import six # Python 2+3 compatibility import six # Python 2+3 compatibility
# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
## PDFPage ## PDFPage
## ##
class PDFPage(object): class PDFPage(object):
@ -82,12 +86,12 @@ class PDFPage(object):
for (k, v) in six.iteritems(parent): for (k, v) in six.iteritems(parent):
if k in klass.INHERITABLE_ATTRS and k not in tree: if k in klass.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type').name=='Pages' and 'Kids' in tree: if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
logging.info('Pages: Kids=%r' % tree['Kids']) logging.info('Pages: Kids=%r' % tree['Kids'])
for c in list_value(tree['Kids']): for c in list_value(tree['Kids']):
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree.get('Type').name=='Page': elif tree.get('Type') is LITERAL_PAGE:
logging.info('Page: %r' % tree) logging.info('Page: %r' % tree)
yield (objid, tree) yield (objid, tree)
pages = False pages = False

View File

@ -128,7 +128,7 @@ def fsplit(pred, objs):
def drange(v0, v1, d): def drange(v0, v1, d):
"""Returns a discrete range.""" """Returns a discrete range."""
assert v0 < v1 assert v0 < v1
return xrange(int(v0)//d, int(v1+d)//d) return range(int(v0)//d, int(v1+d)//d)
# get_bound # get_bound

View File

@ -0,0 +1,44 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import unittest, logging, os
import tools.pdf2txt as pdf2txt
# Absolute directory containing this test module, with a trailing '/' so
# that relative sample paths can be appended by plain string concatenation.
path=os.path.dirname(os.path.abspath(__file__))+'/'
def run(datapath, filename, options=None):
    """Convert one sample PDF to text by calling ``pdf2txt.main`` in-process.

    The input file is ``path + datapath + filename + '.pdf'`` and the output
    is written to ``path + filename + '.txt'``.

    :param datapath: directory of the sample relative to ``path``, including
        its trailing slash (e.g. ``'../samples/'``).
    :param filename: base name of the sample, without the ``.pdf`` extension.
    :param options: optional string of extra pdf2txt command-line options,
        separated by single spaces; omitted from argv when empty or None.
    """
    infile = path + datapath + filename + '.pdf'
    outfile = path + filename + '.txt'
    # Build argv as a list instead of formatting one string and splitting it
    # on spaces: a space anywhere in the input or output path would otherwise
    # be torn into several bogus arguments.
    argv = ['pdf2txt', '-o' + outfile]
    if options:
        argv.extend(options.split(' '))
    argv.append(infile)
    pdf2txt.main(argv)
class TestDumpPDF(unittest.TestCase):
    """Run pdf2txt over each sample document.

    The tests make no assertions about the produced text; each one passes
    as long as the conversion started by ``run`` does not raise.
    """

    def test_1(self):
        # Samples located directly under ../samples/.
        for sample in ('jo', 'simple1', 'simple2', 'simple3'):
            run('../samples/', sample)

    # One test per document under ../samples/nonfree/.

    def test_2(self):
        run('../samples/nonfree/', 'dmca')

    def test_3(self):
        run('../samples/nonfree/', 'f1040nr')

    def test_4(self):
        run('../samples/nonfree/', 'i1040nr')

    def test_5(self):
        run('../samples/nonfree/', 'kampo')

    def test_6(self):
        run('../samples/nonfree/', 'naacl06-shinyama')
if __name__ == '__main__':
unittest.main()

View File

@ -28,7 +28,7 @@ def main(argv):
# debug option # debug option
debug = 0 debug = 0
# input option # input option
password = '' password = b''
pagenos = set() pagenos = set()
maxpages = 0 maxpages = 0
# output option # output option
@ -82,7 +82,7 @@ def main(argv):
elif outfile.endswith('.tag'): elif outfile.endswith('.tag'):
outtype = 'tag' outtype = 'tag'
if outfile: if outfile:
outfp = file(outfile, 'w') outfp = open(outfile, 'wb')
else: else:
outfp = sys.stdout outfp = sys.stdout
if outtype == 'text': if outtype == 'text':
@ -101,7 +101,7 @@ def main(argv):
else: else:
return usage() return usage()
for fname in args: for fname in args:
fp = file(fname, 'rb') fp = open(fname, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(fp, pagenos, for page in PDFPage.get_pages(fp, pagenos,
maxpages=maxpages, password=password, maxpages=maxpages, password=password,