diff --git a/.travis.yml b/.travis.yml
index 4f3c2f0..7c37767 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,10 @@ language: python
 python:
   - "2.6"
   - "2.7"
+  - "3.4"
 install:
+  - pip install six
   - pip install pycrypto
 script:
   - make test
+  - nosetests
diff --git a/README.md b/README.md
index 30aa5db..432361e 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ Features
 How to Install
 --------------
 
-  * Install Python 2.6 or newer. (**Python 3 is not supported.**)
+  * Install Python 2.6 or newer. (Python 3.4 is supported.)
   * Download the source code.
   * Unpack it.
   * Run `setup.py`:
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
index 69d9de2..1e1c94e 100644
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@@ -31,6 +31,7 @@ from .encodingdb import name2unicode
 from .utils import choplist
 from .utils import nunpack
 
+import six #Python 2+3 compatibility
 
 class CMapError(Exception):
     pass
@@ -92,8 +93,8 @@ class CMap(CMapBase):
     def decode(self, code):
         logging.debug('decode: %r, %r' % (self, code))
         d = self.code2cid
-        for c in code:
-            c = ord(c)
+        # iterbytes yields ints on Python 2 and 3, just as ord() did.
+        for c in six.iterbytes(code):
             if c in d:
                 d = d[c]
                 if isinstance(d, int):
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 3e515d6..285d826 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -21,6 +21,7 @@ from .utils import mult_matrix
 from .utils import enc
 from .utils import bbox2str
 
+import six # Python 2+3 compatibility
 
 ##  PDFLayoutAnalyzer
 ##
@@ -106,7 +107,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
     def render_char(self, matrix, font, fontsize, scaling, rise, cid):
         try:
             text = font.to_unichr(cid)
-            assert isinstance(text, unicode), text
+            assert isinstance(text, six.text_type), text
         except PDFUnicodeNotDefined:
             text = self.handle_undefined_char(font, cid)
         textwidth = font.char_width(cid)
@@ -398,7 +399,7 @@ class HTMLConverter(PDFConverter):
 ##
 class XMLConverter(PDFConverter):
 
-    CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
+    CONTROL = re.compile(u'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
 
     def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
                  laparams=None, imagewriter=None, stripcontrol=False):
diff --git a/pdfminer/encodingdb.py b/pdfminer/encodingdb.py
index b3263bd..209ce54 100644
--- a/pdfminer/encodingdb.py
+++ b/pdfminer/encodingdb.py
@@ -4,6 +4,7 @@ from .psparser import PSLiteral
 from .glyphlist import glyphname2unicode
 from .latin_enc import ENCODING
 
+import six # Python 2+3 compatibility
 
 STRIP_NAME = re.compile(r'[0-9]+')
 
@@ -17,7 +18,7 @@ def name2unicode(name):
     m = STRIP_NAME.search(name)
     if not m:
         raise KeyError(name)
-    return unichr(int(m.group(0)))
+    return six.unichr(int(m.group(0)))
 
 
 ##  EncodingDB
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index 32b706f..44cce01 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -9,6 +9,7 @@ from .utils import bbox2str
 from .utils import matrix2str
 from .utils import apply_matrix_pt
 
+import six # Python 2+3 compatibility
 
 ##  IndexAssigner
 ##
@@ -633,9 +634,9 @@ class LTLayoutContainer(LTContainer):
 
         # XXX this still takes O(n^2) :(
         dists = []
-        for i in xrange(len(boxes)):
+        for i in range(len(boxes)):
             obj1 = boxes[i]
-            for j in xrange(i+1, len(boxes)):
+            for j in range(i+1, len(boxes)):
                 obj2 = boxes[j]
                 dists.append((0, dist(obj1, obj2), obj1, obj2))
         # We could use dists.sort(), but it would randomize the test result.
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index 1469a6b..5d73ca4 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -41,6 +41,13 @@ class PDFResourceError(PDFException):
 
 class PDFInterpreterError(PDFException):
     pass
+##  Constants
+##
+LITERAL_PDF = LIT('PDF')
+LITERAL_TEXT = LIT('Text')
+LITERAL_FONT = LIT('Font')
+LITERAL_FORM = LIT('Form')
+LITERAL_IMAGE = LIT('Image')
 
 ##  PDFTextState
 ##
@@ -245,7 +252,8 @@ class PDFContentParser(PSStackParser):
         while i <= len(target):
             self.fillbuf()
             if i:
-                c = self.buf[self.charpos]
+                # indexbytes gives an int on Python 2 and 3; re-wrap it as a 1-byte string.
+                c = six.int2byte(six.indexbytes(self.buf, self.charpos))
                 data += c
                 self.charpos += 1
                 if len(target) <= i and c.isspace():
@@ -334,7 +342,7 @@ class PDFPageInterpreter(object):
         for (k, v) in six.iteritems(dict_value(resources)):
             logging.debug('Resource: %r: %r' % (k, v))
             if k == 'Font':
-                for (fontid, spec) in dict_value(v).iteritems():
+                for (fontid, spec) in six.iteritems(dict_value(v)):
                     objid = None
                     if isinstance(spec, PDFObjRef):
                         objid = spec.objid
@@ -346,7 +354,7 @@ class PDFPageInterpreter(object):
             elif k == 'ProcSet':
                 self.rsrcmgr.get_procset(list_value(v))
             elif k == 'XObject':
-                for (xobjid, xobjstrm) in dict_value(v).iteritems():
+                for (xobjid, xobjstrm) in six.iteritems(dict_value(v)):
                     self.xobjmap[xobjid] = xobjstrm
         return
 
diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py
index a13bf23..5755229 100644
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@@ -12,6 +12,10 @@ from .pdfdocument import PDFTextExtractionNotAllowed
 
 import six # Python 2+3 compatibility
 
+# Predefined literals compared against a page-tree node's 'Type' entry.
+LITERAL_PAGE = LIT('Page')
+LITERAL_PAGES = LIT('Pages')
+
 ##  PDFPage
 ##
 class PDFPage(object):
@@ -82,12 +86,12 @@ class PDFPage(object):
             for (k, v) in six.iteritems(parent):
                 if k in klass.INHERITABLE_ATTRS and k not in tree:
                     tree[k] = v
-            if tree.get('Type').name=='Pages' and 'Kids' in tree:
+            if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
                 logging.info('Pages: Kids=%r' % tree['Kids'])
                 for c in list_value(tree['Kids']):
                     for x in search(c, tree):
                         yield x
-            elif tree.get('Type').name=='Page':
+            elif tree.get('Type') is LITERAL_PAGE:
                 logging.info('Page: %r' % tree)
                 yield (objid, tree)
         pages = False
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 2514949..0ec01cf 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -128,7 +128,7 @@ def fsplit(pred, objs):
 def drange(v0, v1, d):
     """Returns a discrete range."""
     assert v0 < v1
-    return xrange(int(v0)//d, int(v1+d)//d)
+    return range(int(v0)//d, int(v1+d)//d)
 
 
 # get_bound
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
new file mode 100644
index 0000000..662ec9f
--- /dev/null
+++ b/tests/test_tools_pdf2txt.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""Smoke tests for tools/pdf2txt.py: each sample PDF must convert without raising."""
+
+import logging
+import os
+import unittest
+
+import tools.pdf2txt as pdf2txt
+
+path = os.path.dirname(os.path.abspath(__file__)) + '/'
+
+
+def run(datapath, filename, options=None):
+    """Convert path+datapath+filename+'.pdf' to path+filename+'.txt' via pdf2txt.main.
+
+    options, if given, is a space-separated string of extra pdf2txt flags.
+    """
+    infile = path + datapath + filename + '.pdf'
+    outfile = path + filename + '.txt'
+    # Build argv as a list rather than splitting one formatted string, so a
+    # checkout path containing spaces is not broken into several arguments.
+    argv = ['pdf2txt', '-o' + outfile]
+    if options:
+        argv.extend(options.split(' '))
+    argv.append(infile)
+    pdf2txt.main(argv)
+
+
+class TestDumpPDF(unittest.TestCase):
+    """Runs pdf2txt over the sample documents; no output is compared."""
+
+    def test_1(self):
+        run('../samples/', 'jo')
+        run('../samples/', 'simple1')
+        run('../samples/', 'simple2')
+        run('../samples/', 'simple3')
+
+    def test_2(self):
+        run('../samples/nonfree/', 'dmca')
+
+    def test_3(self):
+        run('../samples/nonfree/', 'f1040nr')
+
+    def test_4(self):
+        run('../samples/nonfree/', 'i1040nr')
+
+    def test_5(self):
+        run('../samples/nonfree/', 'kampo')
+
+    def test_6(self):
+        run('../samples/nonfree/', 'naacl06-shinyama')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index 5eb24bf..61d878f 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -28,7 +28,7 @@ def main(argv):
     # debug option
     debug = 0
     # input option
-    password = ''
+    password = b''
     pagenos = set()
     maxpages = 0
     # output option
@@ -82,7 +82,7 @@ def main(argv):
             elif outfile.endswith('.tag'):
                 outtype = 'tag'
     if outfile:
-        outfp = file(outfile, 'w')
+        outfp = open(outfile, 'wb')
     else:
         outfp = sys.stdout
     if outtype == 'text':
@@ -101,7 +101,7 @@ def main(argv):
     else:
         return usage()
     for fname in args:
-        fp = file(fname, 'rb')
+        fp = open(fname, 'rb')
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         for page in PDFPage.get_pages(fp, pagenos,
                                       maxpages=maxpages, password=password,