Many changes to make pdf2txt.py work better in Py3, some in that script, others in module!

Sorry, these changes should have been more atomic. *In pdf2txt.py:* * Rewrote the main function to use argparse instead of getopt. * Manually tested in Py2/Py3 to get partial consistency. * Errors still abound, including in Tags mode, but most modes weren't working at all in Py3 anyway. * Py2 mode is *probably* unchanged; no bugs found yet... * Kept the old main function (as main_old) for posterity, for now. *In utils:* * Added a few compatibility functions (some of the string hacks require chardet, a new dependency): - make_compat_bytes(in_str) -> (py3->bytes | py2->str) - make_compat_str(in_str) -> (str) - compatible_encode_method(bytesorstring, encoding, erraction) -> (str) *In pdfdevice:* * To handle different output filetypes in Py3, injected lots of calls to the new utils functions, as well as some six.PY2/six.PY3 checks and logic. These changes are largely responsible for the improved Py2/Py3 consistency. *In converter:* * To handle output filetypes in Py2, injected a few checks and fixes, particularly around the Py2 `str.encode` method and its *assumed* usual equivalents in Py3.
2015-05-17 21:08:57 +01:00 · 2015-05-17 21:08:57 +01:00 · 1b47bed306
parent 448aa08bc4
commit 1b47bed306
5 changed files with 169 additions and 25 deletions
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -20,6 +20,7 @@ from .utils import apply_matrix_pt
 from .utils import mult_matrix
 from .utils import enc
 from .utils import bbox2str
 from . import utils
 import six # Python 2+3 compatibility
@ -164,8 +165,11 @@ class TextConverter(PDFConverter):
        return
    def write_text(self, text):
-        if self.codec:
+        text = utils.compatible_encode_method(text, self.codec, 'ignore')
-            text = text.encode(self.codec, 'ignore')
+#        if six.PY2 and self.codec:
 #            text = text.encode(self.codec, 'ignore')
 #        if six.PY3 and isinstance(text, bytes):
 #            text = text.decode(self.codec, 'ignore')
        self.outfp.write(text)
        return
--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -1,11 +1,7 @@
 #!/usr/bin/env python
 from .utils import mult_matrix
 from .utils import translate_matrix
 from .utils import enc
 from .utils import bbox2str
 from .utils import isnumber
 from .pdffont import PDFUnicodeNotDefined
 from . import utils
 ##  PDFDevice
 ##
@ -62,7 +58,7 @@ class PDFDevice(object):
 class PDFTextDevice(PDFDevice):
    def render_string(self, textstate, seq):
-        matrix = mult_matrix(textstate.matrix, self.ctm)
+        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        scaling = textstate.scaling * .01
@ -87,14 +83,14 @@ class PDFTextDevice(PDFDevice):
        (x, y) = pos
        needcharspace = False
        for obj in seq:
-            if isnumber(obj):
+            if utils.isnumber(obj):
                x -= obj*dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
-                    x += self.render_char(translate_matrix(matrix, (x, y)),
+                    x += self.render_char(utils.translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        x += wordspace
@ -106,14 +102,14 @@ class PDFTextDevice(PDFDevice):
        (x, y) = pos
        needcharspace = False
        for obj in seq:
-            if isnumber(obj):
+            if utils.isnumber(obj):
                y -= obj*dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
-                    y += self.render_char(translate_matrix(matrix, (x, y)),
+                    y += self.render_char(utils.translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        y += wordspace
@ -140,6 +136,7 @@ class TagExtractor(PDFDevice):
        font = textstate.font
        text = ''
        for obj in seq:
            obj = utils.make_compat_str(obj)
            if not isinstance(obj, str):
                continue
            chars = font.decode(obj)
@ -148,33 +145,36 @@ class TagExtractor(PDFDevice):
                    char = font.to_unichr(cid)
                    text += char
                except PDFUnicodeNotDefined:
                    print(chars)
                    pass
-        self.outfp.write(enc(text, self.codec))
+        self.outfp.write(utils.enc(text, self.codec))
        return
    def begin_page(self, page, ctm):
-        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
+        output = '<page id="%s" bbox="%s" rotate="%d">' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
-                         (self.pageno, bbox2str(page.mediabox), page.rotate))
+        self.outfp.write(utils.make_compat_bytes(output))
        return
    def end_page(self, page):
-        self.outfp.write('</page>\n')
+        self.outfp.write(utils.make_compat_bytes('</page>\n'))
        self.pageno += 1
        return
    def begin_tag(self, tag, props=None):
        s = ''
        if isinstance(props, dict):
-            s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
+            s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
                        in sorted(props.iteritems()))
-        self.outfp.write('<%s%s>' % (enc(tag.name), s))
+        out_s = '<%s%s>' % (utils.enc(tag.name), s)
        self.outfp.write(utils.make_compat_bytes(out_s))
        self._stack.append(tag)
        return
    def end_tag(self):
        assert self._stack
        tag = self._stack.pop(-1)
-        self.outfp.write('</%s>' % enc(tag.name))
+        out_s = '</%s>' % utils.enc(tag.name)
        self.outfp.write(utils.make_compat_bytes(out_s))
        return
    def do_tag(self, tag, props=None):
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -3,9 +3,40 @@
 Miscellaneous Routines.
 """
 import struct
-INF=2147483647 #from sys import maxint as INF #doesn't work anymore under Python3, but PDF still uses 32 bits ints
+# from sys import maxint as INF #doesn't work anymore under Python3,
 # but PDF still uses 32 bits ints
 INF = (1<<31) - 1
 import six  #Python 2+3 compatibility
 import chardet  # For str encoding detection in Py3
 def make_compat_bytes(in_str):
    """Return *in_str* in the form expected by a binary output stream.

    On Python 2 the native ``str`` is returned unchanged; on Python 3 the
    text is encoded with ``str.encode()``'s default codec (UTF-8) and the
    resulting ``bytes`` object is returned.

    The argument must be a native ``str``; anything else trips the assert.
    """
    assert isinstance(in_str, str)
    # Only Python 3 needs the conversion step.
    return in_str if six.PY2 else in_str.encode()
 def make_compat_str(in_str):
    """Return *in_str* as a native ``str``, guessing the encoding of bytes.

    On Python 2 the value is returned unchanged.  On Python 3 a ``bytes``
    value is decoded using the encoding reported by ``chardet.detect``;
    a ``str`` value is returned unchanged.

    When chardet cannot name an encoding (it reports ``None``, e.g. for
    empty input), or names one that Python does not know or that fails to
    decode the data, the bytes are decoded as UTF-8 with undecodable bytes
    replaced, instead of raising.

    The argument must be ``bytes`` or ``str``; anything else trips the
    assert.
    """
    assert isinstance(in_str, (bytes, str))
    if six.PY3 and isinstance(in_str, bytes):
        guess = chardet.detect(in_str)
        # 'encoding' is None when detection fails; decode(None) would
        # raise TypeError, so fall back to UTF-8 in that case.
        encoding = guess.get('encoding') or 'utf-8'
        try:
            in_str = in_str.decode(encoding)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or a wrong guess: degrade gracefully.
            in_str = in_str.decode('utf-8', 'replace')
    return in_str
 def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
    """Stand-in for a Py2-style ``text.encode(encoding, erraction)`` call.

    bytesorstring: the text to convert.
    encoding: codec name; a falsy value (``None`` or ``''``) means "no
        codec configured" and the input is returned untouched.
    erraction: error handler passed to ``encode``/``decode``.

    Python 2: the value is encoded with ``.encode(encoding, erraction)``.
    Python 3: a ``str`` is returned unchanged and ``bytes`` are decoded
    with ``.decode(encoding, erraction)``, so the result is always text.

    Raises AssertionError when the value is not a string type.
    """
    if not encoding:
        # Callers may pass an unset codec; encode(None)/decode(None) would
        # raise TypeError, so treat "no codec" as "no conversion".
        return bytesorstring
    if six.PY2:
        # six.string_types is basestring on Py2, so unicode text (the usual
        # thing .encode() is called on) is accepted as well as byte str.
        assert isinstance(bytesorstring, six.string_types), ("Error: Assumed was calling"
            " encode() on a string in Py2: {}").format(type(bytesorstring))
        return bytesorstring.encode(encoding, erraction)
    if isinstance(bytesorstring, str):
        return bytesorstring
    assert isinstance(bytesorstring, bytes), ("Error: Assumed was calling"
        " encode() on a bytes in Py3: {}").format(type(bytesorstring))
    return bytesorstring.decode(encoding, erraction)
 ##  PNG Predictor
 ##
--- a/setup.py
+++ b/setup.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python
-from distutils.core import setup
+#from distutils.core import setup
 from setuptools import setup
 from pdfminer import __version__
 setup(
@ -7,7 +8,7 @@ setup(
    version=__version__,
    packages=['pdfminer',],
    package_data={'pdfminer': ['cmap/*.pickle.gz']},
-    requires=['six'],
+    requires=['six', 'chardet'],
    description='PDF parser and analyzer',
    long_description='''fork of PDFMiner using six for Python 2+3 compatibility
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -1,5 +1,11 @@
 #!/usr/bin/env python
 """
 Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
 """
 import sys
 import logging
 import six
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -9,11 +15,110 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
 from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
 from pdfminer.image import ImageWriter
 import logging
 import six
 # main
 def main(argv):
    """Command-line entry point: convert PDF files to text, html, xml or tags.

    argv: the full argument vector with the program name first (the caller
        passes ``sys.argv``); everything after the first item is parsed.

    Returns None.  Invalid usage exits through argparse's error handling.
    """
    import argparse
    output_types = ("text", "html", "xml", "tag")
    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("files", type=str, nargs="+", help="Files to process.")
    # NOTE(review): --debug is accepted but nothing in this function reads it.
    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
    P.add_argument("--page-numbers", type=int, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
    P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
    P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
    # 'choices' makes argparse reject an unknown type before any file is opened.
    P.add_argument("-t", "--output_type", type=str, default="text", choices=output_types, help = "Output type: text|html|xml|tag (default is text)")
    P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
    P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
    P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
    P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    # Parse the vector we were handed rather than silently re-reading sys.argv.
    A = P.parse_args(argv[1:])
    if A.no_laparams:
        laparams = None
    else:
        laparams = LAParams()
        # Only override the layout defaults the user set explicitly.
        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
            param_arg = getattr(A, param, None)
            if param_arg is not None:
                setattr(laparams, param, param_arg)
    # Page numbers are given 1-based and converted to 0-based here.
    # --page-numbers takes precedence over the legacy --pagenos, as its help
    # text promises.
    if A.page_numbers:
        A.page_numbers = {x - 1 for x in A.page_numbers}
    elif A.pagenos:
        A.page_numbers = {int(x) - 1 for x in A.pagenos.split(",")}
    imagewriter = None
    if A.output_dir:
        imagewriter = ImageWriter(A.output_dir)
    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)
    # With the default type, let the output file's extension pick the format.
    if A.output_type == "text" and A.outfile != "-":
        for override, alttype in (  (".htm", "html"),
                                    (".html", "html"),
                                    (".xml", "xml"),
                                    (".tag", "tag") ):
            if A.outfile.endswith(override):
                A.output_type = alttype
    if A.outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            A.codec = 'utf-8'
        # The non-text converters are handed a binary stream on Py3.
        if six.PY3 and A.output_type != 'text':
            outfp = sys.stdout.buffer
    elif six.PY3 and A.output_type == 'text':
        # Text mode is fed str on Py3 (as with sys.stdout above), so the
        # file must be opened in text mode rather than "wb".
        outfp = open(A.outfile, "w", encoding=A.codec, errors="ignore")
    else:
        outfp = open(A.outfile, "wb")
    try:
        rsrcmgr = PDFResourceManager(caching=not A.disable_caching)
        if A.output_type == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif A.output_type == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=A.strip_control)
        elif A.output_type == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=A.codec, scale=A.scale,
                                   layoutmode=A.layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif A.output_type == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=A.codec)
        else:
            # Unreachable while 'choices' is in force; kept as a safety net.
            # (No usage() helper exists in this function's scope.)
            P.error("unknown output type: %r" % (A.output_type,))
        for fname in A.files:
            # 'with' closes the input even if page processing raises.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, A.page_numbers,
                                              maxpages=A.maxpages, password=A.password,
                                              caching=not A.disable_caching, check_extractable=True):
                    page.rotate = (page.rotate + A.rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        outfp.close()
    return
 def main_old(argv):
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
@ -98,6 +203,8 @@ def main(argv):
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        if six.PY3 and outfp == sys.stdout:
            outfp = sys.stdout.buffer
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
@ -114,4 +221,5 @@ def main(argv):
    outfp.close()
    return
 #if __name__ == '__main__': sys.exit(main_old(sys.argv))
 if __name__ == '__main__': sys.exit(main(sys.argv))