Many changes to make pdf2txt.py work better in Py3: some in that script, others in the pdfminer module

Sorry, these changes should have been more atomic.

*In pdf2txt.py:*
* Rewrote the main function to use argparse instead of getopt.
* Manually tested in Py2/Py3 to get partial consistency.
* Errors abound, including in Tags mode, but most modes weren't working at all in Py3 anyway.
* Py2 behaviour is *probably* unchanged; I cannot find any bugs yet...
* Kept the old main function (as `main_old`) for posterity, for now.

*In utils:*
* Added a few compatibility functions (some of the string hacks required chardet, a new dependency):
  - make_compat_bytes(in_str) -> (py3: bytes | py2: str)
  - make_compat_str(in_str) -> (str)
  - compatible_encode_method(bytesorstring, encoding, erraction) -> (str)

*In pdfdevice:*
* To handle the different output file types in Py3, injected lots of calls to the new utils functions, as well as some six.PY2/six.PY3 checks and logic. These changes are largely responsible for the improved Py2/Py3 consistency.

*In converter:*
* To handle output file types in Py2, injected a few checks and fixes, particularly around the Py2 `str.encode` method and its *assumed* usual analogues in Py3.
2015-05-17 21:08:57 +01:00 · 2015-05-17 21:08:57 +01:00 · 1b47bed306
parent 448aa08bc4
commit 1b47bed306
5 changed files with 169 additions and 25 deletions
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -20,6 +20,7 @@ from .utils import apply_matrix_pt
 from .utils import mult_matrix
 from .utils import enc
 from .utils import bbox2str
+from . import utils

 import six # Python 2+3 compatibility

@ -164,8 +165,11 @@ class TextConverter(PDFConverter):
        return

    def write_text(self, text):
-        if self.codec:
-            text = text.encode(self.codec, 'ignore')
+        text = utils.compatible_encode_method(text, self.codec, 'ignore')
+#        if six.PY2 and self.codec:
+#            text = text.encode(self.codec, 'ignore')
+#        if six.PY3 and isinstance(text, bytes):
+#            text = text.decode(self.codec, 'ignore')
        self.outfp.write(text)
        return

--- a/pdfminer/pdfdevice.py
+++ b/pdfminer/pdfdevice.py
@ -1,11 +1,7 @@
 #!/usr/bin/env python
-from .utils import mult_matrix
-from .utils import translate_matrix
-from .utils import enc
-from .utils import bbox2str
-from .utils import isnumber
 from .pdffont import PDFUnicodeNotDefined

+from . import utils

 ##  PDFDevice
 ##
@ -62,7 +58,7 @@ class PDFDevice(object):
 class PDFTextDevice(PDFDevice):

    def render_string(self, textstate, seq):
-        matrix = mult_matrix(textstate.matrix, self.ctm)
+        matrix = utils.mult_matrix(textstate.matrix, self.ctm)
        font = textstate.font
        fontsize = textstate.fontsize
        scaling = textstate.scaling * .01
@ -87,14 +83,14 @@ class PDFTextDevice(PDFDevice):
        (x, y) = pos
        needcharspace = False
        for obj in seq:
-            if isnumber(obj):
+            if utils.isnumber(obj):
                x -= obj*dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        x += charspace
-                    x += self.render_char(translate_matrix(matrix, (x, y)),
+                    x += self.render_char(utils.translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        x += wordspace
@ -106,14 +102,14 @@ class PDFTextDevice(PDFDevice):
        (x, y) = pos
        needcharspace = False
        for obj in seq:
-            if isnumber(obj):
+            if utils.isnumber(obj):
                y -= obj*dxscale
                needcharspace = True
            else:
                for cid in font.decode(obj):
                    if needcharspace:
                        y += charspace
-                    y += self.render_char(translate_matrix(matrix, (x, y)),
+                    y += self.render_char(utils.translate_matrix(matrix, (x, y)),
                                          font, fontsize, scaling, rise, cid)
                    if cid == 32 and wordspace:
                        y += wordspace
@ -140,6 +136,7 @@ class TagExtractor(PDFDevice):
        font = textstate.font
        text = ''
        for obj in seq:
+            obj = utils.make_compat_str(obj)
            if not isinstance(obj, str):
                continue
            chars = font.decode(obj)
@ -148,33 +145,36 @@ class TagExtractor(PDFDevice):
                    char = font.to_unichr(cid)
                    text += char
                except PDFUnicodeNotDefined:
+                    print(chars)
                    pass
-        self.outfp.write(enc(text, self.codec))
+        self.outfp.write(utils.enc(text, self.codec))
        return

    def begin_page(self, page, ctm):
-        self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
-                         (self.pageno, bbox2str(page.mediabox), page.rotate))
+        output = '<page id="%s" bbox="%s" rotate="%d">' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
+        self.outfp.write(utils.make_compat_bytes(output))
        return

    def end_page(self, page):
-        self.outfp.write('</page>\n')
+        self.outfp.write(utils.make_compat_bytes('</page>\n'))
        self.pageno += 1
        return

    def begin_tag(self, tag, props=None):
        s = ''
        if isinstance(props, dict):
-            s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
+            s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
                        in sorted(props.iteritems()))
-        self.outfp.write('<%s%s>' % (enc(tag.name), s))
+        out_s = '<%s%s>' % (utils.enc(tag.name), s)
+        self.outfp.write(utils.make_compat_bytes(out_s))
        self._stack.append(tag)
        return

    def end_tag(self):
        assert self._stack
        tag = self._stack.pop(-1)
-        self.outfp.write('</%s>' % enc(tag.name))
+        out_s = '</%s>' % utils.enc(tag.name)
+        self.outfp.write(utils.make_compat_bytes(out_s))
        return

    def do_tag(self, tag, props=None):
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -3,9 +3,40 @@
 Miscellaneous Routines.
 """
 import struct
-INF=2147483647 #from sys import maxint as INF #doesn't work anymore under Python3, but PDF still uses 32 bits ints
+# from sys import maxint as INF #doesn't work anymore under Python3,
+# but PDF still uses 32 bits ints
+INF = (1<<31) - 1

-import six #Python 2+3 compatibility
+import six  #Python 2+3 compatibility
+import chardet  # For str encoding detection in Py3
+
+def make_compat_bytes(in_str):
+    "In Py2, does nothing. In Py3, converts to bytes, encoding to unicode."
+    assert isinstance(in_str, str)
+    if six.PY2:
+        return in_str
+    else:
+        return in_str.encode()
+
+def make_compat_str(in_str):
+    "In Py2, does nothing. In Py3, converts to string, guessing encoding."
+    assert isinstance(in_str, (bytes, str))
+    if six.PY3 and isinstance(in_str, bytes):
+        enc = chardet.detect(in_str)
+        in_str = in_str.decode(enc['encoding'])
+    return in_str
+
+def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
+    "When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."
+    if six.PY2:
+        assert isinstance(bytesorstring, str), ("Error: Assumed was calling"
+            " encode() on a string in Py2: {}").format(type(bytesorstring))
+        return bytesorstring.encode(encoding, erraction)
+    if six.PY3:
+        if isinstance(bytesorstring, str): return bytesorstring
+        assert isinstance(bytesorstring, bytes), ("Error: Assumed was calling"
+            " encode() on a bytes in Py3: {}").format(type(bytesorstring))
+        return bytesorstring.decode(encoding, erraction)

 ##  PNG Predictor
 ##
--- a/setup.py
+++ b/setup.py
@ -1,5 +1,6 @@
 #!/usr/bin/env python
-from distutils.core import setup
+#from distutils.core import setup
+from setuptools import setup
 from pdfminer import __version__

 setup(
@ -7,7 +8,7 @@ setup(
    version=__version__,
    packages=['pdfminer',],
    package_data={'pdfminer': ['cmap/*.pickle.gz']},
-    requires=['six'],
+    requires=['six', 'chardet'],
    description='PDF parser and analyzer',
    long_description='''fork of PDFMiner using six for Python 2+3 compatibility
    
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@ -1,5 +1,11 @@
 #!/usr/bin/env python
+"""
+Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
+"""
 import sys
+import logging
+import six
+
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -9,11 +15,110 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
 from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
 from pdfminer.image import ImageWriter
-import logging
-import six

 # main
 def main(argv):
+    import argparse
+    P = argparse.ArgumentParser(description=__doc__)
+    P.add_argument("files", type=str, nargs="+", help="Files to process.")
+    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
+    P.add_argument("--page-numbers", type=int, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+    P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
+    P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
+#    P.add_argument("-o", "--outfile", type=argparse.FileType("w"), default=sys.stdout, help="Output file (default stdout)")
+    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
+    P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
+    P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
+    P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
+    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+    P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
+    P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
+    P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
+    P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
+    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
+    P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
+    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+    A = P.parse_args()
+
+    if A.no_laparams:
+        laparams = None
+    else:
+        laparams = LAParams()
+        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+            param_arg = getattr(A, param, None)
+            if param_arg is not None:
+                setattr(laparams, param, param_arg)
+
+    if A.page_numbers:
+        A.page_numbers = set([x-1 for x in A.page_numbers])
+    if A.pagenos:
+        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
+    
+    imagewriter = None
+    if A.output_dir:
+        imagewriter = ImageWriter(A.output_dir)
+
+    if six.PY2 and sys.stdin.encoding:
+        A.password = A.password.decode(sys.stdin.encoding)
+
+    if A.output_type == "text" and A.outfile != "-":
+        for override, alttype in (  (".htm", "html"),
+                                    (".html", "html"),
+                                    (".xml", "xml"),
+                                    (".tag", "tag") ):
+            if A.outfile.endswith(override):
+                A.output_type = alttype
+
+    if A.outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            A.codec = 'utf-8'
+            #A.codec = outfp.encoding
+    else:
+        outfp = open(A.outfile, "wb")
+
+    rsrcmgr = PDFResourceManager(caching=not A.disable_caching)
+
+    if A.output_type == 'text':
+        device = TextConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
+                               imagewriter=imagewriter)
+    elif A.output_type == 'xml':
+        if six.PY3 and outfp == sys.stdout:
+            outfp = sys.stdout.buffer
+        device = XMLConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
+                              imagewriter=imagewriter,
+                              stripcontrol=A.strip_control)
+    elif A.output_type == 'html':
+        if six.PY3 and outfp == sys.stdout:
+            outfp = sys.stdout.buffer
+        device = HTMLConverter(rsrcmgr, outfp, codec=A.codec, scale=A.scale,
+                               layoutmode=A.layoutmode, laparams=laparams,
+                               imagewriter=imagewriter)
+    elif A.output_type == 'tag':
+        if six.PY3 and outfp == sys.stdout:
+            outfp = sys.stdout.buffer
+        device = TagExtractor(rsrcmgr, outfp, codec=A.codec)
+    else:
+        return usage()
+    for fname in A.files:
+        fp = open(fname, 'rb')
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.get_pages(fp, A.page_numbers,
+                                      maxpages=A.maxpages, password=A.password,
+                                      caching=not A.disable_caching, check_extractable=True):
+            page.rotate = (page.rotate + A.rotation) % 360
+            interpreter.process_page(page)
+        fp.close()
+    device.close()
+    outfp.close()
+    return
+
+def main_old(argv):
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
@ -98,6 +203,8 @@ def main(argv):
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
+        if six.PY3 and outfp == sys.stdout:
+            outfp = sys.stdout.buffer
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
@ -114,4 +221,5 @@ def main(argv):
    outfp.close()
    return

+#if __name__ == '__main__': sys.exit(main_old(sys.argv))
 if __name__ == '__main__': sys.exit(main(sys.argv))