From 08cb217983d09ee5bcba80e918299b70c60d5df0 Mon Sep 17 00:00:00 2001
From: Cathal Garvey <cathalgarvey@cathalgarvey.me>
Date: Sat, 30 May 2015 16:14:24 +0100
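Subject: [PATCH] pdf2txt: add extract_text API; detect binary output streams

Work in progress; this commit is not atomic. Changes:
- PDFConverter records whether outfp is a binary stream (outfp_binary), and
  TextConverter.write_text encodes text for binary streams on Python 3.
- setup.py lists 'chardet' as a requirement only on Python 3.
- tools/pdf2txt.py gains extract_text() and extract_text_to_fp(); main()
  accepts an optional argument list and delegates to extract_text().
- tests call pdf2txt.main() without the leading program name.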

---
 pdfminer/converter.py       |  23 +++-
 setup.py                    |   3 +-
 tests/test_tools_pdf2txt.py |   2 +-
 tools/pdf2txt.py            | 225 ++++++++++++++++++++++++++++--------
 4 files changed, 202 insertions(+), 51 deletions(-)

diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 7dba21b..b0efc0d 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -150,6 +150,23 @@ class PDFConverter(PDFLayoutAnalyzer):
         PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
         self.outfp = outfp
         self.codec = codec
+        # Record whether outfp expects bytes so writers know when to encode text.
+        if hasattr(self.outfp, 'mode'):
+            # A 'b' in the mode string marks a binary stream.
+            self.outfp_binary = 'b' in self.outfp.mode
+        else:
+            import io
+            if isinstance(self.outfp, io.BytesIO):
+                self.outfp_binary = True
+            elif isinstance(self.outfp, io.StringIO):
+                self.outfp_binary = False
+            else:
+                try:
+                    # Probe with an empty string so nothing reaches the output.
+                    self.outfp.write(u"")
+                    self.outfp_binary = False
+                except TypeError:
+                    self.outfp_binary = True
         return
 
 
@@ -166,10 +183,8 @@ class TextConverter(PDFConverter):
 
     def write_text(self, text):
         text = utils.compatible_encode_method(text, self.codec, 'ignore')
-#        if six.PY2 and self.codec:
-#            text = text.encode(self.codec, 'ignore')
-#        if six.PY3 and isinstance(text, bytes):
-#            text = text.decode(self.codec, 'ignore')
+        if six.PY3 and self.outfp_binary:  # binary streams need bytes on Python 3
+            text = text.encode(self.codec or 'utf-8', 'ignore')  # honour the requested codec
         self.outfp.write(text)
         return
 
diff --git a/setup.py b/setup.py
index 1a2166f..617020e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,13 +2,14 @@
 #from distutils.core import setup
 from setuptools import setup
 from pdfminer import __version__
+import sys
 
 setup(
     name='pdfminer.six',
     version=__version__,
     packages=['pdfminer',],
     package_data={'pdfminer': ['cmap/*.pickle.gz']},
-    requires=['six', 'chardet'],
+    requires=['six', 'chardet'] if sys.version_info.major>2 else ['six'],
     description='PDF parser and analyzer',
     long_description='''fork of PDFMiner using six for Python 2+3 compatibility
     
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index 191267a..6d92df6 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -14,7 +14,7 @@ def run(datapath,filename,options=None):
         s='pdf2txt -o%s %s %s'%(o,options,i)
     else:
          s='pdf2txt -o%s %s'%(o,i)
-    pdf2txt.main(s.split(' '))
+    pdf2txt.main(s.split(' ')[1:])
 
 class TestDumpPDF():
     
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index c121b02..d74c4c5 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -16,17 +16,183 @@ from pdfminer.cmapdb import CMapDB
 from pdfminer.layout import LAParams
 from pdfminer.image import ImageWriter
 
+
+def _check_arg(arg_name, arg, arg_permitted, contains_permitted=None):
+    """
+    Assert that `arg` is an instance of `arg_permitted` and, if `contains_permitted` is
+    given and `arg` is non-empty, that every item in `arg` is an instance of that too.
+    `arg_name` only labels the AssertionError message. Returns None.
+    """
+    assert isinstance(arg, arg_permitted), ("Argument '{}' should be of type(s)"
+            " '{}' but is type '{}'").format(arg_name, arg_permitted, type(arg))
+    if contains_permitted is not None and arg:
+        for contained in arg:
+            assert isinstance(contained, contains_permitted), ("Value within"
+                    " argument '{}' should be of type '{}' but is '{}'"
+                    ).format(arg_name, contains_permitted, type(contained))
+
+def extract_text_to_fp(inf, outfp,
+                    output_type='text', codec='utf-8', laparams = None,
+                    maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                    layoutmode='normal', output_dir=None, strip_control=False,
+                    debug=False, disable_caching=False, **other):
+    """
+    Parses text from inf-file and writes to outfp file-like object.
+    Takes loads of optional arguments but the defaults are somewhat sane.
+    Beware laparams: Including an empty LAParams is not the same as passing None!
+    Returns nothing, acting as it does on two streams. Use StringIO to get strings.
+    Raises ValueError if output_type is not 'text', 'xml', 'html' or 'tag'.
+    The converter is closed afterwards; `debug` and extra keyword arguments are unused.
+    """
+    if six.PY2 and sys.stdin.encoding:
+        password = password.decode(sys.stdin.encoding)
+
+    imagewriter = ImageWriter(output_dir) if output_dir else None
+    rsrcmgr = PDFResourceManager(caching=not disable_caching)
+
+    # The text converter keeps sys.stdout itself; the others get its byte stream on Python 3.
+    if six.PY3 and outfp == sys.stdout and output_type != 'text':
+        outfp = sys.stdout.buffer
+
+    if output_type == 'text':
+        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                               imagewriter=imagewriter)
+    elif output_type == 'xml':
+        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                              imagewriter=imagewriter, stripcontrol=strip_control)
+    elif output_type == 'html':
+        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
+                               layoutmode=layoutmode, laparams=laparams,
+                               imagewriter=imagewriter)
+    elif output_type == 'tag':
+        device = TagExtractor(rsrcmgr, outfp, codec=codec)
+    else:
+        raise ValueError("Unknown output_type: {!r}".format(output_type))
+
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    for page in PDFPage.get_pages(inf,
+                                  page_numbers,
+                                  maxpages=maxpages,
+                                  password=password,
+                                  caching=not disable_caching,
+                                  check_extractable=True):
+        page.rotate = (page.rotate + rotation) % 360
+        interpreter.process_page(page)
+    device.close()
+
+def extract_text(files=None, outfile='-',
+                     _py2_no_more_posargs=None,  # Python 2 has no keyword-only arguments; this slot traps extra positionals.
+                     output_type='text', codec='utf-8', maxpages=0, page_numbers=None, password="", scale=1.0,
+                     all_texts=None, detect_vertical=None, word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
+                     debug=False, layoutmode='normal', no_laparams=False, rotation=0, output_dir=None,
+                     disable_caching=False, strip_control=False, pagenos=None):
+    """
+    Extract text from each PDF named in `files` and write it to `outfile`.
+
+    `outfile` is a path, or '-' for sys.stdout. When output_type is 'text' and the
+    path ends in .htm/.html/.xml/.tag, that extension selects the output type.
+    Unless no_laparams is set, an LAParams is built and every LAParams keyword
+    argument that is not None is set on it. `pagenos` is accepted but unused here.
+    Raises ValueError for extra positional arguments, for empty `files`, or for an
+    unknown output_type. Returns the output stream, left open for the caller.
+    """
+    if _py2_no_more_posargs is not None:
+        raise ValueError("Too many positional arguments passed.")
+    if not files:
+        raise ValueError("Must provide files to work upon!")
+
+    # Optional assertion-based type checks, enabled by debug. Not every argument is covered.
+    if debug:
+        for arg_name, arg, arg_permitted, contains_permitted in (
+                    ("files", files, list, str),
+                    ("outfile", outfile, str, None),
+                    ("password", password, str, None),
+                    ("scale", scale, float, None),
+                    ("output_type", output_type, str, None),
+                    ("codec", codec, str, None),
+                    ("maxpages", maxpages, int, None),
+                    ("page_numbers", page_numbers, (type(None), list, set), int)
+                ):
+            assert isinstance(arg, arg_permitted), ("Argument '{}' should be of type(s)"
+                    " '{}' but is type '{}'").format(arg_name, arg_permitted, type(arg))
+            if contains_permitted is not None and arg:
+                for contained in arg:
+                    assert isinstance(contained, contains_permitted), ("Value within"
+                            " argument '{}' should be of type '{}' but is '{}'"
+                            ).format(arg_name, contains_permitted, type(contained))
+
+    # Build LAParams from whichever layout arguments were given, unless disabled.
+    if no_laparams:
+        laparams = None
+    else:
+        laparams = LAParams()
+        for param, paramv in (("all_texts", all_texts), ("detect_vertical", detect_vertical),
+                              ("word_margin", word_margin), ("char_margin", char_margin),
+                              ("line_margin", line_margin), ("boxes_flow", boxes_flow)):
+            if paramv is not None:
+                setattr(laparams, param, paramv)
+
+    imagewriter = ImageWriter(output_dir) if output_dir else None
+    if six.PY2 and sys.stdin.encoding:
+        password = password.decode(sys.stdin.encoding)
+
+    if output_type == "text" and outfile != "-":
+        for override, alttype in ((".htm", "html"), (".html", "html"),
+                                  (".xml", "xml"), (".tag", "tag")):
+            if outfile.endswith(override):
+                output_type = alttype
+
+    # Reject unknown types before the output file is opened (and truncated).
+    if output_type not in ("text", "xml", "html", "tag"):
+        raise ValueError("Unknown output_type: {!r}".format(output_type))
+
+    if outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            codec = 'utf-8'
+    else:
+        outfp = open(outfile, "wb")
+
+    rsrcmgr = PDFResourceManager(caching=not disable_caching)
+
+    if output_type == 'text':
+        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                               imagewriter=imagewriter)
+
+    # On Python 3, stdout is swapped for its byte stream here: after the text converter is built, before the others.
+    if six.PY3 and outfp == sys.stdout:
+        outfp = sys.stdout.buffer
+
+    if output_type == 'xml':
+        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
+                              imagewriter=imagewriter, stripcontrol=strip_control)
+    elif output_type == 'html':
+        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
+                               laparams=laparams, imagewriter=imagewriter)
+    elif output_type == 'tag':
+        device = TagExtractor(rsrcmgr, outfp, codec=codec)
+
+    for fname in files:
+        with open(fname, "rb") as fp:
+            interpreter = PDFPageInterpreter(rsrcmgr, device)
+            for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
+                                          password=password, caching=not disable_caching,
+                                          check_extractable=True):
+                page.rotate = (page.rotate + rotation) % 360
+                interpreter.process_page(page)
+    device.close()
+    return outfp
+
 # main
-def main(argv):
+def main(args=None):
     import argparse
     P = argparse.ArgumentParser(description=__doc__)
-    P.add_argument("files", type=str, nargs="+", help="Files to process.")
+    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
     P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
     P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
-    P.add_argument("--page-numbers", type=int, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
     P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
     P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
-#    P.add_argument("-o", "--outfile", type=argparse.FileType("w"), default=sys.stdout, help="Output file (default stdout)")
     P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
     P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
     P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
@@ -43,7 +209,7 @@ def main(argv):
     P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
     P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
     P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
-    A = P.parse_args()
+    A = P.parse_args(args=args)
 
     if A.no_laparams:
         laparams = None
@@ -58,7 +224,7 @@ def main(argv):
         A.page_numbers = set([x-1 for x in A.page_numbers])
     if A.pagenos:
         A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
-    
+        
     imagewriter = None
     if A.output_dir:
         imagewriter = ImageWriter(A.output_dir)
@@ -67,56 +233,25 @@ def main(argv):
         A.password = A.password.decode(sys.stdin.encoding)
 
     if A.output_type == "text" and A.outfile != "-":
-        for override, alttype in (  (".htm", "html"),
+        for override, alttype in (  (".htm",  "html"),
                                     (".html", "html"),
-                                    (".xml", "xml"),
-                                    (".tag", "tag") ):
+                                    (".xml",  "xml" ),
+                                    (".tag",  "tag" ) ):
             if A.outfile.endswith(override):
                 A.output_type = alttype
 
     if A.outfile == "-":
         outfp = sys.stdout
         if outfp.encoding is not None:
+            # TODO: outfp.encoding is ignored and utf-8 is always used; confirm this is intended.
             A.codec = 'utf-8'
-            #A.codec = outfp.encoding
     else:
         outfp = open(A.outfile, "wb")
 
-    rsrcmgr = PDFResourceManager(caching=not A.disable_caching)
-
-    if A.output_type == 'text':
-        device = TextConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
-                               imagewriter=imagewriter)
-    elif A.output_type == 'xml':
-        if six.PY3 and outfp == sys.stdout:
-            outfp = sys.stdout.buffer
-        device = XMLConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
-                              imagewriter=imagewriter,
-                              stripcontrol=A.strip_control)
-    elif A.output_type == 'html':
-        if six.PY3 and outfp == sys.stdout:
-            outfp = sys.stdout.buffer
-        device = HTMLConverter(rsrcmgr, outfp, codec=A.codec, scale=A.scale,
-                               layoutmode=A.layoutmode, laparams=laparams,
-                               imagewriter=imagewriter)
-    elif A.output_type == 'tag':
-        if six.PY3 and outfp == sys.stdout:
-            outfp = sys.stdout.buffer
-        device = TagExtractor(rsrcmgr, outfp, codec=A.codec)
-    else:
-        return usage()
-    for fname in A.files:
-        fp = open(fname, 'rb')
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-        for page in PDFPage.get_pages(fp, A.page_numbers,
-                                      maxpages=A.maxpages, password=A.password,
-                                      caching=not A.disable_caching, check_extractable=True):
-            page.rotate = (page.rotate + A.rotation) % 360
-            interpreter.process_page(page)
-        fp.close()
-    device.close()
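+    # Delegate to extract_text(). NOTE: any file opened as outfp above is replaced by the return value without being closed.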
+    outfp = extract_text(**vars(A))
     outfp.close()
-    return
+    return None
 
 def main_old(argv):
     import getopt
@@ -222,4 +357,4 @@ def main_old(argv):
     return
 
 #if __name__ == '__main__': sys.exit(main_old(sys.argv))
-if __name__ == '__main__': sys.exit(main(sys.argv))
+if __name__ == '__main__': sys.exit(main())