def write_text(self, text):
    """Write *text* to the output stream, converting it for the stream.

    When a codec is configured, the text is passed through
    ``utils.compatible_encode_method`` (called with the ``'ignore'``
    error policy) so the value handed to ``outfp.write`` matches what
    the running Python version expects.  When no codec is configured
    the text is written unchanged, as it was before the Python 2+3
    helper was introduced; calling the helper with a missing codec
    would otherwise fail inside ``encode``/``decode``.

    :param text: the text fragment to emit.
    """
    if self.codec:
        text = utils.compatible_encode_method(text, self.codec, 'ignore')
    self.outfp.write(text)
    return
def begin_page(self, page, ctm):
    """Write the opening ``<page>`` tag for *page* to the output stream.

    The tag carries the current page number, the page's media box
    (formatted by ``utils.bbox2str``) and its rotation.  The template
    must contain one placeholder per value: formatting three values
    into an empty template raises ``TypeError``.

    :param page: the page being started; ``mediabox`` and ``rotate``
        are read from it.
    :param ctm: current transformation matrix (unused here).
    """
    output = '<page id="%s" bbox="%s" rotate="%d">' % (
        self.pageno, utils.bbox2str(page.mediabox), page.rotate)
    # The output stream is binary, so text must be converted on Py3.
    self.outfp.write(utils.make_compat_bytes(output))
    return
def make_compat_bytes(in_str):
    """Return *in_str* as a byte string suitable for a binary stream.

    On Python 2 the native ``str`` already is a byte string and is
    returned unchanged.  On Python 3 text is encoded as UTF-8 (the
    default of ``str.encode()``); values that are already ``bytes`` are
    passed through.

    :raises TypeError: if *in_str* is neither ``bytes`` nor ``str``.
    """
    if isinstance(in_str, bytes):
        # Py2 native str, or Py3 bytes: nothing to convert.
        return in_str
    if isinstance(in_str, str):
        return in_str.encode('utf-8')
    # Explicit raise instead of ``assert`` so the check survives ``-O``.
    raise TypeError("make_compat_bytes expects str or bytes, got {}"
                    .format(type(in_str)))


def make_compat_str(in_str):
    """Return *in_str* as a native ``str``, guessing the encoding of bytes.

    Only Python 3 ``bytes`` are converted.  Native strings are returned
    unchanged, and so is any other object: callers run this over mixed
    sequences that also contain numbers and filter on
    ``isinstance(obj, str)`` afterwards.

    The encoding is guessed with ``chardet`` when it is importable.
    ``chardet`` reports ``None`` when it cannot decide (for example on
    empty input), so the guess is followed by UTF-8 and finally by
    Latin-1, which can decode every byte value.
    """
    if isinstance(in_str, str) or not isinstance(in_str, bytes):
        return in_str

    try:
        import chardet
    except ImportError:
        guessed = None
    else:
        guessed = chardet.detect(in_str).get('encoding')

    for codec in (guessed, 'utf-8'):
        if not codec:
            continue
        try:
            return in_str.decode(codec)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or a wrong guess: try the next one.
            continue
    return in_str.decode('latin-1')


def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
    """Convert a value the way Py2-era ``text.encode(codec)`` calls intended.

    Code written for Python 2 calls ``encode`` to obtain something that
    can be written to a native text stream.  The equivalent depends on
    the interpreter and on the input:

    * Python 3 ``str``   -> returned unchanged
    * Python 3 ``bytes`` -> decoded with *encoding*
    * Python 2 ``str`` / ``unicode`` -> encoded with *encoding*

    :param bytesorstring: the value to convert.
    :param encoding: codec name; when empty or ``None`` the value is
        returned unchanged (no codec means "do not convert").
    :param erraction: error policy forwarded to ``encode``/``decode``.
    :raises TypeError: if the value is not a string type.
    """
    if not encoding:
        return bytesorstring

    is_bytes = isinstance(bytesorstring, bytes)
    is_native = isinstance(bytesorstring, str)
    if is_native and not is_bytes:
        # Py3 text (on Py2 ``str`` is ``bytes``, so this cannot match).
        return bytesorstring
    if is_bytes and not is_native:
        # Py3 bytes.
        return bytesorstring.decode(encoding, erraction)
    if isinstance(bytesorstring, six.string_types):
        # Py2: both ``str`` and ``unicode`` (the usual text type there).
        return bytesorstring.encode(encoding, erraction)
    raise TypeError("compatible_encode_method expects a string type, got {}"
                    .format(type(bytesorstring)))
def _build_arg_parser():
    """Return the ``argparse`` parser describing the pdf2txt command line."""
    import argparse
    parser = argparse.ArgumentParser(description=__doc__)
    add = parser.add_argument
    add("files", type=str, nargs="+", help="Files to process.")
    add("-d", "--debug", default=False, action="store_true", help="Debug output.")
    add("-p", "--pagenos", type=str,
        help="Comma-separated list of page numbers to parse. Included for "
             "legacy applications, use --page-numbers for more idiomatic "
             "argument entry.")
    add("--page-numbers", type=int, nargs="+",
        help="Alternative to --pagenos with space-separated numbers; "
             "supersedes --pagenos where it is used.")
    add("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
    add("-P", "--password", type=str, default="", help="Decryption password for PDF")
    add("-o", "--outfile", type=str, default="-",
        help="Output file (default/'-' is stdout)")
    # ``choices`` makes argparse reject unknown output types up front.
    add("-t", "--output_type", type=str, default="text",
        choices=("text", "html", "xml", "tag"),
        help="Output type: text|html|xml|tag (default is text)")
    add("-c", "--codec", type=str, default="utf-8", help="Text encoding")
    add("-s", "--scale", type=float, default=1.0, help="Scale")
    add("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    add("-V", "--detect-vertical", default=None, action="store_true",
        help="LAParams detect vertical")
    add("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
    add("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
    add("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
    add("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
    add("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    add("-n", "--no-laparams", default=False, action="store_true",
        help="Pass None as LAParams")
    add("-R", "--rotation", default=0, type=int, help="Rotation")
    add("-O", "--output-dir", default=None, help="Output directory for images")
    add("-C", "--disable-caching", default=False, action="store_true",
        help="Disable caching")
    add("-S", "--strip-control", default=False, action="store_true",
        help="Strip control in XML mode")
    return parser


def main(argv):
    """Convert the PDF files named on the command line.

    :param argv: full argument vector including the program name, as in
        ``sys.argv``; ``argv[1:]`` is parsed.  ``None`` falls back to
        ``sys.argv`` (argparse's default).
    :returns: ``None``, so ``sys.exit(main(sys.argv))`` exits with 0.
        Invalid arguments terminate through argparse (``SystemExit``
        with status 2).
    """
    parser = _build_arg_parser()
    A = parser.parse_args(argv[1:] if argv is not None else None)

    if A.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # Layout parameters: only override the LAParams defaults the user set.
    if A.no_laparams:
        laparams = None
    else:
        laparams = LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            value = getattr(A, param, None)
            if value is not None:
                setattr(laparams, param, value)

    # Page selection is 1-based on the command line, 0-based internally.
    # --page-numbers takes precedence over the legacy --pagenos, as the
    # help text promises.
    if A.page_numbers:
        pagenos = {x - 1 for x in A.page_numbers}
    elif A.pagenos:
        pagenos = {int(x) - 1 for x in A.pagenos.split(",")}
    else:
        pagenos = None

    imagewriter = ImageWriter(A.output_dir) if A.output_dir else None

    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)

    # With the default type, infer the output type from the file extension.
    if A.output_type == "text" and A.outfile != "-":
        for suffix, alttype in ((".htm", "html"), (".html", "html"),
                                (".xml", "xml"), (".tag", "tag")):
            if A.outfile.endswith(suffix):
                A.output_type = alttype

    text_output = A.output_type == "text"
    if A.outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            A.codec = 'utf-8'
        if six.PY3 and not text_output:
            # The non-text converters write bytes.
            outfp = sys.stdout.buffer
    elif six.PY3 and text_output:
        # TextConverter hands ``str`` to the stream on Py3, so the file
        # must be opened in text mode; newline="" avoids translating
        # line endings, matching what binary mode would have written.
        import io
        outfp = io.open(A.outfile, "w", encoding=A.codec or "utf-8",
                        errors="ignore", newline="")
    else:
        outfp = open(A.outfile, "wb")

    rsrcmgr = PDFResourceManager(caching=not A.disable_caching)

    if text_output:
        device = TextConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif A.output_type == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=A.strip_control)
    elif A.output_type == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=A.codec, scale=A.scale,
                               layoutmode=A.layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif A.output_type == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=A.codec)
    else:
        # Unreachable while ``choices`` is in place; kept as a safety net.
        parser.error("unknown output type: %s" % A.output_type)

    try:
        for fname in A.files:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=A.maxpages,
                                              password=A.password,
                                              caching=not A.disable_caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + A.rotation) % 360
                    interpreter.process_page(page)
    finally:
        # Always release the device and the output stream, even when a
        # page fails to process.
        device.close()
        outfp.close()
    return