Many changes to make pdf2txt.py work better in Py3: some in that script, others in the pdfminer module.

Sorry, changes should have been more atomic.

*In pdf2txt.py:*

* Re-wrote main function to use argparse instead of optparse.
* Manually tested in Py2/Py3 to get partial consistency.
* Errors still abound (including in Tags mode), but most modes weren't working at all in Py3 anyway.
* Py2 mode *probably* unchanged, cannot find any bugs yet...
* Kept old main function for posterity, for now.

*In utils:*

* Added a few compatibility functions (some of the string hacks require chardet, a new dependency):
    - make_compat_bytes(in_str)-> (py3->bytes | py2->str)
    - make_compat_str(in_str)-> (str)
    - compatible_encode_method(bytesorstring, encoding, erraction)-> (str)

*In pdfdevice:*

* To handle different output filetypes in Py3, injected lots of calls to new utils methods,
  as well as some six.PYX checks and logic. These changes are largely responsible for
  enhanced Py2/Py3 consistency.

*In converter:*

* To handle output filetypes in Py2, injected a few checks and fixes, particularly around the
  py2 `str.encode` method and what are *assumed* to be its usual equivalents in Py3.
pull/5/head
Cathal Garvey 2015-05-17 21:08:57 +01:00
parent 448aa08bc4
commit 1b47bed306
5 changed files with 169 additions and 25 deletions

View File

@ -20,6 +20,7 @@ from .utils import apply_matrix_pt
from .utils import mult_matrix from .utils import mult_matrix
from .utils import enc from .utils import enc
from .utils import bbox2str from .utils import bbox2str
from . import utils
import six # Python 2+3 compatibility import six # Python 2+3 compatibility
@ -164,8 +165,11 @@ class TextConverter(PDFConverter):
return return
def write_text(self, text): def write_text(self, text):
if self.codec: text = utils.compatible_encode_method(text, self.codec, 'ignore')
text = text.encode(self.codec, 'ignore') # if six.PY2 and self.codec:
# text = text.encode(self.codec, 'ignore')
# if six.PY3 and isinstance(text, bytes):
# text = text.decode(self.codec, 'ignore')
self.outfp.write(text) self.outfp.write(text)
return return

View File

@ -1,11 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
from .utils import mult_matrix
from .utils import translate_matrix
from .utils import enc
from .utils import bbox2str
from .utils import isnumber
from .pdffont import PDFUnicodeNotDefined from .pdffont import PDFUnicodeNotDefined
from . import utils
## PDFDevice ## PDFDevice
## ##
@ -62,7 +58,7 @@ class PDFDevice(object):
class PDFTextDevice(PDFDevice): class PDFTextDevice(PDFDevice):
def render_string(self, textstate, seq): def render_string(self, textstate, seq):
matrix = mult_matrix(textstate.matrix, self.ctm) matrix = utils.mult_matrix(textstate.matrix, self.ctm)
font = textstate.font font = textstate.font
fontsize = textstate.fontsize fontsize = textstate.fontsize
scaling = textstate.scaling * .01 scaling = textstate.scaling * .01
@ -87,14 +83,14 @@ class PDFTextDevice(PDFDevice):
(x, y) = pos (x, y) = pos
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
if isnumber(obj): if utils.isnumber(obj):
x -= obj*dxscale x -= obj*dxscale
needcharspace = True needcharspace = True
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
if needcharspace: if needcharspace:
x += charspace x += charspace
x += self.render_char(translate_matrix(matrix, (x, y)), x += self.render_char(utils.translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid) font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace: if cid == 32 and wordspace:
x += wordspace x += wordspace
@ -106,14 +102,14 @@ class PDFTextDevice(PDFDevice):
(x, y) = pos (x, y) = pos
needcharspace = False needcharspace = False
for obj in seq: for obj in seq:
if isnumber(obj): if utils.isnumber(obj):
y -= obj*dxscale y -= obj*dxscale
needcharspace = True needcharspace = True
else: else:
for cid in font.decode(obj): for cid in font.decode(obj):
if needcharspace: if needcharspace:
y += charspace y += charspace
y += self.render_char(translate_matrix(matrix, (x, y)), y += self.render_char(utils.translate_matrix(matrix, (x, y)),
font, fontsize, scaling, rise, cid) font, fontsize, scaling, rise, cid)
if cid == 32 and wordspace: if cid == 32 and wordspace:
y += wordspace y += wordspace
@ -140,6 +136,7 @@ class TagExtractor(PDFDevice):
font = textstate.font font = textstate.font
text = '' text = ''
for obj in seq: for obj in seq:
obj = utils.make_compat_str(obj)
if not isinstance(obj, str): if not isinstance(obj, str):
continue continue
chars = font.decode(obj) chars = font.decode(obj)
@ -148,33 +145,36 @@ class TagExtractor(PDFDevice):
char = font.to_unichr(cid) char = font.to_unichr(cid)
text += char text += char
except PDFUnicodeNotDefined: except PDFUnicodeNotDefined:
print(chars)
pass pass
self.outfp.write(enc(text, self.codec)) self.outfp.write(utils.enc(text, self.codec))
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' % output = '<page id="%s" bbox="%s" rotate="%d">' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
(self.pageno, bbox2str(page.mediabox), page.rotate)) self.outfp.write(utils.make_compat_bytes(output))
return return
def end_page(self, page): def end_page(self, page):
self.outfp.write('</page>\n') self.outfp.write(utils.make_compat_bytes('</page>\n'))
self.pageno += 1 self.pageno += 1
return return
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
s = '' s = ''
if isinstance(props, dict): if isinstance(props, dict):
s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v) s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
in sorted(props.iteritems())) in sorted(props.iteritems()))
self.outfp.write('<%s%s>' % (enc(tag.name), s)) out_s = '<%s%s>' % (utils.enc(tag.name), s)
self.outfp.write(utils.make_compat_bytes(out_s))
self._stack.append(tag) self._stack.append(tag)
return return
def end_tag(self): def end_tag(self):
assert self._stack assert self._stack
tag = self._stack.pop(-1) tag = self._stack.pop(-1)
self.outfp.write('</%s>' % enc(tag.name)) out_s = '</%s>' % utils.enc(tag.name)
self.outfp.write(utils.make_compat_bytes(out_s))
return return
def do_tag(self, tag, props=None): def do_tag(self, tag, props=None):

View File

@ -3,9 +3,40 @@
Miscellaneous Routines. Miscellaneous Routines.
""" """
import struct import struct
INF=2147483647 #from sys import maxint as INF #doesn't work anymore under Python3, but PDF still uses 32 bits ints # from sys import maxint as INF #doesn't work anymore under Python3,
# but PDF still uses 32 bits ints
INF = (1<<31) - 1
import six #Python 2+3 compatibility import six #Python 2+3 compatibility
import chardet # For str encoding detection in Py3
def make_compat_bytes(in_str):
    """Return *in_str* in the byte-string type of the running interpreter.

    On Python 2 the native ``str`` is already a byte string and is returned
    unchanged. On Python 3 the text is encoded with ``str.encode()``'s
    default codec and the resulting ``bytes`` object is returned.

    The argument must be a native ``str``; anything else trips the assertion.
    """
    assert isinstance(in_str, str)
    return in_str if six.PY2 else in_str.encode()
def make_compat_str(in_str):
    """Return *in_str* as a native ``str``, guessing the encoding if needed.

    On Python 2, and for values that are already ``str`` on Python 3, the
    input is returned unchanged. On Python 3 a ``bytes`` value is decoded
    with the encoding reported by ``chardet.detect``.

    The argument must be ``bytes`` or ``str``; anything else trips the
    assertion.
    """
    assert isinstance(in_str, (bytes, str))
    if six.PY3 and isinstance(in_str, bytes):
        guessed = chardet.detect(in_str).get('encoding')
        if guessed is None:
            # chardet gives no guess for empty or unrecognisable input, and
            # bytes.decode(None) raises TypeError. Latin-1 maps every byte
            # to a code point, so this fallback can never fail.
            guessed = 'latin-1'
        in_str = in_str.decode(guessed)
    return in_str
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
    """Stand-in for call sites that used Py2 ``text.encode(codec, errors)``.

    On Python 2 the value is encoded with *encoding*/*erraction* and the
    resulting byte string is returned. On Python 3 a ``str`` is returned
    unchanged and a ``bytes`` value is decoded with *encoding*/*erraction*,
    so the result is always a native ``str`` there.

    A falsy *encoding* (``None`` or ``''``) means "no codec configured":
    the value is returned untouched instead of failing inside
    ``encode``/``decode``.
    """
    if not encoding:
        return bytesorstring
    if six.PY2:
        # Accept both ``str`` and ``unicode``: unicode text is the usual
        # thing ``.encode`` is called on in Py2, and ``isinstance(u'', str)``
        # is False there.
        assert isinstance(bytesorstring, six.string_types), ("Error: Assumed was calling"
            " encode() on a string in Py2: {}").format(type(bytesorstring))
        return bytesorstring.encode(encoding, erraction)
    if isinstance(bytesorstring, str):
        return bytesorstring
    assert isinstance(bytesorstring, bytes), ("Error: Assumed was calling"
        " encode() on a bytes in Py3: {}").format(type(bytesorstring))
    return bytesorstring.decode(encoding, erraction)
## PNG Predictor ## PNG Predictor
## ##

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
from distutils.core import setup #from distutils.core import setup
from setuptools import setup
from pdfminer import __version__ from pdfminer import __version__
setup( setup(
@ -7,7 +8,7 @@ setup(
version=__version__, version=__version__,
packages=['pdfminer',], packages=['pdfminer',],
package_data={'pdfminer': ['cmap/*.pickle.gz']}, package_data={'pdfminer': ['cmap/*.pickle.gz']},
requires=['six'], requires=['six', 'chardet'],
description='PDF parser and analyzer', description='PDF parser and analyzer',
long_description='''fork of PDFMiner using six for Python 2+3 compatibility long_description='''fork of PDFMiner using six for Python 2+3 compatibility

View File

@ -1,5 +1,11 @@
#!/usr/bin/env python #!/usr/bin/env python
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys import sys
import logging
import six
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -9,11 +15,110 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter from pdfminer.image import ImageWriter
import logging
import six
# main # main
def main(argv):
    """Command-line entry point: convert the given PDF files.

    argv is the full argument vector with the program name first (as in
    ``sys.argv``); everything after the first element is parsed as options.
    Output goes to ``--outfile`` (``-`` means stdout) in the format chosen
    by ``--output_type``, or inferred from the output file's extension when
    the type is left at its default.

    Returns None. Invalid arguments make argparse print usage and exit.
    """
    import argparse
    output_types = ("text", "html", "xml", "tag")
    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("files", type=str, nargs="+", help="Files to process.")
    # NOTE: --debug is parsed but nothing in this function acts on it.
    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.")
    P.add_argument("--page-numbers", type=int, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
    P.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
    P.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF")
    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
    # choices= rejects unknown output types at parse time, before any
    # output file has been opened (and truncated).
    P.add_argument("-t", "--output_type", type=str, default="text", choices=output_types, help="Output type: text|html|xml|tag (default is text)")
    P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    P.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
    P.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
    P.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
    P.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    # Parse the vector we were handed rather than implicitly re-reading
    # sys.argv, so main() honours its argument.
    A = P.parse_args(argv[1:])

    if A.no_laparams:
        laparams = None
    else:
        laparams = LAParams()
        # Only override the LAParams defaults the user explicitly set.
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            param_arg = getattr(A, param, None)
            if param_arg is not None:
                setattr(laparams, param, param_arg)

    # Page numbers are 1-based on the command line and 0-based internally.
    # --page-numbers takes precedence over the legacy --pagenos, as the
    # help text promises.
    if A.page_numbers:
        A.page_numbers = set(x - 1 for x in A.page_numbers)
    elif A.pagenos:
        A.page_numbers = set(int(x) - 1 for x in A.pagenos.split(","))

    imagewriter = None
    if A.output_dir:
        imagewriter = ImageWriter(A.output_dir)

    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)

    # With the default type, infer the format from the output file extension.
    if A.output_type == "text" and A.outfile != "-":
        for override, alttype in ((".htm", "html"),
                                  (".html", "html"),
                                  (".xml", "xml"),
                                  (".tag", "tag")):
            if A.outfile.endswith(override):
                A.output_type = alttype

    if A.outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            A.codec = 'utf-8'
        # The non-text devices write bytes, so on Py3 they need the raw
        # binary buffer underneath the text-mode stdout.
        if six.PY3 and A.output_type != "text":
            outfp = sys.stdout.buffer
    elif six.PY3 and A.output_type == "text":
        # On Py3 the text converter writes str (it is given the text-mode
        # stdout above), so a file target must be a text stream as well;
        # a "wb" handle would raise TypeError on the first write.
        outfp = open(A.outfile, "w", encoding=A.codec, errors="ignore")
    else:
        outfp = open(A.outfile, "wb")

    rsrcmgr = PDFResourceManager(caching=not A.disable_caching)
    if A.output_type == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif A.output_type == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=A.strip_control)
    elif A.output_type == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=A.codec, scale=A.scale,
                               layoutmode=A.layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    else:
        # 'tag' is the only remaining value permitted by choices=.
        device = TagExtractor(rsrcmgr, outfp, codec=A.codec)

    for fname in A.files:
        # The context manager closes the input even if a page fails.
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, A.page_numbers,
                                          maxpages=A.maxpages, password=A.password,
                                          caching=not A.disable_caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + A.rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
def main_old(argv):
import getopt import getopt
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]' print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
@ -98,6 +203,8 @@ def main(argv):
layoutmode=layoutmode, laparams=laparams, layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter)
elif outtype == 'tag': elif outtype == 'tag':
if six.PY3 and outfp == sys.stdout:
outfp = sys.stdout.buffer
device = TagExtractor(rsrcmgr, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: else:
return usage() return usage()
@ -114,4 +221,5 @@ def main(argv):
outfp.close() outfp.close()
return return
#if __name__ == '__main__': sys.exit(main_old(sys.argv))
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))