Many changes to make pdf2txt.py work better in Py3: some in that script, others in the pdfminer module!
Sorry, these changes should have been more atomic.

*In pdf2txt.py:*
* Rewrote the main function to use argparse instead of getopt.
* Manually tested in Py2/Py3 to get partial consistency.
* Errors abound, including in Tags mode, but most modes weren't working at all in Py3 anyway.
* Py2 mode is *probably* unchanged; cannot find any bugs yet...
* Kept the old main function for posterity, for now.

*In utils:*
* Added a few compatibility functions (some string hacks required chardet, a new dependency):
  - make_compat_bytes(in_str) -> (py3 -> bytes | py2 -> str)
  - make_compat_str(in_str) -> (str)
  - compatible_encode_method(bytesorstring, encoding, erraction) -> (str)

*In pdfdevice:*
* To handle different output filetypes in Py3, injected lots of calls to the new utils functions, as well as some six.PYX checks and logic. These changes are largely responsible for the enhanced Py2/Py3 consistency.

*In converter:*
* To handle output filetypes in Py2, injected a few checks and fixes, particularly around the Py2 `str.encode` method and its *assumed* usual analogues in Py3.

pull/5/head
parent
448aa08bc4
commit
1b47bed306
|
@ -20,6 +20,7 @@ from .utils import apply_matrix_pt
|
||||||
from .utils import mult_matrix
|
from .utils import mult_matrix
|
||||||
from .utils import enc
|
from .utils import enc
|
||||||
from .utils import bbox2str
|
from .utils import bbox2str
|
||||||
|
from . import utils
|
||||||
|
|
||||||
import six # Python 2+3 compatibility
|
import six # Python 2+3 compatibility
|
||||||
|
|
||||||
|
@ -164,8 +165,11 @@ class TextConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
if self.codec:
|
text = utils.compatible_encode_method(text, self.codec, 'ignore')
|
||||||
text = text.encode(self.codec, 'ignore')
|
# if six.PY2 and self.codec:
|
||||||
|
# text = text.encode(self.codec, 'ignore')
|
||||||
|
# if six.PY3 and isinstance(text, bytes):
|
||||||
|
# text = text.decode(self.codec, 'ignore')
|
||||||
self.outfp.write(text)
|
self.outfp.write(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from .utils import mult_matrix
|
|
||||||
from .utils import translate_matrix
|
|
||||||
from .utils import enc
|
|
||||||
from .utils import bbox2str
|
|
||||||
from .utils import isnumber
|
|
||||||
from .pdffont import PDFUnicodeNotDefined
|
from .pdffont import PDFUnicodeNotDefined
|
||||||
|
|
||||||
|
from . import utils
|
||||||
|
|
||||||
## PDFDevice
|
## PDFDevice
|
||||||
##
|
##
|
||||||
|
@ -62,7 +58,7 @@ class PDFDevice(object):
|
||||||
class PDFTextDevice(PDFDevice):
|
class PDFTextDevice(PDFDevice):
|
||||||
|
|
||||||
def render_string(self, textstate, seq):
|
def render_string(self, textstate, seq):
|
||||||
matrix = mult_matrix(textstate.matrix, self.ctm)
|
matrix = utils.mult_matrix(textstate.matrix, self.ctm)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
fontsize = textstate.fontsize
|
fontsize = textstate.fontsize
|
||||||
scaling = textstate.scaling * .01
|
scaling = textstate.scaling * .01
|
||||||
|
@ -87,14 +83,14 @@ class PDFTextDevice(PDFDevice):
|
||||||
(x, y) = pos
|
(x, y) = pos
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if isnumber(obj):
|
if utils.isnumber(obj):
|
||||||
x -= obj*dxscale
|
x -= obj*dxscale
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
else:
|
else:
|
||||||
for cid in font.decode(obj):
|
for cid in font.decode(obj):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
x += charspace
|
x += charspace
|
||||||
x += self.render_char(translate_matrix(matrix, (x, y)),
|
x += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
||||||
font, fontsize, scaling, rise, cid)
|
font, fontsize, scaling, rise, cid)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
x += wordspace
|
x += wordspace
|
||||||
|
@ -106,14 +102,14 @@ class PDFTextDevice(PDFDevice):
|
||||||
(x, y) = pos
|
(x, y) = pos
|
||||||
needcharspace = False
|
needcharspace = False
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
if isnumber(obj):
|
if utils.isnumber(obj):
|
||||||
y -= obj*dxscale
|
y -= obj*dxscale
|
||||||
needcharspace = True
|
needcharspace = True
|
||||||
else:
|
else:
|
||||||
for cid in font.decode(obj):
|
for cid in font.decode(obj):
|
||||||
if needcharspace:
|
if needcharspace:
|
||||||
y += charspace
|
y += charspace
|
||||||
y += self.render_char(translate_matrix(matrix, (x, y)),
|
y += self.render_char(utils.translate_matrix(matrix, (x, y)),
|
||||||
font, fontsize, scaling, rise, cid)
|
font, fontsize, scaling, rise, cid)
|
||||||
if cid == 32 and wordspace:
|
if cid == 32 and wordspace:
|
||||||
y += wordspace
|
y += wordspace
|
||||||
|
@ -140,6 +136,7 @@ class TagExtractor(PDFDevice):
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
text = ''
|
text = ''
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
|
obj = utils.make_compat_str(obj)
|
||||||
if not isinstance(obj, str):
|
if not isinstance(obj, str):
|
||||||
continue
|
continue
|
||||||
chars = font.decode(obj)
|
chars = font.decode(obj)
|
||||||
|
@ -148,33 +145,36 @@ class TagExtractor(PDFDevice):
|
||||||
char = font.to_unichr(cid)
|
char = font.to_unichr(cid)
|
||||||
text += char
|
text += char
|
||||||
except PDFUnicodeNotDefined:
|
except PDFUnicodeNotDefined:
|
||||||
|
print(chars)
|
||||||
pass
|
pass
|
||||||
self.outfp.write(enc(text, self.codec))
|
self.outfp.write(utils.enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page, ctm):
|
||||||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
output = '<page id="%s" bbox="%s" rotate="%d">' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
||||||
(self.pageno, bbox2str(page.mediabox), page.rotate))
|
self.outfp.write(utils.make_compat_bytes(output))
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
self.outfp.write('</page>\n')
|
self.outfp.write(utils.make_compat_bytes('</page>\n'))
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(self, tag, props=None):
|
||||||
s = ''
|
s = ''
|
||||||
if isinstance(props, dict):
|
if isinstance(props, dict):
|
||||||
s = ''.join(' %s="%s"' % (enc(k), enc(str(v))) for (k, v)
|
s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v)
|
||||||
in sorted(props.iteritems()))
|
in sorted(props.iteritems()))
|
||||||
self.outfp.write('<%s%s>' % (enc(tag.name), s))
|
out_s = '<%s%s>' % (utils.enc(tag.name), s)
|
||||||
|
self.outfp.write(utils.make_compat_bytes(out_s))
|
||||||
self._stack.append(tag)
|
self._stack.append(tag)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_tag(self):
|
def end_tag(self):
|
||||||
assert self._stack
|
assert self._stack
|
||||||
tag = self._stack.pop(-1)
|
tag = self._stack.pop(-1)
|
||||||
self.outfp.write('</%s>' % enc(tag.name))
|
out_s = '</%s>' % utils.enc(tag.name)
|
||||||
|
self.outfp.write(utils.make_compat_bytes(out_s))
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
def do_tag(self, tag, props=None):
|
||||||
|
|
|
@ -3,9 +3,40 @@
|
||||||
Miscellaneous Routines.
|
Miscellaneous Routines.
|
||||||
"""
|
"""
|
||||||
import struct
|
import struct
|
||||||
INF=2147483647 #from sys import maxint as INF #doesn't work anymore under Python3, but PDF still uses 32 bits ints
|
# from sys import maxint as INF #doesn't work anymore under Python3,
|
||||||
|
# but PDF still uses 32 bits ints
|
||||||
|
INF = (1<<31) - 1
|
||||||
|
|
||||||
import six #Python 2+3 compatibility
|
import six #Python 2+3 compatibility
|
||||||
|
import chardet # For str encoding detection in Py3
|
||||||
|
|
||||||
|
def make_compat_bytes(in_str):
|
||||||
|
"In Py2, does nothing. In Py3, converts to bytes, encoding to unicode."
|
||||||
|
assert isinstance(in_str, str)
|
||||||
|
if six.PY2:
|
||||||
|
return in_str
|
||||||
|
else:
|
||||||
|
return in_str.encode()
|
||||||
|
|
||||||
|
def make_compat_str(in_str):
|
||||||
|
"In Py2, does nothing. In Py3, converts to string, guessing encoding."
|
||||||
|
assert isinstance(in_str, (bytes, str))
|
||||||
|
if six.PY3 and isinstance(in_str, bytes):
|
||||||
|
enc = chardet.detect(in_str)
|
||||||
|
in_str = in_str.decode(enc['encoding'])
|
||||||
|
return in_str
|
||||||
|
|
||||||
|
def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
|
||||||
|
"When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."
|
||||||
|
if six.PY2:
|
||||||
|
assert isinstance(bytesorstring, str), ("Error: Assumed was calling"
|
||||||
|
" encode() on a string in Py2: {}").format(type(bytesorstring))
|
||||||
|
return bytesorstring.encode(encoding, erraction)
|
||||||
|
if six.PY3:
|
||||||
|
if isinstance(bytesorstring, str): return bytesorstring
|
||||||
|
assert isinstance(bytesorstring, bytes), ("Error: Assumed was calling"
|
||||||
|
" encode() on a bytes in Py3: {}").format(type(bytesorstring))
|
||||||
|
return bytesorstring.decode(encoding, erraction)
|
||||||
|
|
||||||
## PNG Predictor
|
## PNG Predictor
|
||||||
##
|
##
|
||||||
|
|
5
setup.py
5
setup.py
|
@ -1,5 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from distutils.core import setup
|
#from distutils.core import setup
|
||||||
|
from setuptools import setup
|
||||||
from pdfminer import __version__
|
from pdfminer import __version__
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
|
@ -7,7 +8,7 @@ setup(
|
||||||
version=__version__,
|
version=__version__,
|
||||||
packages=['pdfminer',],
|
packages=['pdfminer',],
|
||||||
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
package_data={'pdfminer': ['cmap/*.pickle.gz']},
|
||||||
requires=['six'],
|
requires=['six', 'chardet'],
|
||||||
description='PDF parser and analyzer',
|
description='PDF parser and analyzer',
|
||||||
long_description='''fork of PDFMiner using six for Python 2+3 compatibility
|
long_description='''fork of PDFMiner using six for Python 2+3 compatibility
|
||||||
|
|
||||||
|
|
112
tools/pdf2txt.py
112
tools/pdf2txt.py
|
@ -1,5 +1,11 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
|
||||||
|
"""
|
||||||
import sys
|
import sys
|
||||||
|
import logging
|
||||||
|
import six
|
||||||
|
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
|
@ -9,11 +15,110 @@ from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
||||||
from pdfminer.cmapdb import CMapDB
|
from pdfminer.cmapdb import CMapDB
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
from pdfminer.image import ImageWriter
|
from pdfminer.image import ImageWriter
|
||||||
import logging
|
|
||||||
import six
|
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
import argparse
|
||||||
|
P = argparse.ArgumentParser(description=__doc__)
|
||||||
|
P.add_argument("files", type=str, nargs="+", help="Files to process.")
|
||||||
|
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
|
||||||
|
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
|
||||||
|
P.add_argument("--page-numbers", type=int, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
|
||||||
|
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
|
||||||
|
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
|
||||||
|
# P.add_argument("-o", "--outfile", type=argparse.FileType("w"), default=sys.stdout, help="Output file (default stdout)")
|
||||||
|
P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
|
||||||
|
P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
|
||||||
|
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
|
||||||
|
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
|
||||||
|
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
|
||||||
|
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
|
||||||
|
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
|
||||||
|
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
|
||||||
|
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
|
||||||
|
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
|
||||||
|
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
|
||||||
|
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
|
||||||
|
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
|
||||||
|
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
|
||||||
|
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
|
||||||
|
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
|
||||||
|
A = P.parse_args()
|
||||||
|
|
||||||
|
if A.no_laparams:
|
||||||
|
laparams = None
|
||||||
|
else:
|
||||||
|
laparams = LAParams()
|
||||||
|
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
|
||||||
|
param_arg = getattr(A, param, None)
|
||||||
|
if param_arg is not None:
|
||||||
|
setattr(laparams, param, param_arg)
|
||||||
|
|
||||||
|
if A.page_numbers:
|
||||||
|
A.page_numbers = set([x-1 for x in A.page_numbers])
|
||||||
|
if A.pagenos:
|
||||||
|
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
|
||||||
|
|
||||||
|
imagewriter = None
|
||||||
|
if A.output_dir:
|
||||||
|
imagewriter = ImageWriter(A.output_dir)
|
||||||
|
|
||||||
|
if six.PY2 and sys.stdin.encoding:
|
||||||
|
A.password = A.password.decode(sys.stdin.encoding)
|
||||||
|
|
||||||
|
if A.output_type == "text" and A.outfile != "-":
|
||||||
|
for override, alttype in ( (".htm", "html"),
|
||||||
|
(".html", "html"),
|
||||||
|
(".xml", "xml"),
|
||||||
|
(".tag", "tag") ):
|
||||||
|
if A.outfile.endswith(override):
|
||||||
|
A.output_type = alttype
|
||||||
|
|
||||||
|
if A.outfile == "-":
|
||||||
|
outfp = sys.stdout
|
||||||
|
if outfp.encoding is not None:
|
||||||
|
A.codec = 'utf-8'
|
||||||
|
#A.codec = outfp.encoding
|
||||||
|
else:
|
||||||
|
outfp = open(A.outfile, "wb")
|
||||||
|
|
||||||
|
rsrcmgr = PDFResourceManager(caching=not A.disable_caching)
|
||||||
|
|
||||||
|
if A.output_type == 'text':
|
||||||
|
device = TextConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
|
||||||
|
imagewriter=imagewriter)
|
||||||
|
elif A.output_type == 'xml':
|
||||||
|
if six.PY3 and outfp == sys.stdout:
|
||||||
|
outfp = sys.stdout.buffer
|
||||||
|
device = XMLConverter(rsrcmgr, outfp, codec=A.codec, laparams=laparams,
|
||||||
|
imagewriter=imagewriter,
|
||||||
|
stripcontrol=A.strip_control)
|
||||||
|
elif A.output_type == 'html':
|
||||||
|
if six.PY3 and outfp == sys.stdout:
|
||||||
|
outfp = sys.stdout.buffer
|
||||||
|
device = HTMLConverter(rsrcmgr, outfp, codec=A.codec, scale=A.scale,
|
||||||
|
layoutmode=A.layoutmode, laparams=laparams,
|
||||||
|
imagewriter=imagewriter)
|
||||||
|
elif A.output_type == 'tag':
|
||||||
|
if six.PY3 and outfp == sys.stdout:
|
||||||
|
outfp = sys.stdout.buffer
|
||||||
|
device = TagExtractor(rsrcmgr, outfp, codec=A.codec)
|
||||||
|
else:
|
||||||
|
return usage()
|
||||||
|
for fname in A.files:
|
||||||
|
fp = open(fname, 'rb')
|
||||||
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
|
for page in PDFPage.get_pages(fp, A.page_numbers,
|
||||||
|
maxpages=A.maxpages, password=A.password,
|
||||||
|
caching=not A.disable_caching, check_extractable=True):
|
||||||
|
page.rotate = (page.rotate + A.rotation) % 360
|
||||||
|
interpreter.process_page(page)
|
||||||
|
fp.close()
|
||||||
|
device.close()
|
||||||
|
outfp.close()
|
||||||
|
return
|
||||||
|
|
||||||
|
def main_old(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
|
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
|
||||||
|
@ -98,6 +203,8 @@ def main(argv):
|
||||||
layoutmode=layoutmode, laparams=laparams,
|
layoutmode=layoutmode, laparams=laparams,
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
|
if six.PY3 and outfp == sys.stdout:
|
||||||
|
outfp = sys.stdout.buffer
|
||||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
return usage()
|
return usage()
|
||||||
|
@ -114,4 +221,5 @@ def main(argv):
|
||||||
outfp.close()
|
outfp.close()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
#if __name__ == '__main__': sys.exit(main_old(sys.argv))
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
Loading…
Reference in New Issue