From 65eb0cef826a241787e34332de7994f940b8c9df Mon Sep 17 00:00:00 2001 From: Healthi Date: Wed, 20 Jun 2018 17:17:03 +0530 Subject: [PATCH 1/5] decode cid: 160 and 170 to spaces --- pdfminer/latin_enc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pdfminer/latin_enc.py b/pdfminer/latin_enc.py index 13886c8..ef0f7e8 100644 --- a/pdfminer/latin_enc.py +++ b/pdfminer/latin_enc.py @@ -213,6 +213,8 @@ ENCODING = [ ('six', 54, 54, 54, 54), ('slash', 47, 47, 47, 47), ('space', 32, 32, 32, 32), + ('space', None, 202, 160, None), + ('space', None, 202, 173, None), ('sterling', 163, 163, 163, 163), ('t', 116, 116, 116, 116), ('thorn', None, None, 254, 254), From 95b65536afcaee37f630c33e4564621db652b806 Mon Sep 17 00:00:00 2001 From: Guglielmetti Philippe Date: Thu, 21 Jun 2018 09:28:55 +0200 Subject: [PATCH 2/5] render_string() now takes 3 parameters, not 5 --- pdfminer/pdfinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index a14f64a..b7a0ca7 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -769,7 +769,7 @@ class PDFPageInterpreter(object): if settings.STRICT: raise PDFInterpreterError('No font specified!') return - self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy()) + self.device.render_string(self.textstate, seq)#, self.ncs, self.graphicstate.copy()) return # show From 70624a64dd060b995443ab741ef9994502bbdc07 Mon Sep 17 00:00:00 2001 From: Guglielmetti Philippe Date: Thu, 21 Jun 2018 09:49:45 +0200 Subject: [PATCH 3/5] render_string() now takes 3 parameters, not 5 (reverted from commit 95b65536afcaee37f630c33e4564621db652b806) --- pdfminer/pdfinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index b7a0ca7..a14f64a 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -769,7 +769,7 @@ class PDFPageInterpreter(object): if settings.STRICT: raise PDFInterpreterError('No font specified!') return - self.device.render_string(self.textstate, seq)#, self.ncs, self.graphicstate.copy()) + self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy()) return # show From 1db260609ecfbf701e260e328c85efc36c03ceb0 Mon Sep 17 00:00:00 2001 From: Goulu Date: Thu, 21 Jun 2018 10:21:26 +0200 Subject: [PATCH 4/5] render_string must have 5 params in all PDFDevice classes (#158) --- pdfminer/pdfdevice.py | 394 +++++++++++++++++++++--------------------- 1 file changed, 197 insertions(+), 197 deletions(-) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index ed54fd2..03a5f0e 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,197 +1,197 @@ -# -*- coding: utf-8 -*- - -import six - -from .pdffont import PDFUnicodeNotDefined - -from . import utils - -## PDFDevice -## -class PDFDevice(object): - - def __init__(self, rsrcmgr): - self.rsrcmgr = rsrcmgr - self.ctm = None - return - - def __repr__(self): - return '' - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - def close(self): - return - - def set_ctm(self, ctm): - self.ctm = ctm - return - - def begin_tag(self, tag, props=None): - return - - def end_tag(self): - return - - def do_tag(self, tag, props=None): - return - - def begin_page(self, page, ctm): - return - - def end_page(self, page): - return - - def begin_figure(self, name, bbox, matrix): - return - - def end_figure(self, name): - return - - def paint_path(self, graphicstate, stroke, fill, evenodd, path): - return - - def render_image(self, name, stream): - return - - def render_string(self, textstate, seq): - return - - -## PDFTextDevice -## -class PDFTextDevice(PDFDevice): - - def render_string(self, textstate, seq, ncs, graphicstate): - matrix = utils.mult_matrix(textstate.matrix, self.ctm) - font = textstate.font - fontsize = textstate.fontsize - scaling = textstate.scaling * .01 - charspace = textstate.charspace * scaling - wordspace = textstate.wordspace * scaling - rise = textstate.rise - if font.is_multibyte(): - wordspace = 0 - dxscale = .001 * fontsize * scaling - if font.is_vertical(): - textstate.linematrix = self.render_string_vertical( - seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) - else: - textstate.linematrix = self.render_string_horizontal( - seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) - return - - def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): - (x, y) = pos - needcharspace = False - for obj in seq: - if utils.isnumber(obj): - x -= obj*dxscale - needcharspace = True - else: - for cid in font.decode(obj): - if needcharspace: - x += charspace - x += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid, - ncs, graphicstate) - if cid == 32 and wordspace: - x += wordspace - needcharspace = True - return (x, y) - - def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): - (x, y) = pos - needcharspace = False - for obj in seq: - if utils.isnumber(obj): - y -= obj*dxscale - needcharspace = True - else: - for cid in font.decode(obj): - if needcharspace: - y += charspace - y += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid, - ncs, graphicstate) - if cid == 32 and wordspace: - y += wordspace - needcharspace = True - return (x, y) - - def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): - return 0 - - -## TagExtractor -## -class TagExtractor(PDFDevice): - - def __init__(self, rsrcmgr, outfp, codec='utf-8'): - PDFDevice.__init__(self, rsrcmgr) - self.outfp = outfp - self.codec = codec - self.pageno = 0 - self._stack = [] - return - - def render_string(self, textstate, seq): - font = textstate.font - text = '' - for obj in seq: - if isinstance(obj, six.text_type): - obj = utils.make_compat_bytes(obj) - if not isinstance(obj, six.binary_type): - continue - chars = font.decode(obj) - for cid in chars: - try: - char = font.to_unichr(cid) - text += char - except PDFUnicodeNotDefined: - print(chars) - pass - self.outfp.write(utils.enc(text, self.codec)) - return - - def begin_page(self, page, ctm): - output = '' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate) - self.outfp.write(utils.make_compat_bytes(output)) - return - - def end_page(self, page): - self.outfp.write(utils.make_compat_bytes('\n')) - self.pageno += 1 - return - - def begin_tag(self, tag, props=None): - s = '' - if isinstance(props, dict): - s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) - in sorted(props.iteritems())) - out_s = '<%s%s>' % (utils.enc(tag.name), s) - self.outfp.write(utils.make_compat_bytes(out_s)) - self._stack.append(tag) - return - - def end_tag(self): - assert self._stack, str(self.pageno) - tag = self._stack.pop(-1) - out_s = '' % utils.enc(tag.name) - self.outfp.write(utils.make_compat_bytes(out_s)) - return - - def do_tag(self, tag, props=None): - self.begin_tag(tag, props) - self._stack.pop(-1) - return +# -*- coding: utf-8 -*- + +import six + +from .pdffont import PDFUnicodeNotDefined + +from . import utils + +## PDFDevice +## +class PDFDevice(object): + + def __init__(self, rsrcmgr): + self.rsrcmgr = rsrcmgr + self.ctm = None + return + + def __repr__(self): + return '' + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + return + + def set_ctm(self, ctm): + self.ctm = ctm + return + + def begin_tag(self, tag, props=None): + return + + def end_tag(self): + return + + def do_tag(self, tag, props=None): + return + + def begin_page(self, page, ctm): + return + + def end_page(self, page): + return + + def begin_figure(self, name, bbox, matrix): + return + + def end_figure(self, name): + return + + def paint_path(self, graphicstate, stroke, fill, evenodd, path): + return + + def render_image(self, name, stream): + return + + def render_string(self, textstate, seq, ncs, graphicstate): + return + + +## PDFTextDevice +## +class PDFTextDevice(PDFDevice): + + def render_string(self, textstate, seq, ncs, graphicstate): + matrix = utils.mult_matrix(textstate.matrix, self.ctm) + font = textstate.font + fontsize = textstate.fontsize + scaling = textstate.scaling * .01 + charspace = textstate.charspace * scaling + wordspace = textstate.wordspace * scaling + rise = textstate.rise + if font.is_multibyte(): + wordspace = 0 + dxscale = .001 * fontsize * scaling + if font.is_vertical(): + textstate.linematrix = self.render_string_vertical( + seq, matrix, textstate.linematrix, font, fontsize, + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) + else: + textstate.linematrix = self.render_string_horizontal( + seq, matrix, textstate.linematrix, font, fontsize, + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) + return + + def render_string_horizontal(self, seq, matrix, pos, + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): + (x, y) = pos + needcharspace = False + for obj in seq: + if utils.isnumber(obj): + x -= obj*dxscale + needcharspace = True + else: + for cid in font.decode(obj): + if needcharspace: + x += charspace + x += self.render_char(utils.translate_matrix(matrix, (x, y)), + font, fontsize, scaling, rise, cid, + ncs, graphicstate) + if cid == 32 and wordspace: + x += wordspace + needcharspace = True + return (x, y) + + def render_string_vertical(self, seq, matrix, pos, + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): + (x, y) = pos + needcharspace = False + for obj in seq: + if utils.isnumber(obj): + y -= obj*dxscale + needcharspace = True + else: + for cid in font.decode(obj): + if needcharspace: + y += charspace + y += self.render_char(utils.translate_matrix(matrix, (x, y)), + font, fontsize, scaling, rise, cid, + ncs, graphicstate) + if cid == 32 and wordspace: + y += wordspace + needcharspace = True + return (x, y) + + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): + return 0 + + +## TagExtractor +## +class TagExtractor(PDFDevice): + + def __init__(self, rsrcmgr, outfp, codec='utf-8'): + PDFDevice.__init__(self, rsrcmgr) + self.outfp = outfp + self.codec = codec + self.pageno = 0 + self._stack = [] + return + + def render_string(self, textstate, seq, ncs, graphicstate): + font = textstate.font + text = '' + for obj in seq: + if isinstance(obj, six.text_type): + obj = utils.make_compat_bytes(obj) + if not isinstance(obj, six.binary_type): + continue + chars = font.decode(obj) + for cid in chars: + try: + char = font.to_unichr(cid) + text += char + except PDFUnicodeNotDefined: + print(chars) + pass + self.outfp.write(utils.enc(text, self.codec)) + return + + def begin_page(self, page, ctm): + output = '' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate) + self.outfp.write(utils.make_compat_bytes(output)) + return + + def end_page(self, page): + self.outfp.write(utils.make_compat_bytes('\n')) + self.pageno += 1 + return + + def begin_tag(self, tag, props=None): + s = '' + if isinstance(props, dict): + s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) + in sorted(props.iteritems())) + out_s = '<%s%s>' % (utils.enc(tag.name), s) + self.outfp.write(utils.make_compat_bytes(out_s)) + self._stack.append(tag) + return + + def end_tag(self): + assert self._stack, str(self.pageno) + tag = self._stack.pop(-1) + out_s = '' % utils.enc(tag.name) + self.outfp.write(utils.make_compat_bytes(out_s)) + return + + def do_tag(self, tag, props=None): + self.begin_tag(tag, props) + self._stack.pop(-1) + return From 7b08cdbff9c0ff92691174f92abd0552bd5c2fba Mon Sep 17 00:00:00 2001 From: Charles Reid Date: Thu, 21 Jun 2018 12:19:48 -0700 Subject: [PATCH 5/5] apply dos2unix to files in pdfminer/ and tools/ to remove \r\n windows line endings --- pdfminer/pdfdevice.py | 394 +++++++++++++++++++++--------------------- tools/pdf2txt.spec | 60 +++---- tools/pdfdiff.py | 234 ++++++++++++------------- tools/pdfdiff.spec | 58 +++---- 4 files changed, 373 insertions(+), 373 deletions(-) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 03a5f0e..0d4c175 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,197 +1,197 @@ -# -*- coding: utf-8 -*- - -import six - -from .pdffont import PDFUnicodeNotDefined - -from . import utils - -## PDFDevice -## -class PDFDevice(object): - - def __init__(self, rsrcmgr): - self.rsrcmgr = rsrcmgr - self.ctm = None - return - - def __repr__(self): - return '' - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() - - def close(self): - return - - def set_ctm(self, ctm): - self.ctm = ctm - return - - def begin_tag(self, tag, props=None): - return - - def end_tag(self): - return - - def do_tag(self, tag, props=None): - return - - def begin_page(self, page, ctm): - return - - def end_page(self, page): - return - - def begin_figure(self, name, bbox, matrix): - return - - def end_figure(self, name): - return - - def paint_path(self, graphicstate, stroke, fill, evenodd, path): - return - - def render_image(self, name, stream): - return - - def render_string(self, textstate, seq, ncs, graphicstate): - return - - -## PDFTextDevice -## -class PDFTextDevice(PDFDevice): - - def render_string(self, textstate, seq, ncs, graphicstate): - matrix = utils.mult_matrix(textstate.matrix, self.ctm) - font = textstate.font - fontsize = textstate.fontsize - scaling = textstate.scaling * .01 - charspace = textstate.charspace * scaling - wordspace = textstate.wordspace * scaling - rise = textstate.rise - if font.is_multibyte(): - wordspace = 0 - dxscale = .001 * fontsize * scaling - if font.is_vertical(): - textstate.linematrix = self.render_string_vertical( - seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) - else: - textstate.linematrix = self.render_string_horizontal( - seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) - return - - def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): - (x, y) = pos - needcharspace = False - for obj in seq: - if utils.isnumber(obj): - x -= obj*dxscale - needcharspace = True - else: - for cid in font.decode(obj): - if needcharspace: - x += charspace - x += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid, - ncs, graphicstate) - if cid == 32 and wordspace: - x += wordspace - needcharspace = True - return (x, y) - - def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, - rise, dxscale, ncs, graphicstate): - (x, y) = pos - needcharspace = False - for obj in seq: - if utils.isnumber(obj): - y -= obj*dxscale - needcharspace = True - else: - for cid in font.decode(obj): - if needcharspace: - y += charspace - y += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid, - ncs, graphicstate) - if cid == 32 and wordspace: - y += wordspace - needcharspace = True - return (x, y) - - def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): - return 0 - - -## TagExtractor -## -class TagExtractor(PDFDevice): - - def __init__(self, rsrcmgr, outfp, codec='utf-8'): - PDFDevice.__init__(self, rsrcmgr) - self.outfp = outfp - self.codec = codec - self.pageno = 0 - self._stack = [] - return - - def render_string(self, textstate, seq, ncs, graphicstate): - font = textstate.font - text = '' - for obj in seq: - if isinstance(obj, six.text_type): - obj = utils.make_compat_bytes(obj) - if not isinstance(obj, six.binary_type): - continue - chars = font.decode(obj) - for cid in chars: - try: - char = font.to_unichr(cid) - text += char - except PDFUnicodeNotDefined: - print(chars) - pass - self.outfp.write(utils.enc(text, self.codec)) - return - - def begin_page(self, page, ctm): - output = '' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate) - self.outfp.write(utils.make_compat_bytes(output)) - return - - def end_page(self, page): - self.outfp.write(utils.make_compat_bytes('\n')) - self.pageno += 1 - return - - def begin_tag(self, tag, props=None): - s = '' - if isinstance(props, dict): - s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) - in sorted(props.iteritems())) - out_s = '<%s%s>' % (utils.enc(tag.name), s) - self.outfp.write(utils.make_compat_bytes(out_s)) - self._stack.append(tag) - return - - def end_tag(self): - assert self._stack, str(self.pageno) - tag = self._stack.pop(-1) - out_s = '' % utils.enc(tag.name) - self.outfp.write(utils.make_compat_bytes(out_s)) - return - - def do_tag(self, tag, props=None): - self.begin_tag(tag, props) - self._stack.pop(-1) - return +# -*- coding: utf-8 -*- + +import six + +from .pdffont import PDFUnicodeNotDefined + +from . import utils + +## PDFDevice +## +class PDFDevice(object): + + def __init__(self, rsrcmgr): + self.rsrcmgr = rsrcmgr + self.ctm = None + return + + def __repr__(self): + return '' + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + def close(self): + return + + def set_ctm(self, ctm): + self.ctm = ctm + return + + def begin_tag(self, tag, props=None): + return + + def end_tag(self): + return + + def do_tag(self, tag, props=None): + return + + def begin_page(self, page, ctm): + return + + def end_page(self, page): + return + + def begin_figure(self, name, bbox, matrix): + return + + def end_figure(self, name): + return + + def paint_path(self, graphicstate, stroke, fill, evenodd, path): + return + + def render_image(self, name, stream): + return + + def render_string(self, textstate, seq, ncs, graphicstate): + return + + +## PDFTextDevice +## +class PDFTextDevice(PDFDevice): + + def render_string(self, textstate, seq, ncs, graphicstate): + matrix = utils.mult_matrix(textstate.matrix, self.ctm) + font = textstate.font + fontsize = textstate.fontsize + scaling = textstate.scaling * .01 + charspace = textstate.charspace * scaling + wordspace = textstate.wordspace * scaling + rise = textstate.rise + if font.is_multibyte(): + wordspace = 0 + dxscale = .001 * fontsize * scaling + if font.is_vertical(): + textstate.linematrix = self.render_string_vertical( + seq, matrix, textstate.linematrix, font, fontsize, + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) + else: + textstate.linematrix = self.render_string_horizontal( + seq, matrix, textstate.linematrix, font, fontsize, + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) + return + + def render_string_horizontal(self, seq, matrix, pos, + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): + (x, y) = pos + needcharspace = False + for obj in seq: + if utils.isnumber(obj): + x -= obj*dxscale + needcharspace = True + else: + for cid in font.decode(obj): + if needcharspace: + x += charspace + x += self.render_char(utils.translate_matrix(matrix, (x, y)), + font, fontsize, scaling, rise, cid, + ncs, graphicstate) + if cid == 32 and wordspace: + x += wordspace + needcharspace = True + return (x, y) + + def render_string_vertical(self, seq, matrix, pos, + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): + (x, y) = pos + needcharspace = False + for obj in seq: + if utils.isnumber(obj): + y -= obj*dxscale + needcharspace = True + else: + for cid in font.decode(obj): + if needcharspace: + y += charspace + y += self.render_char(utils.translate_matrix(matrix, (x, y)), + font, fontsize, scaling, rise, cid, + ncs, graphicstate) + if cid == 32 and wordspace: + y += wordspace + needcharspace = True + return (x, y) + + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): + return 0 + + +## TagExtractor +## +class TagExtractor(PDFDevice): + + def __init__(self, rsrcmgr, outfp, codec='utf-8'): + PDFDevice.__init__(self, rsrcmgr) + self.outfp = outfp + self.codec = codec + self.pageno = 0 + self._stack = [] + return + + def render_string(self, textstate, seq, ncs, graphicstate): + font = textstate.font + text = '' + for obj in seq: + if isinstance(obj, six.text_type): + obj = utils.make_compat_bytes(obj) + if not isinstance(obj, six.binary_type): + continue + chars = font.decode(obj) + for cid in chars: + try: + char = font.to_unichr(cid) + text += char + except PDFUnicodeNotDefined: + print(chars) + pass + self.outfp.write(utils.enc(text, self.codec)) + return + + def begin_page(self, page, ctm): + output = '' % (self.pageno, utils.bbox2str(page.mediabox), page.rotate) + self.outfp.write(utils.make_compat_bytes(output)) + return + + def end_page(self, page): + self.outfp.write(utils.make_compat_bytes('\n')) + self.pageno += 1 + return + + def begin_tag(self, tag, props=None): + s = '' + if isinstance(props, dict): + s = ''.join(' %s="%s"' % (utils.enc(k), utils.enc(str(v))) for (k, v) + in sorted(props.iteritems())) + out_s = '<%s%s>' % (utils.enc(tag.name), s) + self.outfp.write(utils.make_compat_bytes(out_s)) + self._stack.append(tag) + return + + def end_tag(self): + assert self._stack, str(self.pageno) + tag = self._stack.pop(-1) + out_s = '' % utils.enc(tag.name) + self.outfp.write(utils.make_compat_bytes(out_s)) + return + + def do_tag(self, tag, props=None): + self.begin_tag(tag, props) + self._stack.pop(-1) + return diff --git a/tools/pdf2txt.spec b/tools/pdf2txt.spec index 8baeb77..c0073e6 100644 --- a/tools/pdf2txt.spec +++ b/tools/pdf2txt.spec @@ -1,30 +1,30 @@ -# -*- mode: python -*- - -block_cipher = None - - -a = Analysis(['pdf2txt.py'], - pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], - binaries=[], - datas=[], - hiddenimports=[], - hookspath=[], - runtime_hooks=[], - excludes=['django','matplotlib','PIL','numpy','qt5'], - win_no_prefer_redirects=False, - win_private_assemblies=False, - cipher=block_cipher) - -pyz = PYZ(a.pure, a.zipped_data, - cipher=block_cipher) -exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='pdf2txt', - debug=False, - strip=False, - upx=True, - runtime_tmpdir=None, - console=True ) +# -*- mode: python -*- + +block_cipher = None + + +a = Analysis(['pdf2txt.py'], + pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=['django','matplotlib','PIL','numpy','qt5'], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) + +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + name='pdf2txt', + debug=False, + strip=False, + upx=True, + runtime_tmpdir=None, + console=True ) diff --git a/tools/pdfdiff.py b/tools/pdfdiff.py index b01e2f4..f5b8ac4 100644 --- a/tools/pdfdiff.py +++ b/tools/pdfdiff.py @@ -1,117 +1,117 @@ -#!/usr/bin/env python - -""" -compares rwo pdf files. -""" -import sys -import logging -import six -import pdfminer.settings -pdfminer.settings.STRICT = False -import pdfminer.high_level -import pdfminer.layout - -def compare(file1,file2,**args): - if args.get('_py2_no_more_posargs',None) is not None: - raise ValueError("Too many positional arguments passed.") - - - # If any LAParams group arguments were passed, create an LAParams object and - # populate with given args. Otherwise, set it to None. - if args.get('laparams',None) is None: - laparams = pdfminer.layout.LAParams() - for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): - paramv = args.get(param, None) - if paramv is not None: - laparams[param]=paramv - args['laparams']=laparams - - s1=six.StringIO() - with open(file1, "rb") as fp: - pdfminer.high_level.extract_text_to_fp(fp,s1, **args) - - s2=six.StringIO() - with open(file2, "rb") as fp: - pdfminer.high_level.extract_text_to_fp(fp,s2, **args) - - import difflib - s1.seek(0) - s2.seek(0) - s1,s2=s1.readlines(), s2.readlines() - - import os.path - try: - extension = os.path.splitext(args['outfile'])[1][1:4] - if extension.lower()=='htm': - return difflib.HtmlDiff().make_file(s1,s2) - except KeyError: - pass - return difflib.unified_diff(s1,s2,n=args['context_lines']) - - -# main -def main(args=None): - import argparse - P = argparse.ArgumentParser(description=__doc__) - P.add_argument("file1", type=str, default=None, help="File 1 to compare.") - P.add_argument("file2", type=str, default=None, help="File 2 to compare.") - P.add_argument("-o", "--outfile", type=str, default="-", - help="Output file (default/'-' is stdout) \ - if .htm or .html, create an HTML table (or a complete HTML file containing the table) \ - showing a side by side, line by line comparison of text with inter-line \ - and intra-line change highlights. \ - The table can be generated in either full or contextual difference mode." - ) - P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown") - P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") - - # params for pdf2txt - P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") - P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") - P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") - P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs") - P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)") - P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding") - P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale") - P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts") - P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical") - P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin") - P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin") - P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin") - P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow") - P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode") - P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams") - P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation") - P.add_argument("-O", "--output-dir", default=None, help="Output directory for images") - P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching") - P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode") - - - A = P.parse_args(args=args) - - if A.page_numbers: - A.page_numbers = set([x-1 for x in A.page_numbers]) - if A.pagenos: - A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) - - if six.PY2 and sys.stdin.encoding: - A.password = A.password.decode(sys.stdin.encoding) - - if A.output_type == "text" and A.outfile != "-": - for override, alttype in ( (".htm", "html"), - (".html", "html"), - (".xml", "xml" ), - (".tag", "tag" ) ): - if A.outfile.endswith(override): - A.output_type = alttype - - if A.outfile == "-": - outfp = sys.stdout - else: - outfp = open(A.outfile, "w", encoding='utf-8') - outfp.writelines(compare(**vars(A))) - outfp.close() - return 0 - - -if __name__ == '__main__': sys.exit(main()) +#!/usr/bin/env python + +""" +compares rwo pdf files. +""" +import sys +import logging +import six +import pdfminer.settings +pdfminer.settings.STRICT = False +import pdfminer.high_level +import pdfminer.layout + +def compare(file1,file2,**args): + if args.get('_py2_no_more_posargs',None) is not None: + raise ValueError("Too many positional arguments passed.") + + + # If any LAParams group arguments were passed, create an LAParams object and + # populate with given args. Otherwise, set it to None. + if args.get('laparams',None) is None: + laparams = pdfminer.layout.LAParams() + for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): + paramv = args.get(param, None) + if paramv is not None: + laparams[param]=paramv + args['laparams']=laparams + + s1=six.StringIO() + with open(file1, "rb") as fp: + pdfminer.high_level.extract_text_to_fp(fp,s1, **args) + + s2=six.StringIO() + with open(file2, "rb") as fp: + pdfminer.high_level.extract_text_to_fp(fp,s2, **args) + + import difflib + s1.seek(0) + s2.seek(0) + s1,s2=s1.readlines(), s2.readlines() + + import os.path + try: + extension = os.path.splitext(args['outfile'])[1][1:4] + if extension.lower()=='htm': + return difflib.HtmlDiff().make_file(s1,s2) + except KeyError: + pass + return difflib.unified_diff(s1,s2,n=args['context_lines']) + + +# main +def main(args=None): + import argparse + P = argparse.ArgumentParser(description=__doc__) + P.add_argument("file1", type=str, default=None, help="File 1 to compare.") + P.add_argument("file2", type=str, default=None, help="File 2 to compare.") + P.add_argument("-o", "--outfile", type=str, default="-", + help="Output file (default/'-' is stdout) \ + if .htm or .html, create an HTML table (or a complete HTML file containing the table) \ + showing a side by side, line by line comparison of text with inter-line \ + and intra-line change highlights. \ + The table can be generated in either full or contextual difference mode." + ) + P.add_argument("-N", "--context-lines", default=3, type=int, help = "context lines shown") + P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") + + # params for pdf2txt + P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") + P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") + P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") + P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs") + P.add_argument("-t", "--output_type", type=str, default="text", help = "pdf2txt type: text|html|xml|tag (default is text)") + P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding") + P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale") + P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts") + P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical") + P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin") + P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin") + P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin") + P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow") + P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode") + P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams") + P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation") + P.add_argument("-O", "--output-dir", default=None, help="Output directory for images") + P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching") + P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode") + + + A = P.parse_args(args=args) + + if A.page_numbers: + A.page_numbers = set([x-1 for x in A.page_numbers]) + if A.pagenos: + A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) + + if six.PY2 and sys.stdin.encoding: + A.password = A.password.decode(sys.stdin.encoding) + + if A.output_type == "text" and A.outfile != "-": + for override, alttype in ( (".htm", "html"), + (".html", "html"), + (".xml", "xml" ), + (".tag", "tag" ) ): + if A.outfile.endswith(override): + A.output_type = alttype + + if A.outfile == "-": + outfp = sys.stdout + else: + outfp = open(A.outfile, "w", encoding='utf-8') + outfp.writelines(compare(**vars(A))) + outfp.close() + return 0 + + +if __name__ == '__main__': sys.exit(main()) diff --git a/tools/pdfdiff.spec b/tools/pdfdiff.spec index e90a37f..6872b32 100644 --- a/tools/pdfdiff.spec +++ b/tools/pdfdiff.spec @@ -1,29 +1,29 @@ -# -*- mode: python -*- - -block_cipher = None - - -a = Analysis(['pdfdiff.py'], - pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], - binaries=[], - datas=[], - hiddenimports=[], - hookspath=[], - runtime_hooks=[], - excludes=['django','matplotlib','PIL','numpy','qt5'], - win_no_prefer_redirects=False, - win_private_assemblies=False, - cipher=block_cipher) -pyz = PYZ(a.pure, a.zipped_data, - cipher=block_cipher) -exe = EXE(pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='pdfdiff', - debug=False, - strip=False, - upx=True, - runtime_tmpdir=None, - console=True ) +# -*- mode: python -*- + +block_cipher = None + + +a = Analysis(['pdfdiff.py'], + pathex=['C:\\Dev\\Python\\pdfminer.six\\tools'], + binaries=[], + datas=[], + hiddenimports=[], + hookspath=[], + runtime_hooks=[], + excludes=['django','matplotlib','PIL','numpy','qt5'], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher) +pyz = PYZ(a.pure, a.zipped_data, + cipher=block_cipher) +exe = EXE(pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + name='pdfdiff', + debug=False, + strip=False, + upx=True, + runtime_tmpdir=None, + console=True )