From 0ce9a29f83bb9c87df04f49b5e927d7a6aa4c53c Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 11:23:32 +0400 Subject: [PATCH 01/10] Fix colorspace determinism with OrderedDict --- pdfminer/pdfcolor.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index 6fe6eaa..582e34d 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -1,4 +1,4 @@ - +import collections from .psparser import LIT import six #Python 2+3 compatibility @@ -21,17 +21,16 @@ class PDFColorSpace(object): return '' % (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = {} -for (name, n) in six.iteritems({ - 'CalRGB': 3, - 'CalGray': 1, - 'Lab': 3, - 'DeviceRGB': 3, - 'DeviceCMYK': 4, - 'DeviceGray': 1, - 'Separation': 1, - 'Indexed': 1, - 'Pattern': 1, -}) : +PREDEFINED_COLORSPACE = collections.OrderedDict() +for (name, n) in [ + ('CalRGB', 3), + ('CalGray', 1), + ('Lab', 3), + ('DeviceRGB', 3), + ('DeviceCMYK', 4), + ('DeviceGray', 1), + ('Separation', 1), + ('Indexed', 1), + ('Pattern', 1), +]: PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n) - \ No newline at end of file From b6c63bedc6fcd2294aae60643f41df4acb2ee681 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 11:24:07 +0400 Subject: [PATCH 02/10] Make DeviceGray the default color as it should be --- pdfminer/pdfcolor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index 582e34d..e067f4b 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -23,12 +23,12 @@ class PDFColorSpace(object): PREDEFINED_COLORSPACE = collections.OrderedDict() for (name, n) in [ + ('DeviceGray', 1), # default value first ('CalRGB', 3), ('CalGray', 1), ('Lab', 3), ('DeviceRGB', 3), ('DeviceCMYK', 4), - ('DeviceGray', 1), ('Separation', 1), ('Indexed', 1), ('Pattern', 1), From 2231f0892e0dd0a9af0d69ed73dc4bc8b2823245 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 14:11:31 +0400 Subject: [PATCH 03/10] Send non-stroke color to XML conversion Inspired by https://github.com/euske/pdfminer/pull/158 from @andruo11 and https://github.com/euske/pdfminer/pull/197 from @staccatosound. --- pdfminer/converter.py | 9 +++++---- pdfminer/layout.py | 4 +++- pdfminer/pdfdevice.py | 20 ++++++++++++-------- pdfminer/pdfinterp.py | 6 +++--- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 02545e8..af70348 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -112,7 +112,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, gstate.scolor, gstate.ncolor)) return - def render_char(self, matrix, font, fontsize, scaling, rise, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): try: text = font.to_unichr(cid) assert isinstance(text, six.text_type), str(type(text)) @@ -120,7 +120,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): text = self.handle_undefined_char(font, cid) textwidth = font.char_width(cid) textdisp = font.char_disp(cid) - item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp) + item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate) self.cur_item.add(item) return item.adv @@ -520,8 +520,9 @@ class XMLConverter(PDFConverter): render(child) self.write('\n') elif isinstance(item, LTChar): - self.write('' % - (enc(item.fontname, None), bbox2str(item.bbox), item.size)) + self.write('' % + (enc(item.fontname, None), bbox2str(item.bbox), + item.ncs.name, item.graphicstate.ncolor, item.size)) self.write_text(item.get_text()) self.write('\n') elif isinstance(item, LTText): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 451d4e4..587c221 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -228,11 +228,13 @@ class LTAnno(LTItem, LTText): class LTChar(LTComponent, LTText): def __init__(self, matrix, font, fontsize, scaling, rise, - text, textwidth, textdisp): + text, textwidth, textdisp, ncs, graphicstate): LTText.__init__(self) self._text = text self.matrix = matrix self.fontname = font.fontname + self.ncs = ncs + self.graphicstate = graphicstate self.adv = textwidth * fontsize * scaling # compute the boundary rectangle. if font.is_vertical(): diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 9435101..a9799ed 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -63,7 +63,7 @@ class PDFDevice(object): ## class PDFTextDevice(PDFDevice): - def render_string(self, textstate, seq): + def render_string(self, textstate, seq, ncs, graphicstate): matrix = utils.mult_matrix(textstate.matrix, self.ctm) font = textstate.font fontsize = textstate.fontsize @@ -77,15 +77,16 @@ class PDFTextDevice(PDFDevice): if font.is_vertical(): textstate.linematrix = self.render_string_vertical( seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale) + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) else: textstate.linematrix = self.render_string_horizontal( seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale) + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) return def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): (x, y) = pos needcharspace = False for obj in seq: @@ -97,14 +98,16 @@ class PDFTextDevice(PDFDevice): if needcharspace: x += charspace x += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid) + font, fontsize, scaling, rise, cid, + ncs, graphicstate) if cid == 32 and wordspace: x += wordspace needcharspace = True return (x, y) def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): (x, y) = pos needcharspace = False for obj in seq: @@ -116,13 +119,14 @@ class PDFTextDevice(PDFDevice): if needcharspace: y += charspace y += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid) + font, fontsize, scaling, rise, cid, + ncs, graphicstate) if cid == 32 and wordspace: y += wordspace needcharspace = True return (x, y) - def render_char(self, matrix, font, fontsize, scaling, rise, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): return 0 diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 0c2328d..a14f64a 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -586,13 +586,13 @@ class PDFPageInterpreter(object): # setgray-stroking def do_G(self, gray): - self.graphicstate.color = gray + self.graphicstate.scolor = gray #self.do_CS(LITERAL_DEVICE_GRAY) return # setgray-non-stroking def do_g(self, gray): - self.graphicstate.color = gray + self.graphicstate.ncolor = gray #self.do_cs(LITERAL_DEVICE_GRAY) return @@ -769,7 +769,7 @@ class PDFPageInterpreter(object): if settings.STRICT: raise PDFInterpreterError('No font specified!') return - self.device.render_string(self.textstate, seq) + self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy()) return # show From 94f3d61bb27149a4dd8e09885a6c9d2224124db5 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 14:41:52 +0400 Subject: [PATCH 04/10] converter: Fix XML syntax --- pdfminer/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index af70348..4e51e37 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -520,7 +520,7 @@ class XMLConverter(PDFConverter): render(child) self.write('\n') elif isinstance(item, LTChar): - self.write('' % + self.write('' % (enc(item.fontname, None), bbox2str(item.bbox), item.ncs.name, item.graphicstate.ncolor, item.size)) self.write_text(item.get_text()) From 0911703eba93b18c3063ce47e4afa65cd2885ec3 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 14:53:11 +0400 Subject: [PATCH 05/10] pdfcolor: Fix Python 2.6 compatibility --- pdfminer/pdfcolor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index e067f4b..ba09ba2 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -21,7 +21,11 @@ class PDFColorSpace(object): return '' % (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = collections.OrderedDict() +if six.PY2: + PREDEFINED_COLORSPACE = {} +else: + PREDEFINED_COLORSPACE = collections.OrderedDict() + for (name, n) in [ ('DeviceGray', 1), # default value first ('CalRGB', 3), From 981e3a575e9fbf14122a58151d8264e02a8b5aed Mon Sep 17 00:00:00 2001 From: Tim Bell Date: Tue, 3 Apr 2018 12:47:40 +1000 Subject: [PATCH 06/10] Fix TypeError caused by bug in _parse_comment; #90 #89 #109 --- pdfminer/psparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 9b214af..7a28d84 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -338,7 +338,7 @@ class PSBaseParser(object): m = EOL.search(s, i) if not m: self._curtoken += s[i:] - return (self._parse_comment, len(s)) + return len(s) j = m.start(0) self._curtoken += s[i:j] self._parse1 = self._parse_main From ed7d8308d9c96b08f3d8348929b35bd2fd6cff8c Mon Sep 17 00:00:00 2001 From: Andy Kluger Date: Tue, 3 Apr 2018 12:26:01 -0400 Subject: [PATCH 07/10] -P is *not* for page numbers, but passwords, so reflect that in the help text --- tools/pdf2txt.py | 2 +- tools/pdfdiff.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 1e8ec0b..3111c5c 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -68,7 +68,7 @@ def main(args=None): P = argparse.ArgumentParser(description=__doc__) P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.") P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") - P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.") + P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF") diff --git a/tools/pdfdiff.py b/tools/pdfdiff.py index 17e41e8..b01e2f4 100644 --- a/tools/pdfdiff.py +++ b/tools/pdfdiff.py @@ -66,7 +66,7 @@ def main(args=None): P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") # params for pdf2txt - P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.") + P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry.") P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for both PDFs") From 335c25c0450b8164544677794b9a552f542ecce6 Mon Sep 17 00:00:00 2001 From: Gregory Mori Date: Mon, 9 Apr 2018 12:21:59 -0700 Subject: [PATCH 08/10] only check for bytes input to enc() in python3 In python2, isinstance("", bytes) is true, causing enc() to suppress any string input. This results in fontnames being lost when running pdf2txt.py in python2. As this check was not present in the original python2 version of pdfminer, restrict it to only check when running in python3. --- pdfminer/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 9cbcbb3..0e948c2 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -275,7 +275,7 @@ def decode_text(s): # enc def enc(x, codec='ascii'): """Encodes a string for SGML/XML/HTML""" - if isinstance(x, bytes): + if six.PY3 and isinstance(x, bytes): return '' x = x.replace('&', '&').replace('>', '>').replace('<', '<').replace('"', '"') if codec: From 72d6e930459eb80acf3b0e9c75799745d607adcb Mon Sep 17 00:00:00 2001 From: Tata Ganesh Date: Sun, 10 Jun 2018 19:52:33 +0530 Subject: [PATCH 09/10] FIX: Removed python 2.6 build - python 2.6 is being no longer supported, and some of the PRs are failing the python 2.6 check --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d4107e0..62b15ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: python python: - - "2.6" - "2.7" - "3.4" - "3.5" From ac8bb81c72660a807324f9632e11c012caa1289f Mon Sep 17 00:00:00 2001 From: Tata Ganesh Date: Sun, 17 Jun 2018 22:37:32 +0530 Subject: [PATCH 10/10] DOCS: Update Readme.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f49dbca..2a96278 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,11 @@ How to Install * Install Python 2.7 or newer. (Python 3.x is supported in pdfminer.six) * Install - $ pip install pdfminer.six + `pip install pdfminer.six` * Run the following test: - $ pdf2txt.py samples/simple1.pdf + `pdf2txt.py samples/simple1.pdf` Command Line Tools @@ -78,6 +78,7 @@ TODO * PEP-8 and PEP-257 conformance. * Better documentation. + * Performance improvements. Terms and Conditions