From 0ce9a29f83bb9c87df04f49b5e927d7a6aa4c53c Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 11:23:32 +0400 Subject: [PATCH 1/5] Fix colorspace determinism with OrderedDict --- pdfminer/pdfcolor.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index 6fe6eaa..582e34d 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -1,4 +1,4 @@ - +import collections from .psparser import LIT import six #Python 2+3 compatibility @@ -21,17 +21,16 @@ class PDFColorSpace(object): return '' % (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = {} -for (name, n) in six.iteritems({ - 'CalRGB': 3, - 'CalGray': 1, - 'Lab': 3, - 'DeviceRGB': 3, - 'DeviceCMYK': 4, - 'DeviceGray': 1, - 'Separation': 1, - 'Indexed': 1, - 'Pattern': 1, -}) : +PREDEFINED_COLORSPACE = collections.OrderedDict() +for (name, n) in [ + ('CalRGB', 3), + ('CalGray', 1), + ('Lab', 3), + ('DeviceRGB', 3), + ('DeviceCMYK', 4), + ('DeviceGray', 1), + ('Separation', 1), + ('Indexed', 1), + ('Pattern', 1), +]: PREDEFINED_COLORSPACE[name]=PDFColorSpace(name, n) - \ No newline at end of file From b6c63bedc6fcd2294aae60643f41df4acb2ee681 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 11:24:07 +0400 Subject: [PATCH 2/5] Make DeviceGray the default color as it should be --- pdfminer/pdfcolor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index 582e34d..e067f4b 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -23,12 +23,12 @@ class PDFColorSpace(object): PREDEFINED_COLORSPACE = collections.OrderedDict() for (name, n) in [ + ('DeviceGray', 1), # default value first ('CalRGB', 3), ('CalGray', 1), ('Lab', 3), ('DeviceRGB', 3), ('DeviceCMYK', 4), - ('DeviceGray', 1), ('Separation', 1), ('Indexed', 1), ('Pattern', 1), From 2231f0892e0dd0a9af0d69ed73dc4bc8b2823245 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 14:11:31 +0400 Subject: [PATCH 3/5] Send non-stroke color to XML conversion Inspired by https://github.com/euske/pdfminer/pull/158 from @andruo11 and https://github.com/euske/pdfminer/pull/197 from @staccatosound. --- pdfminer/converter.py | 9 +++++---- pdfminer/layout.py | 4 +++- pdfminer/pdfdevice.py | 20 ++++++++++++-------- pdfminer/pdfinterp.py | 6 +++--- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 02545e8..af70348 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -112,7 +112,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, gstate.scolor, gstate.ncolor)) return - def render_char(self, matrix, font, fontsize, scaling, rise, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): try: text = font.to_unichr(cid) assert isinstance(text, six.text_type), str(type(text)) @@ -120,7 +120,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): text = self.handle_undefined_char(font, cid) textwidth = font.char_width(cid) textdisp = font.char_disp(cid) - item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp) + item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp, ncs, graphicstate) self.cur_item.add(item) return item.adv @@ -520,8 +520,9 @@ class XMLConverter(PDFConverter): render(child) self.write('\n') elif isinstance(item, LTChar): - self.write('' % - (enc(item.fontname, None), bbox2str(item.bbox), item.size)) + self.write('' % + (enc(item.fontname, None), bbox2str(item.bbox), + item.ncs.name, item.graphicstate.ncolor, item.size)) self.write_text(item.get_text()) self.write('\n') elif isinstance(item, LTText): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 451d4e4..587c221 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -228,11 +228,13 @@ class LTAnno(LTItem, LTText): class LTChar(LTComponent, LTText): def __init__(self, matrix, font, fontsize, scaling, rise, - text, textwidth, textdisp): + text, textwidth, textdisp, ncs, graphicstate): LTText.__init__(self) self._text = text self.matrix = matrix self.fontname = font.fontname + self.ncs = ncs + self.graphicstate = graphicstate self.adv = textwidth * fontsize * scaling # compute the boundary rectangle. if font.is_vertical(): diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 9435101..a9799ed 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -63,7 +63,7 @@ class PDFDevice(object): ## class PDFTextDevice(PDFDevice): - def render_string(self, textstate, seq): + def render_string(self, textstate, seq, ncs, graphicstate): matrix = utils.mult_matrix(textstate.matrix, self.ctm) font = textstate.font fontsize = textstate.fontsize @@ -77,15 +77,16 @@ class PDFTextDevice(PDFDevice): if font.is_vertical(): textstate.linematrix = self.render_string_vertical( seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale) + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) else: textstate.linematrix = self.render_string_horizontal( seq, matrix, textstate.linematrix, font, fontsize, - scaling, charspace, wordspace, rise, dxscale) + scaling, charspace, wordspace, rise, dxscale, ncs, graphicstate) return def render_string_horizontal(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): (x, y) = pos needcharspace = False for obj in seq: @@ -97,14 +98,16 @@ class PDFTextDevice(PDFDevice): if needcharspace: x += charspace x += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid) + font, fontsize, scaling, rise, cid, + ncs, graphicstate) if cid == 32 and wordspace: x += wordspace needcharspace = True return (x, y) def render_string_vertical(self, seq, matrix, pos, - font, fontsize, scaling, charspace, wordspace, rise, dxscale): + font, fontsize, scaling, charspace, wordspace, + rise, dxscale, ncs, graphicstate): (x, y) = pos needcharspace = False for obj in seq: @@ -116,13 +119,14 @@ class PDFTextDevice(PDFDevice): if needcharspace: y += charspace y += self.render_char(utils.translate_matrix(matrix, (x, y)), - font, fontsize, scaling, rise, cid) + font, fontsize, scaling, rise, cid, + ncs, graphicstate) if cid == 32 and wordspace: y += wordspace needcharspace = True return (x, y) - def render_char(self, matrix, font, fontsize, scaling, rise, cid): + def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): return 0 diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 0c2328d..a14f64a 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -586,13 +586,13 @@ class PDFPageInterpreter(object): # setgray-stroking def do_G(self, gray): - self.graphicstate.color = gray + self.graphicstate.scolor = gray #self.do_CS(LITERAL_DEVICE_GRAY) return # setgray-non-stroking def do_g(self, gray): - self.graphicstate.color = gray + self.graphicstate.ncolor = gray #self.do_cs(LITERAL_DEVICE_GRAY) return @@ -769,7 +769,7 @@ class PDFPageInterpreter(object): if settings.STRICT: raise PDFInterpreterError('No font specified!') return - self.device.render_string(self.textstate, seq) + self.device.render_string(self.textstate, seq, self.ncs, self.graphicstate.copy()) return # show From 94f3d61bb27149a4dd8e09885a6c9d2224124db5 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 14:41:52 +0400 Subject: [PATCH 4/5] converter: Fix XML syntax --- pdfminer/converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index af70348..4e51e37 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -520,7 +520,7 @@ class XMLConverter(PDFConverter): render(child) self.write('\n') elif isinstance(item, LTChar): - self.write('' % + self.write('' % (enc(item.fontname, None), bbox2str(item.bbox), item.ncs.name, item.graphicstate.ncolor, item.size)) self.write_text(item.get_text()) From 0911703eba93b18c3063ce47e4afa65cd2885ec3 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 6 Mar 2018 14:53:11 +0400 Subject: [PATCH 5/5] pdfcolor: Fix Python 2.6 compatibility --- pdfminer/pdfcolor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py index e067f4b..ba09ba2 100644 --- a/pdfminer/pdfcolor.py +++ b/pdfminer/pdfcolor.py @@ -21,7 +21,11 @@ class PDFColorSpace(object): return '' % (self.name, self.ncomponents) -PREDEFINED_COLORSPACE = collections.OrderedDict() +if six.PY2: + PREDEFINED_COLORSPACE = {} +else: + PREDEFINED_COLORSPACE = collections.OrderedDict() + for (name, n) in [ ('DeviceGray', 1), # default value first ('CalRGB', 3),