From e83dd26671e5e6962960ee7b55aa4b8ab45e85d5 Mon Sep 17 00:00:00 2001 From: Jeremy Singer-Vine Date: Mon, 12 Oct 2020 11:53:00 -0400 Subject: [PATCH] Fix .paint_path for non-rectangle quadrilaterals (#512) * Fix paint_path bug noted in issue #473 Focuses on the handling of non-rect quadrilaterals, the decomposition of complex (m.*h)* paths into subpaths, and assigning those subpaths the correct LTCurve/LTRect type. Also adds a test for cases presented in issue #473 * Tweak paint_path fix per @pietermarsman review - Adjusts logic to adhere to if-elif-else rather than early returns. - Shortens subpath detection/reprocessing step, using re.finditer(). * Reorder paint_path() if-else statements once more * Fix flake8 issues * Fix error: should select items 1 and 2 from the list, and possibly items [3, 4], and so on. Co-authored-by: Pieter Marsman --- CHANGELOG.md | 1 + pdfminer/converter.py | 82 ++++++++++++++++++++++------------------- tests/test_converter.py | 65 +++++++++++++++++++++++++++++++- 3 files changed, 109 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d77ae73..ddf622d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Fixed - Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475)) +- Fix `.paint_path` logic for handling non-rect quadrilaterals and decomposing complex paths ([#473](https://github.com/pdfminer/pdfminer.six/pull/473)) - Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483)) ### Removed diff --git a/pdfminer/converter.py b/pdfminer/converter.py index dbe034f..dcdb055 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -28,8 +28,6 @@ log = logging.getLogger(__name__) class PDFLayoutAnalyzer(PDFTextDevice): - RECTS = re.compile('^(mlllh)+$') - def __init__(self, rsrcmgr, pageno=1, laparams=None): PDFTextDevice.__init__(self, rsrcmgr) self.pageno = pageno @@ -77,45 +75,53 @@ class PDFLayoutAnalyzer(PDFTextDevice): def paint_path(self, gstate, stroke, fill, evenodd, path): """Paint paths described in section 4.4 of the PDF reference manual""" shape = ''.join(x[0] for x in path) - if shape == 'ml': - # horizontal/vertical line - (_, x0, y0) = path[0] - (_, x1, y1) = path[1] - (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0)) - (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1)) - if x0 == x1 or y0 == y1: - line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke, - fill, evenodd, gstate.scolor, gstate.ncolor) - self.cur_item.add(line) - elif shape == 'mlllh': - # rectangle - (_, x0, y0) = path[0] - (_, x1, y1) = path[1] - (_, x2, y2) = path[2] - (_, x3, y3) = path[3] - (x0, y0) = apply_matrix_pt(self.ctm, (x0, y0)) - (x1, y1) = apply_matrix_pt(self.ctm, (x1, y1)) - (x2, y2) = apply_matrix_pt(self.ctm, (x2, y2)) - (x3, y3) = apply_matrix_pt(self.ctm, (x3, y3)) - if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \ - (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0): - rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke, - fill, evenodd, gstate.scolor, gstate.ncolor) - self.cur_item.add(rect) - - elif self.RECTS.match(shape): - for 
paths in zip(*(iter(path),) * 5): - self.paint_path(gstate, stroke, fill, evenodd, list(paths)) + if shape.count('m') > 1: + # recurse if there are multiple m's in this shape + for m in re.finditer(r'm[^m]+', shape): + subpath = path[m.start(0):m.end(0)] + self.paint_path(gstate, stroke, fill, evenodd, subpath) else: - pts = [] - for p in path: - for i in range(1, len(p), 2): - pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) - curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, - gstate.scolor, gstate.ncolor) - self.cur_item.add(curve) + if shape == 'ml': + # single line segment + (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:]) + (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:]) + if x0 == x1 or y0 == y1: + line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke, + fill, evenodd, gstate.scolor, gstate.ncolor) + self.cur_item.add(line) + + elif shape == 'mlllh': + (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:]) + (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:]) + (x2, y2) = apply_matrix_pt(self.ctm, path[2][1:]) + (x3, y3) = apply_matrix_pt(self.ctm, path[3][1:]) + + if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \ + (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0): + rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke, + fill, evenodd, gstate.scolor, gstate.ncolor) + self.cur_item.add(rect) + else: + curve = self._create_curve(gstate, stroke, fill, evenodd, + path) + self.cur_item.add(curve) + + else: + curve = self._create_curve(gstate, stroke, fill, evenodd, path) + self.cur_item.add(curve) + + def _create_curve(self, gstate, stroke, fill, evenodd, path): + """Create a `LTCurve` object for the paint path operator""" + pts = [ + apply_matrix_pt(self.ctm, point) + for p in path + for point in zip(p[1::2], p[2::2]) + ] + curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, + gstate.scolor, gstate.ncolor) + return curve def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): diff --git 
a/tests/test_converter.py b/tests/test_converter.py index 6cbdfbc..c36a572 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -1,7 +1,7 @@ from nose.tools import assert_equal from pdfminer.converter import PDFLayoutAnalyzer -from pdfminer.layout import LTContainer +from pdfminer.layout import LTContainer, LTRect, LTCurve from pdfminer.pdfinterp import PDFGraphicState @@ -32,6 +32,69 @@ class TestPaintPath(): analyzer.paint_path(PDFGraphicState(), False, False, False, path) assert_equal(len(analyzer.cur_item._objs), 3) + def test_paint_path_quadrilaterals(self): + """via https://github.com/pdfminer/pdfminer.six/issues/473""" + + def parse(path): + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 1000, 0, 1000]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + return analyzer.cur_item._objs + + def get_types(path): + return list(map(type, parse(path))) + + assert_equal(get_types([ + ("m", 10, 90), + ("l", 90, 90), + ("l", 90, 10), + ("l", 10, 10), + ("h",), + ]), [LTRect]) + + assert_equal(get_types([ + ("m", 110, 90), + ("l", 190, 10), + ("l", 190, 90), + ("l", 110, 10), + ("h",), + ]), [LTCurve]) + + assert_equal(get_types([ + ("m", 210, 90), + ("l", 290, 60), + ("l", 290, 10), + ("l", 210, 10), + ("h",), + ]), [LTCurve]) + + assert_equal(get_types([ + ("m", 310, 90), + ("l", 350, 90), + ("l", 350, 10), + ("l", 310, 10), + ("h",), + ("m", 350, 90), + ("l", 390, 90), + ("l", 390, 10), + ("l", 350, 10), + ("h",), + ]), [LTRect, LTRect]) + + assert_equal(get_types([ + ("m", 410, 90), + ("l", 445, 90), + ("l", 445, 10), + ("l", 410, 10), + ("h",), + ("m", 455, 70), + ("l", 475, 90), + ("l", 490, 70), + ("l", 490, 10), + ("l", 455, 10), + ("h",), + ]), [LTRect, LTCurve]) + def _get_analyzer(self): analyzer = PDFLayoutAnalyzer(None) analyzer.set_ctm([1, 0, 0, 1, 0, 0])