Fix .paint_path for non-rectangle quadrilaterals (#512)

* Fix paint_path bug noted in issue #473

Focuses on the handling of non-rect quadrilaterals, the decomposition of
complex (m.*h)* paths into subpaths, and the assignment of the correct
LTCurve/LTRect type to those subpaths.

Also adds a test for cases presented in issue #473

* Tweak paint_path fix per @pietermarsman review

- Adjusts logic to adhere to if-elif-else rather than early returns.

- Shortens subpath detection/reprocessing step, using re.finditer().

* Reorder paint_path() if-else statements once more

* Fix flake8 issues

* Fix error: should select items 1 and 2 from the list, then possibly items [3, 4], and so on.

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/544/head
Jeremy Singer-Vine 2020-10-12 11:53:00 -04:00 committed by GitHub
parent 360b1efc0b
commit e83dd26671
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 109 additions and 39 deletions

View File

@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475)) - Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475))
- Fix `.paint_path` logic for handling non-rect quadrilaterals and decomposing complex paths ([#473](https://github.com/pdfminer/pdfminer.six/pull/473))
- Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483)) - Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483))
### Removed ### Removed

View File

@ -28,8 +28,6 @@ log = logging.getLogger(__name__)
class PDFLayoutAnalyzer(PDFTextDevice): class PDFLayoutAnalyzer(PDFTextDevice):
RECTS = re.compile('^(mlllh)+$')
def __init__(self, rsrcmgr, pageno=1, laparams=None): def __init__(self, rsrcmgr, pageno=1, laparams=None):
PDFTextDevice.__init__(self, rsrcmgr) PDFTextDevice.__init__(self, rsrcmgr)
self.pageno = pageno self.pageno = pageno
@ -77,45 +75,53 @@ class PDFLayoutAnalyzer(PDFTextDevice):
def paint_path(self, gstate, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
"""Paint paths described in section 4.4 of the PDF reference manual""" """Paint paths described in section 4.4 of the PDF reference manual"""
shape = ''.join(x[0] for x in path) shape = ''.join(x[0] for x in path)
if shape.count('m') > 1:
# recurse if there are multiple m's in this shape
for m in re.finditer(r'm[^m]+', shape):
subpath = path[m.start(0):m.end(0)]
self.paint_path(gstate, stroke, fill, evenodd, subpath)
else:
if shape == 'ml': if shape == 'ml':
# horizontal/vertical line # single line segment
(_, x0, y0) = path[0] (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
(_, x1, y1) = path[1] (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
if x0 == x1 or y0 == y1: if x0 == x1 or y0 == y1:
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke, line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
fill, evenodd, gstate.scolor, gstate.ncolor) fill, evenodd, gstate.scolor, gstate.ncolor)
self.cur_item.add(line) self.cur_item.add(line)
elif shape == 'mlllh': elif shape == 'mlllh':
# rectangle (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
(_, x0, y0) = path[0] (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
(_, x1, y1) = path[1] (x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
(_, x2, y2) = path[2] (x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])
(_, x3, y3) = path[3]
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \ if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0): (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke, rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
fill, evenodd, gstate.scolor, gstate.ncolor) fill, evenodd, gstate.scolor, gstate.ncolor)
self.cur_item.add(rect) self.cur_item.add(rect)
else:
elif self.RECTS.match(shape): curve = self._create_curve(gstate, stroke, fill, evenodd,
for paths in zip(*(iter(path),) * 5): path)
self.paint_path(gstate, stroke, fill, evenodd, list(paths)) self.cur_item.add(curve)
else: else:
pts = [] curve = self._create_curve(gstate, stroke, fill, evenodd, path)
for p in path: self.cur_item.add(curve)
for i in range(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) def _create_curve(self, gstate, stroke, fill, evenodd, path):
"""Create a `LTCurve` object for the paint path operator"""
pts = [
apply_matrix_pt(self.ctm, point)
for p in path
for point in zip(p[1::2], p[2::2])
]
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
gstate.scolor, gstate.ncolor) gstate.scolor, gstate.ncolor)
self.cur_item.add(curve) return curve
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
graphicstate): graphicstate):

View File

@ -1,7 +1,7 @@
from nose.tools import assert_equal from nose.tools import assert_equal
from pdfminer.converter import PDFLayoutAnalyzer from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import LTContainer from pdfminer.layout import LTContainer, LTRect, LTCurve
from pdfminer.pdfinterp import PDFGraphicState from pdfminer.pdfinterp import PDFGraphicState
@ -32,6 +32,69 @@ class TestPaintPath():
analyzer.paint_path(PDFGraphicState(), False, False, False, path) analyzer.paint_path(PDFGraphicState(), False, False, False, path)
assert_equal(len(analyzer.cur_item._objs), 3) assert_equal(len(analyzer.cur_item._objs), 3)
def test_paint_path_quadrilaterals(self):
"""via https://github.com/pdfminer/pdfminer.six/issues/473"""
def parse(path):
analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
return analyzer.cur_item._objs
def get_types(path):
return list(map(type, parse(path)))
assert_equal(get_types([
("m", 10, 90),
("l", 90, 90),
("l", 90, 10),
("l", 10, 10),
("h",),
]), [LTRect])
assert_equal(get_types([
("m", 110, 90),
("l", 190, 10),
("l", 190, 90),
("l", 110, 10),
("h",),
]), [LTCurve])
assert_equal(get_types([
("m", 210, 90),
("l", 290, 60),
("l", 290, 10),
("l", 210, 10),
("h",),
]), [LTCurve])
assert_equal(get_types([
("m", 310, 90),
("l", 350, 90),
("l", 350, 10),
("l", 310, 10),
("h",),
("m", 350, 90),
("l", 390, 90),
("l", 390, 10),
("l", 350, 10),
("h",),
]), [LTRect, LTRect])
assert_equal(get_types([
("m", 410, 90),
("l", 445, 90),
("l", 445, 10),
("l", 410, 10),
("h",),
("m", 455, 70),
("l", 475, 90),
("l", 490, 70),
("l", 490, 10),
("l", 455, 10),
("h",),
]), [LTRect, LTCurve])
def _get_analyzer(self): def _get_analyzer(self):
analyzer = PDFLayoutAnalyzer(None) analyzer = PDFLayoutAnalyzer(None)
analyzer.set_ctm([1, 0, 0, 1, 0, 0]) analyzer.set_ctm([1, 0, 0, 1, 0, 0])