Fix .paint_path for non-rectangle quadrilaterals (#512)
* Fix paint_path bug noted in issue #473. Focuses on the handling of non-rect quadrilaterals, the decomposition of complex (m.*h)* paths into subpaths, and assigning those subpaths the correct LTCurve/LTRect type. Also adds a test for the cases presented in issue #473.
* Tweak paint_path fix per @pietermarsman review
  - Adjusts logic to adhere to if-elif-else rather than early returns.
  - Shortens subpath detection/reprocessing step, using re.finditer().
* Reorder paint_path() if-else statements once more.
* Fix flake8 issues.
* Fix error: should select items 1 and 2 from the list, then possibly items [3, 4], and so on.
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/544/head
parent
360b1efc0b
commit
e83dd26671
|
@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
### Fixed
|
||||
- Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475))
|
||||
- Fix `.paint_path` logic for handling non-rect quadrilaterals and decomposing complex paths ([#473](https://github.com/pdfminer/pdfminer.six/issues/473))
|
||||
- Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483))
|
||||
|
||||
### Removed
|
||||
|
|
|
@ -28,8 +28,6 @@ log = logging.getLogger(__name__)
|
|||
|
||||
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||
|
||||
RECTS = re.compile('^(mlllh)+$')
|
||||
|
||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||
PDFTextDevice.__init__(self, rsrcmgr)
|
||||
self.pageno = pageno
|
||||
|
@ -77,45 +75,53 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
||||
shape = ''.join(x[0] for x in path)
|
||||
|
||||
if shape.count('m') > 1:
|
||||
# recurse if there are multiple m's in this shape
|
||||
for m in re.finditer(r'm[^m]+', shape):
|
||||
subpath = path[m.start(0):m.end(0)]
|
||||
self.paint_path(gstate, stroke, fill, evenodd, subpath)
|
||||
|
||||
else:
|
||||
if shape == 'ml':
|
||||
# horizontal/vertical line
|
||||
(_, x0, y0) = path[0]
|
||||
(_, x1, y1) = path[1]
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
# single line segment
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
|
||||
if x0 == x1 or y0 == y1:
|
||||
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(line)
|
||||
|
||||
elif shape == 'mlllh':
|
||||
# rectangle
|
||||
(_, x0, y0) = path[0]
|
||||
(_, x1, y1) = path[1]
|
||||
(_, x2, y2) = path[2]
|
||||
(_, x3, y3) = path[3]
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
||||
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
|
||||
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
|
||||
(x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
|
||||
(x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])
|
||||
|
||||
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
||||
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(rect)
|
||||
|
||||
elif self.RECTS.match(shape):
|
||||
for paths in zip(*(iter(path),) * 5):
|
||||
self.paint_path(gstate, stroke, fill, evenodd, list(paths))
|
||||
else:
|
||||
curve = self._create_curve(gstate, stroke, fill, evenodd,
|
||||
path)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
else:
|
||||
pts = []
|
||||
for p in path:
|
||||
for i in range(1, len(p), 2):
|
||||
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
||||
curve = self._create_curve(gstate, stroke, fill, evenodd, path)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
def _create_curve(self, gstate, stroke, fill, evenodd, path):
|
||||
"""Create a `LTCurve` object for the paint path operator"""
|
||||
pts = [
|
||||
apply_matrix_pt(self.ctm, point)
|
||||
for p in path
|
||||
for point in zip(p[1::2], p[2::2])
|
||||
]
|
||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||
gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(curve)
|
||||
return curve
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||
graphicstate):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from nose.tools import assert_equal
|
||||
|
||||
from pdfminer.converter import PDFLayoutAnalyzer
|
||||
from pdfminer.layout import LTContainer
|
||||
from pdfminer.layout import LTContainer, LTRect, LTCurve
|
||||
from pdfminer.pdfinterp import PDFGraphicState
|
||||
|
||||
|
||||
|
@ -32,6 +32,69 @@ class TestPaintPath():
|
|||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||
assert_equal(len(analyzer.cur_item._objs), 3)
|
||||
|
||||
def test_paint_path_quadrilaterals(self):
|
||||
"""via https://github.com/pdfminer/pdfminer.six/issues/473"""
|
||||
|
||||
def parse(path):
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||
return analyzer.cur_item._objs
|
||||
|
||||
def get_types(path):
|
||||
return list(map(type, parse(path)))
|
||||
|
||||
assert_equal(get_types([
|
||||
("m", 10, 90),
|
||||
("l", 90, 90),
|
||||
("l", 90, 10),
|
||||
("l", 10, 10),
|
||||
("h",),
|
||||
]), [LTRect])
|
||||
|
||||
assert_equal(get_types([
|
||||
("m", 110, 90),
|
||||
("l", 190, 10),
|
||||
("l", 190, 90),
|
||||
("l", 110, 10),
|
||||
("h",),
|
||||
]), [LTCurve])
|
||||
|
||||
assert_equal(get_types([
|
||||
("m", 210, 90),
|
||||
("l", 290, 60),
|
||||
("l", 290, 10),
|
||||
("l", 210, 10),
|
||||
("h",),
|
||||
]), [LTCurve])
|
||||
|
||||
assert_equal(get_types([
|
||||
("m", 310, 90),
|
||||
("l", 350, 90),
|
||||
("l", 350, 10),
|
||||
("l", 310, 10),
|
||||
("h",),
|
||||
("m", 350, 90),
|
||||
("l", 390, 90),
|
||||
("l", 390, 10),
|
||||
("l", 350, 10),
|
||||
("h",),
|
||||
]), [LTRect, LTRect])
|
||||
|
||||
assert_equal(get_types([
|
||||
("m", 410, 90),
|
||||
("l", 445, 90),
|
||||
("l", 445, 10),
|
||||
("l", 410, 10),
|
||||
("h",),
|
||||
("m", 455, 70),
|
||||
("l", 475, 90),
|
||||
("l", 490, 70),
|
||||
("l", 490, 10),
|
||||
("l", 455, 10),
|
||||
("h",),
|
||||
]), [LTRect, LTCurve])
|
||||
|
||||
def _get_analyzer(self):
|
||||
analyzer = PDFLayoutAnalyzer(None)
|
||||
analyzer.set_ctm([1, 0, 0, 1, 0, 0])
|
||||
|
|
Loading…
Reference in New Issue