Fix .paint_path for non-rectangle quadrilaterals (#512)
* Fix paint_path bug noted in issue #473. Focuses on the handling of non-rect quadrilaterals, the decomposition of complex (m.*h)* paths into subpaths, and assigning those subpaths the correct LTCurve/LTRect type. Also adds a test for the cases presented in issue #473. * Tweak paint_path fix per @pietermarsman review: - Adjusts logic to adhere to if-elif-else rather than early returns. - Shortens subpath detection/reprocessing step, using re.finditer(). * Reorder paint_path() if-else statements once more. * Fix flake8 issues. * Fix error: should select items 1 and 2 from the list, then possibly items [3, 4], and so on. Co-authored-by: Pieter Marsman <pietermarsman@gmail.com> pull/544/head
parent
360b1efc0b
commit
e83dd26671
|
@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475))
|
- Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475))
|
||||||
|
- Fix `.paint_path` logic for handling non-rect quadrilaterals and decomposing complex paths ([#473](https://github.com/pdfminer/pdfminer.six/issues/473))
|
||||||
- Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483))
|
- Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483))
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
|
|
|
@ -28,8 +28,6 @@ log = logging.getLogger(__name__)
|
||||||
|
|
||||||
class PDFLayoutAnalyzer(PDFTextDevice):
|
class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
|
|
||||||
RECTS = re.compile('^(mlllh)+$')
|
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
def __init__(self, rsrcmgr, pageno=1, laparams=None):
|
||||||
PDFTextDevice.__init__(self, rsrcmgr)
|
PDFTextDevice.__init__(self, rsrcmgr)
|
||||||
self.pageno = pageno
|
self.pageno = pageno
|
||||||
|
@ -77,45 +75,53 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||||
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
"""Paint paths described in section 4.4 of the PDF reference manual"""
|
||||||
shape = ''.join(x[0] for x in path)
|
shape = ''.join(x[0] for x in path)
|
||||||
|
|
||||||
|
if shape.count('m') > 1:
|
||||||
|
# recurse if there are multiple m's in this shape
|
||||||
|
for m in re.finditer(r'm[^m]+', shape):
|
||||||
|
subpath = path[m.start(0):m.end(0)]
|
||||||
|
self.paint_path(gstate, stroke, fill, evenodd, subpath)
|
||||||
|
|
||||||
|
else:
|
||||||
if shape == 'ml':
|
if shape == 'ml':
|
||||||
# horizontal/vertical line
|
# single line segment
|
||||||
(_, x0, y0) = path[0]
|
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
|
||||||
(_, x1, y1) = path[1]
|
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
|
||||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
|
||||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
|
||||||
if x0 == x1 or y0 == y1:
|
if x0 == x1 or y0 == y1:
|
||||||
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
|
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
|
||||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||||
self.cur_item.add(line)
|
self.cur_item.add(line)
|
||||||
|
|
||||||
elif shape == 'mlllh':
|
elif shape == 'mlllh':
|
||||||
# rectangle
|
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
|
||||||
(_, x0, y0) = path[0]
|
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
|
||||||
(_, x1, y1) = path[1]
|
(x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
|
||||||
(_, x2, y2) = path[2]
|
(x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])
|
||||||
(_, x3, y3) = path[3]
|
|
||||||
(x0, y0) = apply_matrix_pt(self.ctm, (x0, y0))
|
|
||||||
(x1, y1) = apply_matrix_pt(self.ctm, (x1, y1))
|
|
||||||
(x2, y2) = apply_matrix_pt(self.ctm, (x2, y2))
|
|
||||||
(x3, y3) = apply_matrix_pt(self.ctm, (x3, y3))
|
|
||||||
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
||||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
||||||
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
|
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
|
||||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||||
self.cur_item.add(rect)
|
self.cur_item.add(rect)
|
||||||
|
else:
|
||||||
elif self.RECTS.match(shape):
|
curve = self._create_curve(gstate, stroke, fill, evenodd,
|
||||||
for paths in zip(*(iter(path),) * 5):
|
path)
|
||||||
self.paint_path(gstate, stroke, fill, evenodd, list(paths))
|
self.cur_item.add(curve)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
pts = []
|
curve = self._create_curve(gstate, stroke, fill, evenodd, path)
|
||||||
for p in path:
|
self.cur_item.add(curve)
|
||||||
for i in range(1, len(p), 2):
|
|
||||||
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
|
def _create_curve(self, gstate, stroke, fill, evenodd, path):
|
||||||
|
"""Create a `LTCurve` object for the paint path operator"""
|
||||||
|
pts = [
|
||||||
|
apply_matrix_pt(self.ctm, point)
|
||||||
|
for p in path
|
||||||
|
for point in zip(p[1::2], p[2::2])
|
||||||
|
]
|
||||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||||
gstate.scolor, gstate.ncolor)
|
gstate.scolor, gstate.ncolor)
|
||||||
self.cur_item.add(curve)
|
return curve
|
||||||
|
|
||||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||||
graphicstate):
|
graphicstate):
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from nose.tools import assert_equal
|
from nose.tools import assert_equal
|
||||||
|
|
||||||
from pdfminer.converter import PDFLayoutAnalyzer
|
from pdfminer.converter import PDFLayoutAnalyzer
|
||||||
from pdfminer.layout import LTContainer
|
from pdfminer.layout import LTContainer, LTRect, LTCurve
|
||||||
from pdfminer.pdfinterp import PDFGraphicState
|
from pdfminer.pdfinterp import PDFGraphicState
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,6 +32,69 @@ class TestPaintPath():
|
||||||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||||
assert_equal(len(analyzer.cur_item._objs), 3)
|
assert_equal(len(analyzer.cur_item._objs), 3)
|
||||||
|
|
||||||
|
def test_paint_path_quadrilaterals(self):
|
||||||
|
"""via https://github.com/pdfminer/pdfminer.six/issues/473"""
|
||||||
|
|
||||||
|
def parse(path):
|
||||||
|
analyzer = self._get_analyzer()
|
||||||
|
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||||
|
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||||
|
return analyzer.cur_item._objs
|
||||||
|
|
||||||
|
def get_types(path):
|
||||||
|
return list(map(type, parse(path)))
|
||||||
|
|
||||||
|
assert_equal(get_types([
|
||||||
|
("m", 10, 90),
|
||||||
|
("l", 90, 90),
|
||||||
|
("l", 90, 10),
|
||||||
|
("l", 10, 10),
|
||||||
|
("h",),
|
||||||
|
]), [LTRect])
|
||||||
|
|
||||||
|
assert_equal(get_types([
|
||||||
|
("m", 110, 90),
|
||||||
|
("l", 190, 10),
|
||||||
|
("l", 190, 90),
|
||||||
|
("l", 110, 10),
|
||||||
|
("h",),
|
||||||
|
]), [LTCurve])
|
||||||
|
|
||||||
|
assert_equal(get_types([
|
||||||
|
("m", 210, 90),
|
||||||
|
("l", 290, 60),
|
||||||
|
("l", 290, 10),
|
||||||
|
("l", 210, 10),
|
||||||
|
("h",),
|
||||||
|
]), [LTCurve])
|
||||||
|
|
||||||
|
assert_equal(get_types([
|
||||||
|
("m", 310, 90),
|
||||||
|
("l", 350, 90),
|
||||||
|
("l", 350, 10),
|
||||||
|
("l", 310, 10),
|
||||||
|
("h",),
|
||||||
|
("m", 350, 90),
|
||||||
|
("l", 390, 90),
|
||||||
|
("l", 390, 10),
|
||||||
|
("l", 350, 10),
|
||||||
|
("h",),
|
||||||
|
]), [LTRect, LTRect])
|
||||||
|
|
||||||
|
assert_equal(get_types([
|
||||||
|
("m", 410, 90),
|
||||||
|
("l", 445, 90),
|
||||||
|
("l", 445, 10),
|
||||||
|
("l", 410, 10),
|
||||||
|
("h",),
|
||||||
|
("m", 455, 70),
|
||||||
|
("l", 475, 90),
|
||||||
|
("l", 490, 70),
|
||||||
|
("l", 490, 10),
|
||||||
|
("l", 455, 10),
|
||||||
|
("h",),
|
||||||
|
]), [LTRect, LTCurve])
|
||||||
|
|
||||||
def _get_analyzer(self):
|
def _get_analyzer(self):
|
||||||
analyzer = PDFLayoutAnalyzer(None)
|
analyzer = PDFLayoutAnalyzer(None)
|
||||||
analyzer.set_ctm([1, 0, 0, 1, 0, 0])
|
analyzer.set_ctm([1, 0, 0, 1, 0, 0])
|
||||||
|
|
Loading…
Reference in New Issue