Fix .paint_path handling of single line segments (#530)

* Fix .paint_path handling of single line segments

- Fixes typo ("ml" should have been "mlh")

- Removes if-statement that required individual line segments to be
  strictly horizontal or vertical.

* Treat 'ml'-shape paths as lines not curves

Although 'mlh' is the canonical implementation for a single line
segment, 'ml' is fairly common.

Adds tests and sample PDF.

* Fix trailing whitespace

* Fix point-extraction from Bézier path commands

This commit corrects the manner in which "pts" are extracted from Bézier
path commands. See Table 4.9 of the PDF reference manual, and the new
comments in the code, for details. Previously, depending on the command (c,
v, or y), the code was extracting some combination of control points (not
on the curve) and the actual points on the curve.

This commit also refactors .paint_path, so that apply_matrix_pt is only
called in one place, and to treat the "h" command in a manner more
consistent with other path commands.

* Add comments to test_paint_path_quadrilaterals

* Parse rect-forming mllll paths as rects not curves

Now that .paint_path has been refactored, adding support for
rect-forming mllll paths requires no extra code, beyond a minor tweak to
the relevant elif statement.

* One changelog line with ref to mr

* Remove PDFLayoutAnalyzer._create_curve because implementation has become trivial due to refactoring

* Extract variables from if statement to make it easier to read

* Optimize imports order

* Trigger travis build

* Revert "Trigger travis build"

This reverts commit 41c05184

* Update travis badge

* Update travis badge

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/642/head^2
Jeremy Singer-Vine 2021-07-27 12:27:32 -04:00 committed by GitHub
parent 22f90521b8
commit 016239c146
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 130 additions and 44 deletions

View File

@ -3,12 +3,12 @@ All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
### Fixed
- Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
- `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
## Removed
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))

View File

@ -1,7 +1,7 @@
pdfminer.six
============
[![Build Status](https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master)](https://travis-ci.org/pdfminer/pdfminer.six)
[![Build Status](https://travis-ci.com/pdfminer/pdfminer.six.svg?branch=develop)](https://travis-ci.com/pdfminer/pdfminer.six)
[![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/)
[![gitter](https://badges.gitter.im/pdfminer-six/Lobby.svg)](https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium)

View File

@ -2,27 +2,27 @@ import io
import logging
import re
import sys
from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from . import utils
from .layout import LTChar
from .layout import LTContainer
from .layout import LTPage
from .layout import LTText
from .layout import LTLine
from .layout import LTRect
from .layout import LTCurve
from .layout import LTFigure
from .layout import LTImage
from .layout import LTChar
from .layout import LTTextLine
from .layout import LTLine
from .layout import LTPage
from .layout import LTRect
from .layout import LTText
from .layout import LTTextBox
from .layout import LTTextBoxVertical
from .layout import LTTextGroup
from .layout import LTTextLine
from .pdfdevice import PDFTextDevice
from .pdffont import PDFUnicodeNotDefined
from .utils import apply_matrix_pt
from .utils import mult_matrix
from .utils import enc
from .utils import bbox2str
from . import utils
from .utils import enc
from .utils import mult_matrix
log = logging.getLogger(__name__)
@ -84,46 +84,46 @@ class PDFLayoutAnalyzer(PDFTextDevice):
self.paint_path(gstate, stroke, fill, evenodd, subpath)
else:
if shape == 'ml':
# Although the 'h' command does not literally provide a
# point-position, its position is (by definition) equal to the
# subpath's starting point.
#
# And, per Section 4.4's Table 4.9, all other path commands place
# their point-position in their final two arguments. (Any preceding
# arguments represent control points on Bézier curves.)
raw_pts = [p[-2:] if p[0] != 'h' else path[0][-2:] for p in path]
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
if shape in {'mlh', 'ml'}:
# single line segment
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
if x0 == x1 or y0 == y1:
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
fill, evenodd, gstate.scolor, gstate.ncolor)
self.cur_item.add(line)
#
# Note: 'ml', in conditional above, is a frequent anomaly
# that we want to support.
line = LTLine(gstate.linewidth, pts[0], pts[1], stroke,
fill, evenodd, gstate.scolor, gstate.ncolor)
self.cur_item.add(line)
elif shape == 'mlllh':
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
(x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
(x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])
elif shape in {'mlllh', 'mllll'}:
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
is_closed_loop = (pts[0] == pts[4])
has_square_coordinates = \
(x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) \
or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
if is_closed_loop and has_square_coordinates:
rect = LTRect(gstate.linewidth, (*pts[0], *pts[2]), stroke,
fill, evenodd, gstate.scolor, gstate.ncolor)
self.cur_item.add(rect)
else:
curve = self._create_curve(gstate, stroke, fill, evenodd,
path)
curve = LTCurve(gstate.linewidth, pts, stroke, fill,
evenodd, gstate.scolor, gstate.ncolor)
self.cur_item.add(curve)
else:
curve = self._create_curve(gstate, stroke, fill, evenodd, path)
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
gstate.scolor, gstate.ncolor)
self.cur_item.add(curve)
def _create_curve(self, gstate, stroke, fill, evenodd, path):
"""Create a `LTCurve` object for the paint path operator"""
pts = [
apply_matrix_pt(self.ctm, point)
for p in path
for point in zip(p[1::2], p[2::2])
]
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
gstate.scolor, gstate.ncolor)
return curve
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
graphicstate):
try:

Binary file not shown.

View File

@ -4,7 +4,8 @@ from tempfile import TemporaryFile
from nose.tools import assert_equal, assert_false, assert_true
from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter
from pdfminer.layout import LTContainer, LTRect, LTCurve
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTContainer, LTRect, LTLine, LTCurve
from pdfminer.pdfinterp import PDFGraphicState
@ -47,6 +48,7 @@ class TestPaintPath():
def get_types(path):
return list(map(type, parse(path)))
# Standard rect
assert_equal(get_types([
("m", 10, 90),
("l", 90, 90),
@ -55,6 +57,16 @@ class TestPaintPath():
("h",),
]), [LTRect])
# Same but mllll variation
assert_equal(get_types([
("m", 10, 90),
("l", 90, 90),
("l", 90, 10),
("l", 10, 10),
("l", 10, 90),
]), [LTRect])
# Bowtie shape
assert_equal(get_types([
("m", 110, 90),
("l", 190, 10),
@ -63,6 +75,7 @@ class TestPaintPath():
("h",),
]), [LTCurve])
# Quadrilateral with one slanted side
assert_equal(get_types([
("m", 210, 90),
("l", 290, 60),
@ -71,6 +84,7 @@ class TestPaintPath():
("h",),
]), [LTCurve])
# Path with two rect subpaths
assert_equal(get_types([
("m", 310, 90),
("l", 350, 90),
@ -84,6 +98,7 @@ class TestPaintPath():
("h",),
]), [LTRect, LTRect])
# Path with one rect subpath and one pentagon
assert_equal(get_types([
("m", 410, 90),
("l", 445, 90),
@ -98,11 +113,82 @@ class TestPaintPath():
("h",),
]), [LTRect, LTCurve])
# Three types of simple lines
assert_equal(get_types([
# Vertical line
("m", 10, 30),
("l", 10, 40),
("h",),
# Horizontal line
("m", 10, 50),
("l", 70, 50),
("h",),
# Diagonal line
("m", 10, 10),
("l", 30, 30),
("h",),
]), [LTLine, LTLine, LTLine])
# Same as above, but 'ml' variation
assert_equal(get_types([
# Vertical line
("m", 10, 30),
("l", 10, 40),
# Horizontal line
("m", 10, 50),
("l", 70, 50),
# Diagonal line
("m", 10, 10),
("l", 30, 30),
]), [LTLine, LTLine, LTLine])
# There are six lines in this one-page PDF;
# they all have shape 'ml' not 'mlh'
ml_pdf = extract_pages("samples/contrib/pr-00530-ml-lines.pdf")
ml_pdf_page = list(ml_pdf)[0]
assert sum(type(item) == LTLine for item in ml_pdf_page) == 6
def _get_analyzer(self):
analyzer = PDFLayoutAnalyzer(None)
analyzer.set_ctm([1, 0, 0, 1, 0, 0])
return analyzer
def test_paint_path_beziers(self):
"""See section 4.4, table 4.9 of the PDF reference manual"""
def parse(path):
analyzer = self._get_analyzer()
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
return analyzer.cur_item._objs
# "c" operator
assert parse([
("m", 72.41, 433.89),
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
])[0].pts == [
(72.41, 433.89),
(71.41, 434.89),
]
# "v" operator
assert parse([
("m", 72.41, 433.89),
("v", 71.96, 434.89, 71.41, 434.89),
])[0].pts == [
(72.41, 433.89),
(71.41, 434.89),
]
# "y" operator
assert parse([
("m", 72.41, 433.89),
("y", 72.41, 434.45, 71.41, 434.89),
])[0].pts == [
(72.41, 433.89),
(71.41, 434.89),
]
class TestBinaryDetector():
def test_stringio(self):