Fix .paint_path handling of single line segments (#530)
* Fix .paint_path handling of single line segments
- Fixes typo ("ml" should have been "mlh")
- Removes if-statement that required individual line segments to be
strictly horizontal or vertical.
* Treat 'ml'-shape paths as lines not curves
Although 'mlh' is the canonical implementation for a single line
segment, 'ml' is fairly common.
Adds tests and sample PDF.
* Fix trailing whitespace
* Fix point-extraction from Bézier path commands
This commit corrects the manner in which "pts" are extracted from Bézier
path commands. See Table 4.9 of the PDF reference manual, and the new comments
in the code for details. Previously, depending on the command (c, v, or y),
the code was extracting some combination of control points (not on the
curve) and the actual points on the curve.
This commit also refactors .paint_path, so that apply_matrix_pt is only
called in one place, and to treat the "h" command in a manner more
consistent with other path commands.
* Add comments to test_paint_path_quadrilaterals
* Parse rect-forming mllll paths as rects not curves
Now that .paint_path has been refactored, adding support for
rect-forming mllll paths requires no extra code, beyond a minor tweak to
the relevant elif statement.
* One changelog line with ref to mr
* Remove PDFLayoutAnalyzer._create_curve because implementation has become trivial due to refactoring
* Extract variables from if statement to make it easier to read
* Optimize imports order
* Trigger travis build
* Revert "Trigger travis build"
This reverts commit 41c05184
* Update travis badge
* Update travis badge
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/642/head^2
parent
22f90521b8
commit
016239c146
|
@ -3,12 +3,12 @@ All notable changes in pdfminer.six will be documented in this file.
|
|||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Fixed
|
||||
- Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
|
||||
- `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
|
||||
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
pdfminer.six
|
||||
============
|
||||
|
||||
[![Build Status](https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master)](https://travis-ci.org/pdfminer/pdfminer.six)
|
||||
[![Build Status](https://travis-ci.com/pdfminer/pdfminer.six.svg?branch=develop)](https://travis-ci.com/pdfminer/pdfminer.six)
|
||||
[![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/)
|
||||
[![gitter](https://badges.gitter.im/pdfminer-six/Lobby.svg)](https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium)
|
||||
|
||||
|
|
|
@ -2,27 +2,27 @@ import io
|
|||
import logging
|
||||
import re
|
||||
import sys
|
||||
from .pdfdevice import PDFTextDevice
|
||||
from .pdffont import PDFUnicodeNotDefined
|
||||
|
||||
from . import utils
|
||||
from .layout import LTChar
|
||||
from .layout import LTContainer
|
||||
from .layout import LTPage
|
||||
from .layout import LTText
|
||||
from .layout import LTLine
|
||||
from .layout import LTRect
|
||||
from .layout import LTCurve
|
||||
from .layout import LTFigure
|
||||
from .layout import LTImage
|
||||
from .layout import LTChar
|
||||
from .layout import LTTextLine
|
||||
from .layout import LTLine
|
||||
from .layout import LTPage
|
||||
from .layout import LTRect
|
||||
from .layout import LTText
|
||||
from .layout import LTTextBox
|
||||
from .layout import LTTextBoxVertical
|
||||
from .layout import LTTextGroup
|
||||
from .layout import LTTextLine
|
||||
from .pdfdevice import PDFTextDevice
|
||||
from .pdffont import PDFUnicodeNotDefined
|
||||
from .utils import apply_matrix_pt
|
||||
from .utils import mult_matrix
|
||||
from .utils import enc
|
||||
from .utils import bbox2str
|
||||
from . import utils
|
||||
|
||||
from .utils import enc
|
||||
from .utils import mult_matrix
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -84,45 +84,45 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
self.paint_path(gstate, stroke, fill, evenodd, subpath)
|
||||
|
||||
else:
|
||||
if shape == 'ml':
|
||||
# Although the 'h' command does not literally provide a
|
||||
# point-position, its position is (by definition) equal to the
|
||||
# subpath's starting point.
|
||||
#
|
||||
# And, per Section 4.4's Table 4.9, all other path commands place
|
||||
# their point-position in their final two arguments. (Any preceding
|
||||
# arguments represent control points on Bézier curves.)
|
||||
raw_pts = [p[-2:] if p[0] != 'h' else path[0][-2:] for p in path]
|
||||
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
||||
|
||||
if shape in {'mlh', 'ml'}:
|
||||
# single line segment
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
|
||||
if x0 == x1 or y0 == y1:
|
||||
line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke,
|
||||
#
|
||||
# Note: 'ml', in conditional above, is a frequent anomaly
|
||||
# that we want to support.
|
||||
line = LTLine(gstate.linewidth, pts[0], pts[1], stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(line)
|
||||
|
||||
elif shape == 'mlllh':
|
||||
(x0, y0) = apply_matrix_pt(self.ctm, path[0][1:])
|
||||
(x1, y1) = apply_matrix_pt(self.ctm, path[1][1:])
|
||||
(x2, y2) = apply_matrix_pt(self.ctm, path[2][1:])
|
||||
(x3, y3) = apply_matrix_pt(self.ctm, path[3][1:])
|
||||
elif shape in {'mlllh', 'mllll'}:
|
||||
(x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts
|
||||
|
||||
if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \
|
||||
(y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0):
|
||||
rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke,
|
||||
is_closed_loop = (pts[0] == pts[4])
|
||||
has_square_coordinates = \
|
||||
(x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) \
|
||||
or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
|
||||
if is_closed_loop and has_square_coordinates:
|
||||
rect = LTRect(gstate.linewidth, (*pts[0], *pts[2]), stroke,
|
||||
fill, evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(rect)
|
||||
else:
|
||||
curve = self._create_curve(gstate, stroke, fill, evenodd,
|
||||
path)
|
||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill,
|
||||
evenodd, gstate.scolor, gstate.ncolor)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
else:
|
||||
curve = self._create_curve(gstate, stroke, fill, evenodd, path)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
def _create_curve(self, gstate, stroke, fill, evenodd, path):
|
||||
"""Create a `LTCurve` object for the paint path operator"""
|
||||
pts = [
|
||||
apply_matrix_pt(self.ctm, point)
|
||||
for p in path
|
||||
for point in zip(p[1::2], p[2::2])
|
||||
]
|
||||
curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd,
|
||||
gstate.scolor, gstate.ncolor)
|
||||
return curve
|
||||
self.cur_item.add(curve)
|
||||
|
||||
def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs,
|
||||
graphicstate):
|
||||
|
|
Binary file not shown.
|
@ -4,7 +4,8 @@ from tempfile import TemporaryFile
|
|||
from nose.tools import assert_equal, assert_false, assert_true
|
||||
|
||||
from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter
|
||||
from pdfminer.layout import LTContainer, LTRect, LTCurve
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTContainer, LTRect, LTLine, LTCurve
|
||||
from pdfminer.pdfinterp import PDFGraphicState
|
||||
|
||||
|
||||
|
@ -47,6 +48,7 @@ class TestPaintPath():
|
|||
def get_types(path):
|
||||
return list(map(type, parse(path)))
|
||||
|
||||
# Standard rect
|
||||
assert_equal(get_types([
|
||||
("m", 10, 90),
|
||||
("l", 90, 90),
|
||||
|
@ -55,6 +57,16 @@ class TestPaintPath():
|
|||
("h",),
|
||||
]), [LTRect])
|
||||
|
||||
# Same but mllll variation
|
||||
assert_equal(get_types([
|
||||
("m", 10, 90),
|
||||
("l", 90, 90),
|
||||
("l", 90, 10),
|
||||
("l", 10, 10),
|
||||
("l", 10, 90),
|
||||
]), [LTRect])
|
||||
|
||||
# Bowtie shape
|
||||
assert_equal(get_types([
|
||||
("m", 110, 90),
|
||||
("l", 190, 10),
|
||||
|
@ -63,6 +75,7 @@ class TestPaintPath():
|
|||
("h",),
|
||||
]), [LTCurve])
|
||||
|
||||
# Quadrilateral with one slanted side
|
||||
assert_equal(get_types([
|
||||
("m", 210, 90),
|
||||
("l", 290, 60),
|
||||
|
@ -71,6 +84,7 @@ class TestPaintPath():
|
|||
("h",),
|
||||
]), [LTCurve])
|
||||
|
||||
# Path with two rect subpaths
|
||||
assert_equal(get_types([
|
||||
("m", 310, 90),
|
||||
("l", 350, 90),
|
||||
|
@ -84,6 +98,7 @@ class TestPaintPath():
|
|||
("h",),
|
||||
]), [LTRect, LTRect])
|
||||
|
||||
# Path with one rect subpath and one pentagon
|
||||
assert_equal(get_types([
|
||||
("m", 410, 90),
|
||||
("l", 445, 90),
|
||||
|
@ -98,11 +113,82 @@ class TestPaintPath():
|
|||
("h",),
|
||||
]), [LTRect, LTCurve])
|
||||
|
||||
# Three types of simple lines
|
||||
assert_equal(get_types([
|
||||
# Vertical line
|
||||
("m", 10, 30),
|
||||
("l", 10, 40),
|
||||
("h",),
|
||||
# Horizontal line
|
||||
("m", 10, 50),
|
||||
("l", 70, 50),
|
||||
("h",),
|
||||
# Diagonal line
|
||||
("m", 10, 10),
|
||||
("l", 30, 30),
|
||||
("h",),
|
||||
]), [LTLine, LTLine, LTLine])
|
||||
|
||||
# Same as above, but 'ml' variation
|
||||
assert_equal(get_types([
|
||||
# Vertical line
|
||||
("m", 10, 30),
|
||||
("l", 10, 40),
|
||||
# Horizontal line
|
||||
("m", 10, 50),
|
||||
("l", 70, 50),
|
||||
# Diagonal line
|
||||
("m", 10, 10),
|
||||
("l", 30, 30),
|
||||
]), [LTLine, LTLine, LTLine])
|
||||
|
||||
# There are six lines in this one-page PDF;
|
||||
# they all have shape 'ml' not 'mlh'
|
||||
ml_pdf = extract_pages("samples/contrib/pr-00530-ml-lines.pdf")
|
||||
ml_pdf_page = list(ml_pdf)[0]
|
||||
assert sum(type(item) == LTLine for item in ml_pdf_page) == 6
|
||||
|
||||
def _get_analyzer(self):
|
||||
analyzer = PDFLayoutAnalyzer(None)
|
||||
analyzer.set_ctm([1, 0, 0, 1, 0, 0])
|
||||
return analyzer
|
||||
|
||||
def test_paint_path_beziers(self):
|
||||
"""See section 4.4, table 4.9 of the PDF reference manual"""
|
||||
|
||||
def parse(path):
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||
return analyzer.cur_item._objs
|
||||
|
||||
# "c" operator
|
||||
assert parse([
|
||||
("m", 72.41, 433.89),
|
||||
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
|
||||
])[0].pts == [
|
||||
(72.41, 433.89),
|
||||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
# "v" operator
|
||||
assert parse([
|
||||
("m", 72.41, 433.89),
|
||||
("v", 71.96, 434.89, 71.41, 434.89),
|
||||
])[0].pts == [
|
||||
(72.41, 433.89),
|
||||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
# "y" operator
|
||||
assert parse([
|
||||
("m", 72.41, 433.89),
|
||||
("y", 72.41, 434.45, 71.41, 434.89),
|
||||
])[0].pts == [
|
||||
(72.41, 433.89),
|
||||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
|
||||
class TestBinaryDetector():
|
||||
def test_stringio(self):
|
||||
|
|
Loading…
Reference in New Issue