diff --git a/CHANGELOG.md b/CHANGELOG.md index 30d9e11..a385a89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,12 @@ All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - ## [Unreleased] ### Fixed - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529)) - `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469)) +- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530)) ## Removed - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) diff --git a/README.md b/README.md index 3f61ce0..bed07bc 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ pdfminer.six ============ -[![Build Status](https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master)](https://travis-ci.org/pdfminer/pdfminer.six) +[![Build Status](https://travis-ci.com/pdfminer/pdfminer.six.svg?branch=develop)](https://travis-ci.com/pdfminer/pdfminer.six) [![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/) [![gitter](https://badges.gitter.im/pdfminer-six/Lobby.svg)](https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 0c5742c..4b2b62e 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -2,27 +2,27 @@ import io import logging import re import sys -from .pdfdevice import PDFTextDevice -from .pdffont import PDFUnicodeNotDefined + +from . 
import utils +from .layout import LTChar from .layout import LTContainer -from .layout import LTPage -from .layout import LTText -from .layout import LTLine -from .layout import LTRect from .layout import LTCurve from .layout import LTFigure from .layout import LTImage -from .layout import LTChar -from .layout import LTTextLine +from .layout import LTLine +from .layout import LTPage +from .layout import LTRect +from .layout import LTText from .layout import LTTextBox from .layout import LTTextBoxVertical from .layout import LTTextGroup +from .layout import LTTextLine +from .pdfdevice import PDFTextDevice +from .pdffont import PDFUnicodeNotDefined from .utils import apply_matrix_pt -from .utils import mult_matrix -from .utils import enc from .utils import bbox2str -from . import utils - +from .utils import enc +from .utils import mult_matrix log = logging.getLogger(__name__) @@ -84,46 +84,46 @@ class PDFLayoutAnalyzer(PDFTextDevice): self.paint_path(gstate, stroke, fill, evenodd, subpath) else: - if shape == 'ml': + # Although the 'h' command does not literally provide a + # point-position, its position is (by definition) equal to the + # subpath's starting point. + # + # And, per Section 4.4's Table 4.9, all other path commands place + # their point-position in their final two arguments. (Any preceding + # arguments represent control points on Bézier curves.) + raw_pts = [p[-2:] if p[0] != 'h' else path[0][-2:] for p in path] + pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] + + if shape in {'mlh', 'ml'}: # single line segment - (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:]) - (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:]) - if x0 == x1 or y0 == y1: - line = LTLine(gstate.linewidth, (x0, y0), (x1, y1), stroke, - fill, evenodd, gstate.scolor, gstate.ncolor) - self.cur_item.add(line) + # + # Note: 'ml', in conditional above, is a frequent anomaly + # that we want to support. 
+ line = LTLine(gstate.linewidth, pts[0], pts[1], stroke, + fill, evenodd, gstate.scolor, gstate.ncolor) + self.cur_item.add(line) - elif shape == 'mlllh': - (x0, y0) = apply_matrix_pt(self.ctm, path[0][1:]) - (x1, y1) = apply_matrix_pt(self.ctm, path[1][1:]) - (x2, y2) = apply_matrix_pt(self.ctm, path[2][1:]) - (x3, y3) = apply_matrix_pt(self.ctm, path[3][1:]) + elif shape in {'mlllh', 'mllll'}: + (x0, y0), (x1, y1), (x2, y2), (x3, y3), _ = pts - if (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or \ - (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0): - rect = LTRect(gstate.linewidth, (x0, y0, x2, y2), stroke, + is_closed_loop = (pts[0] == pts[4]) + has_square_coordinates = \ + (x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) \ + or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) + if is_closed_loop and has_square_coordinates: + rect = LTRect(gstate.linewidth, (*pts[0], *pts[2]), stroke, fill, evenodd, gstate.scolor, gstate.ncolor) self.cur_item.add(rect) else: - curve = self._create_curve(gstate, stroke, fill, evenodd, - path) + curve = LTCurve(gstate.linewidth, pts, stroke, fill, + evenodd, gstate.scolor, gstate.ncolor) self.cur_item.add(curve) else: - curve = self._create_curve(gstate, stroke, fill, evenodd, path) + curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, + gstate.scolor, gstate.ncolor) self.cur_item.add(curve) - def _create_curve(self, gstate, stroke, fill, evenodd, path): - """Create a `LTCurve` object for the paint path operator""" - pts = [ - apply_matrix_pt(self.ctm, point) - for p in path - for point in zip(p[1::2], p[2::2]) - ] - curve = LTCurve(gstate.linewidth, pts, stroke, fill, evenodd, - gstate.scolor, gstate.ncolor) - return curve - def render_char(self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate): try: diff --git a/samples/contrib/pr-00530-ml-lines.pdf b/samples/contrib/pr-00530-ml-lines.pdf new file mode 100644 index 0000000..7549446 Binary files /dev/null and b/samples/contrib/pr-00530-ml-lines.pdf 
differ diff --git a/tests/test_converter.py b/tests/test_converter.py index cfd1f2a..99d1dc9 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -4,7 +4,8 @@ from tempfile import TemporaryFile from nose.tools import assert_equal, assert_false, assert_true from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter -from pdfminer.layout import LTContainer, LTRect, LTCurve +from pdfminer.high_level import extract_pages +from pdfminer.layout import LTContainer, LTRect, LTLine, LTCurve from pdfminer.pdfinterp import PDFGraphicState @@ -47,6 +48,7 @@ class TestPaintPath(): def get_types(path): return list(map(type, parse(path))) + # Standard rect assert_equal(get_types([ ("m", 10, 90), ("l", 90, 90), @@ -55,6 +57,16 @@ class TestPaintPath(): ("h",), ]), [LTRect]) + # Same but mllll variation + assert_equal(get_types([ + ("m", 10, 90), + ("l", 90, 90), + ("l", 90, 10), + ("l", 10, 10), + ("l", 10, 90), + ]), [LTRect]) + + # Bowtie shape assert_equal(get_types([ ("m", 110, 90), ("l", 190, 10), @@ -63,6 +75,7 @@ class TestPaintPath(): ("h",), ]), [LTCurve]) + # Quadrilateral with one slanted side assert_equal(get_types([ ("m", 210, 90), ("l", 290, 60), @@ -71,6 +84,7 @@ class TestPaintPath(): ("h",), ]), [LTCurve]) + # Path with two rect subpaths assert_equal(get_types([ ("m", 310, 90), ("l", 350, 90), @@ -84,6 +98,7 @@ class TestPaintPath(): ("h",), ]), [LTRect, LTRect]) + # Path with one rect subpath and one pentagon assert_equal(get_types([ ("m", 410, 90), ("l", 445, 90), @@ -98,11 +113,82 @@ class TestPaintPath(): ("h",), ]), [LTRect, LTCurve]) + # Three types of simple lines + assert_equal(get_types([ + # Vertical line + ("m", 10, 30), + ("l", 10, 40), + ("h",), + # Horizontal line + ("m", 10, 50), + ("l", 70, 50), + ("h",), + # Diagonal line + ("m", 10, 10), + ("l", 30, 30), + ("h",), + ]), [LTLine, LTLine, LTLine]) + + # Same as above, but 'ml' variation + assert_equal(get_types([ + # Vertical line + ("m", 10, 30), + ("l", 10, 40), + # 
Horizontal line + ("m", 10, 50), + ("l", 70, 50), + # Diagonal line + ("m", 10, 10), + ("l", 30, 30), + ]), [LTLine, LTLine, LTLine]) + + # There are six lines in this one-page PDF; + # they all have shape 'ml' not 'mlh' + ml_pdf = extract_pages("samples/contrib/pr-00530-ml-lines.pdf") + ml_pdf_page = list(ml_pdf)[0] + assert sum(type(item) == LTLine for item in ml_pdf_page) == 6 + def _get_analyzer(self): analyzer = PDFLayoutAnalyzer(None) analyzer.set_ctm([1, 0, 0, 1, 0, 0]) return analyzer + def test_paint_path_beziers(self): + """See section 4.4, table 4.9 of the PDF reference manual""" + + def parse(path): + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 1000, 0, 1000]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + return analyzer.cur_item._objs + + # "c" operator + assert parse([ + ("m", 72.41, 433.89), + ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89), + ])[0].pts == [ + (72.41, 433.89), + (71.41, 434.89), + ] + + # "v" operator + assert parse([ + ("m", 72.41, 433.89), + ("v", 71.96, 434.89, 71.41, 434.89), + ])[0].pts == [ + (72.41, 433.89), + (71.41, 434.89), + ] + + # "y" operator + assert parse([ + ("m", 72.41, 433.89), + ("y", 72.41, 434.45, 71.41, 434.89), + ])[0].pts == [ + (72.41, 433.89), + (71.41, 434.89), + ] + class TestBinaryDetector(): def test_stringio(self):