diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a1ead6..3ffbe88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651)) - Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790)) - Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829)) +- Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801)) ### Fixed diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 6b367aa..8e48d86 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -138,6 +138,19 @@ class PDFLayoutAnalyzer(PDFTextDevice): ] pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] + operators = [str(operation[0]) for operation in path] + transformed_points = [ + [ + apply_matrix_pt(self.ctm, (float(operand1), float(operand2))) + for operand1, operand2 in zip(operation[1::2], operation[2::2]) + ] + for operation in path + ] + transformed_path = [ + cast(PathSegment, (o, *p)) + for o, p in zip(operators, transformed_points) + ] + if shape in {"mlh", "ml"}: # single line segment # @@ -152,6 +165,8 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, gstate.scolor, gstate.ncolor, + original_path=transformed_path, + dashing_style=gstate.dash, ) self.cur_item.add(line) @@ -171,6 +186,8 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, gstate.scolor, gstate.ncolor, + transformed_path, + gstate.dash, ) self.cur_item.add(rect) else: @@ -182,9 +199,10 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, gstate.scolor, gstate.ncolor, + transformed_path, + gstate.dash, ) self.cur_item.add(curve) - else: curve = LTCurve( gstate.linewidth, @@ -194,6 +212,8 @@ class PDFLayoutAnalyzer(PDFTextDevice): evenodd, 
gstate.scolor, gstate.ncolor, + transformed_path, + gstate.dash, ) self.cur_item.add(curve) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 5bfe759..a4159e4 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -20,7 +20,7 @@ from .pdffont import PDFFont from .pdfinterp import Color from .pdfinterp import PDFGraphicState from .pdftypes import PDFStream -from .utils import INF +from .utils import INF, PathSegment from .utils import LTComponentT from .utils import Matrix from .utils import Plane @@ -210,7 +210,14 @@ class LTComponent(LTItem): class LTCurve(LTComponent): - """A generic Bezier curve""" + """ + A generic Bezier curve + + The parameter `original_path` contains the original + path information from the PDF (e.g. for reconstructing Bezier curves). + + `dashing_style` contains the dashing information, if any. + """ def __init__( self, @@ -221,6 +228,8 @@ class LTCurve(LTComponent): evenodd: bool = False, stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, + original_path: Optional[List[PathSegment]] = None, + dashing_style: Optional[Tuple[object, object]] = None, ) -> None: LTComponent.__init__(self, get_bound(pts)) self.pts = pts @@ -230,6 +239,8 @@ class LTCurve(LTComponent): self.evenodd = evenodd self.stroking_color = stroking_color self.non_stroking_color = non_stroking_color + self.original_path = original_path + self.dashing_style = dashing_style def get_pts(self) -> str: return ",".join("%.3f,%.3f" % p for p in self.pts) @@ -251,6 +262,8 @@ class LTLine(LTCurve): evenodd: bool = False, stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, + original_path: Optional[List[PathSegment]] = None, + dashing_style: Optional[Tuple[object, object]] = None, ) -> None: LTCurve.__init__( self, @@ -261,6 +274,8 @@ class LTLine(LTCurve): evenodd, stroking_color, non_stroking_color, + original_path, + dashing_style, ) @@ -279,6 +294,8 @@ class LTRect(LTCurve): evenodd: bool = False, 
stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, + original_path: Optional[List[PathSegment]] = None, + dashing_style: Optional[Tuple[object, object]] = None, ) -> None: (x0, y0, x1, y1) = bbox LTCurve.__init__( @@ -290,6 +307,8 @@ class LTRect(LTCurve): evenodd, stroking_color, non_stroking_color, + original_path, + dashing_style, ) diff --git a/tests/test_converter.py b/tests/test_converter.py index 80de019..5bd560e 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -216,6 +216,45 @@ class TestPaintPath: (71.41, 434.89), ] + def test_paint_path_beziers_check_raw(self): + """See section 4.4, table 4.9 of the PDF reference manual""" + + def parse(path): + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 1000, 0, 1000]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + return analyzer.cur_item._objs + + # "c" operator + assert parse( + [ + ("m", 72.41, 433.89), + ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89), + ] + )[0].original_path == [ + ("m", (72.41, 433.89)), + ("c", (72.41, 434.45), (71.96, 434.89), (71.41, 434.89)), + ] + + def test_paint_path_dashed(self): + """Dash pattern is kept; see section 4.4, table 4.9 of the PDF reference""" + + def parse(path): + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 1000, 0, 1000]) + graphicstate = PDFGraphicState() + graphicstate.dash = ([1, 1], 0) + analyzer.paint_path(graphicstate, False, False, False, path) + return analyzer.cur_item._objs + + # "c" operator + assert parse( + [ + ("m", 72.41, 433.89), + ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89), + ] + )[0].dashing_style == ([1, 1], 0) + def test_paint_path_without_starting_m(self): gs = PDFGraphicState() analyzer = self._get_analyzer()