Storing Bezier path and dashing style of line in LTCurve (#801)

* Fixes #672 and #630

Add raw points to get full information from bezier segments and dashing style

* Use intermediate variables for constructing transformed_path

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/825/merge
Julian 2022-11-06 16:50:37 +01:00 committed by GitHub
parent ebf7bcdb98
commit 5114acdda6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 82 additions and 3 deletions

View File

@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651)) - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
- Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790)) - Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790))
- Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829)) - Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829))
- Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801))
### Fixed ### Fixed

View File

@ -138,6 +138,19 @@ class PDFLayoutAnalyzer(PDFTextDevice):
] ]
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
operators = [str(operation[0]) for operation in path]
transformed_points = [
[
apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
for operand1, operand2 in zip(operation[1::2], operation[2::2])
]
for operation in path
]
transformed_path = [
cast(PathSegment, (o, *p))
for o, p in zip(operators, transformed_points)
]
if shape in {"mlh", "ml"}: if shape in {"mlh", "ml"}:
# single line segment # single line segment
# #
@ -152,6 +165,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
evenodd, evenodd,
gstate.scolor, gstate.scolor,
gstate.ncolor, gstate.ncolor,
original_path=transformed_path,
dashing_style=gstate.dash,
) )
self.cur_item.add(line) self.cur_item.add(line)
@ -171,6 +186,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
evenodd, evenodd,
gstate.scolor, gstate.scolor,
gstate.ncolor, gstate.ncolor,
transformed_path,
gstate.dash,
) )
self.cur_item.add(rect) self.cur_item.add(rect)
else: else:
@ -182,9 +199,10 @@ class PDFLayoutAnalyzer(PDFTextDevice):
evenodd, evenodd,
gstate.scolor, gstate.scolor,
gstate.ncolor, gstate.ncolor,
transformed_path,
gstate.dash,
) )
self.cur_item.add(curve) self.cur_item.add(curve)
else: else:
curve = LTCurve( curve = LTCurve(
gstate.linewidth, gstate.linewidth,
@ -194,6 +212,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
evenodd, evenodd,
gstate.scolor, gstate.scolor,
gstate.ncolor, gstate.ncolor,
transformed_path,
gstate.dash,
) )
self.cur_item.add(curve) self.cur_item.add(curve)

View File

@ -20,7 +20,7 @@ from .pdffont import PDFFont
from .pdfinterp import Color from .pdfinterp import Color
from .pdfinterp import PDFGraphicState from .pdfinterp import PDFGraphicState
from .pdftypes import PDFStream from .pdftypes import PDFStream
from .utils import INF from .utils import INF, PathSegment
from .utils import LTComponentT from .utils import LTComponentT
from .utils import Matrix from .utils import Matrix
from .utils import Plane from .utils import Plane
@ -210,7 +210,14 @@ class LTComponent(LTItem):
class LTCurve(LTComponent): class LTCurve(LTComponent):
"""A generic Bezier curve""" """
A generic Bezier curve
The parameter `original_path` contains the original
pathing information from the pdf (e.g. for reconstructing Bezier Curves).
`dashing_style` contains the Dashing information if any.
"""
def __init__( def __init__(
self, self,
@ -221,6 +228,8 @@ class LTCurve(LTComponent):
evenodd: bool = False, evenodd: bool = False,
stroking_color: Optional[Color] = None, stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
) -> None: ) -> None:
LTComponent.__init__(self, get_bound(pts)) LTComponent.__init__(self, get_bound(pts))
self.pts = pts self.pts = pts
@ -230,6 +239,8 @@ class LTCurve(LTComponent):
self.evenodd = evenodd self.evenodd = evenodd
self.stroking_color = stroking_color self.stroking_color = stroking_color
self.non_stroking_color = non_stroking_color self.non_stroking_color = non_stroking_color
self.original_path = original_path
self.dashing_style = dashing_style
def get_pts(self) -> str: def get_pts(self) -> str:
return ",".join("%.3f,%.3f" % p for p in self.pts) return ",".join("%.3f,%.3f" % p for p in self.pts)
@ -251,6 +262,8 @@ class LTLine(LTCurve):
evenodd: bool = False, evenodd: bool = False,
stroking_color: Optional[Color] = None, stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
) -> None: ) -> None:
LTCurve.__init__( LTCurve.__init__(
self, self,
@ -261,6 +274,8 @@ class LTLine(LTCurve):
evenodd, evenodd,
stroking_color, stroking_color,
non_stroking_color, non_stroking_color,
original_path,
dashing_style,
) )
@ -279,6 +294,8 @@ class LTRect(LTCurve):
evenodd: bool = False, evenodd: bool = False,
stroking_color: Optional[Color] = None, stroking_color: Optional[Color] = None,
non_stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None,
original_path: Optional[List[PathSegment]] = None,
dashing_style: Optional[Tuple[object, object]] = None,
) -> None: ) -> None:
(x0, y0, x1, y1) = bbox (x0, y0, x1, y1) = bbox
LTCurve.__init__( LTCurve.__init__(
@ -290,6 +307,8 @@ class LTRect(LTCurve):
evenodd, evenodd,
stroking_color, stroking_color,
non_stroking_color, non_stroking_color,
original_path,
dashing_style,
) )

View File

@ -216,6 +216,45 @@ class TestPaintPath:
(71.41, 434.89), (71.41, 434.89),
] ]
def test_paint_path_beziers_check_raw(self):
    """See section 4.4, table 4.9 of the PDF reference manual.

    Paints a path made of an "m" and a "c" segment and checks that the
    first resulting layout object exposes, via ``original_path``, every
    operator followed by its operands grouped into (x, y) tuples.
    """

    def parse(path):
        # Paint `path` into a fresh container and hand back what was added.
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
        default_state = PDFGraphicState()
        analyzer.paint_path(default_state, False, False, False, path)
        return analyzer.cur_item._objs

    # "c" operator
    bezier_path = [
        ("m", 72.41, 433.89),
        ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
    ]
    expected_path = [
        ("m", (72.41, 433.89)),
        ("c", (72.41, 434.45), (71.96, 434.89), (71.41, 434.89)),
    ]
    first_item = parse(bezier_path)[0]
    assert first_item.original_path == expected_path
def test_paint_path_dashed(self):
    """See section 4.4, table 4.9 of the PDF reference manual.

    Sets ``dash`` on the graphic state before painting and checks that
    the first resulting layout object reports the same value through
    ``dashing_style``.
    """

    def parse(path):
        # Paint `path` with a dash pattern set on the graphic state.
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
        dashed_state = PDFGraphicState()
        dashed_state.dash = ([1, 1], 0)
        analyzer.paint_path(dashed_state, False, False, False, path)
        return analyzer.cur_item._objs

    # "c" operator
    bezier_path = [
        ("m", 72.41, 433.89),
        ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
    ]
    first_item = parse(bezier_path)[0]
    assert first_item.dashing_style == ([1, 1], 0)
def test_paint_path_without_starting_m(self): def test_paint_path_without_starting_m(self):
gs = PDFGraphicState() gs = PDFGraphicState()
analyzer = self._get_analyzer() analyzer = self._get_analyzer()