Storing Bezier path and dashing style of line in LTCurve (#801)
* Fixes #672 and #630 * Add raw points to get full information from Bezier segments and dashing style * Use intermediate variables for constructing transformed_path Co-authored-by: Pieter Marsman <pietermarsman@gmail.com> pull/825/merge
parent
ebf7bcdb98
commit
5114acdda6
|
@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
|
||||
- Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790))
|
||||
- Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829))
|
||||
- Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801))
|
||||
|
||||
### Fixed
|
||||
|
||||
|
|
|
@ -138,6 +138,19 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
]
|
||||
pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts]
|
||||
|
||||
operators = [str(operation[0]) for operation in path]
|
||||
transformed_points = [
|
||||
[
|
||||
apply_matrix_pt(self.ctm, (float(operand1), float(operand2)))
|
||||
for operand1, operand2 in zip(operation[1::2], operation[2::2])
|
||||
]
|
||||
for operation in path
|
||||
]
|
||||
transformed_path = [
|
||||
cast(PathSegment, (o, *p))
|
||||
for o, p in zip(operators, transformed_points)
|
||||
]
|
||||
|
||||
if shape in {"mlh", "ml"}:
|
||||
# single line segment
|
||||
#
|
||||
|
@ -152,6 +165,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
original_path=transformed_path,
|
||||
dashing_style=gstate.dash,
|
||||
)
|
||||
self.cur_item.add(line)
|
||||
|
||||
|
@ -171,6 +186,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
transformed_path,
|
||||
gstate.dash,
|
||||
)
|
||||
self.cur_item.add(rect)
|
||||
else:
|
||||
|
@ -182,9 +199,10 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
transformed_path,
|
||||
gstate.dash,
|
||||
)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
else:
|
||||
curve = LTCurve(
|
||||
gstate.linewidth,
|
||||
|
@ -194,6 +212,8 @@ class PDFLayoutAnalyzer(PDFTextDevice):
|
|||
evenodd,
|
||||
gstate.scolor,
|
||||
gstate.ncolor,
|
||||
transformed_path,
|
||||
gstate.dash,
|
||||
)
|
||||
self.cur_item.add(curve)
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ from .pdffont import PDFFont
|
|||
from .pdfinterp import Color
|
||||
from .pdfinterp import PDFGraphicState
|
||||
from .pdftypes import PDFStream
|
||||
from .utils import INF
|
||||
from .utils import INF, PathSegment
|
||||
from .utils import LTComponentT
|
||||
from .utils import Matrix
|
||||
from .utils import Plane
|
||||
|
@ -210,7 +210,14 @@ class LTComponent(LTItem):
|
|||
|
||||
|
||||
class LTCurve(LTComponent):
|
||||
"""A generic Bezier curve"""
|
||||
"""
|
||||
A generic Bezier curve
|
||||
|
||||
The parameter `original_path` contains the original
|
||||
pathing information from the pdf (e.g. for reconstructing Bezier Curves).
|
||||
|
||||
`dashing_style` contains the Dashing information if any.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -221,6 +228,8 @@ class LTCurve(LTComponent):
|
|||
evenodd: bool = False,
|
||||
stroking_color: Optional[Color] = None,
|
||||
non_stroking_color: Optional[Color] = None,
|
||||
original_path: Optional[List[PathSegment]] = None,
|
||||
dashing_style: Optional[Tuple[object, object]] = None,
|
||||
) -> None:
|
||||
LTComponent.__init__(self, get_bound(pts))
|
||||
self.pts = pts
|
||||
|
@ -230,6 +239,8 @@ class LTCurve(LTComponent):
|
|||
self.evenodd = evenodd
|
||||
self.stroking_color = stroking_color
|
||||
self.non_stroking_color = non_stroking_color
|
||||
self.original_path = original_path
|
||||
self.dashing_style = dashing_style
|
||||
|
||||
def get_pts(self) -> str:
|
||||
return ",".join("%.3f,%.3f" % p for p in self.pts)
|
||||
|
@ -251,6 +262,8 @@ class LTLine(LTCurve):
|
|||
evenodd: bool = False,
|
||||
stroking_color: Optional[Color] = None,
|
||||
non_stroking_color: Optional[Color] = None,
|
||||
original_path: Optional[List[PathSegment]] = None,
|
||||
dashing_style: Optional[Tuple[object, object]] = None,
|
||||
) -> None:
|
||||
LTCurve.__init__(
|
||||
self,
|
||||
|
@ -261,6 +274,8 @@ class LTLine(LTCurve):
|
|||
evenodd,
|
||||
stroking_color,
|
||||
non_stroking_color,
|
||||
original_path,
|
||||
dashing_style,
|
||||
)
|
||||
|
||||
|
||||
|
@ -279,6 +294,8 @@ class LTRect(LTCurve):
|
|||
evenodd: bool = False,
|
||||
stroking_color: Optional[Color] = None,
|
||||
non_stroking_color: Optional[Color] = None,
|
||||
original_path: Optional[List[PathSegment]] = None,
|
||||
dashing_style: Optional[Tuple[object, object]] = None,
|
||||
) -> None:
|
||||
(x0, y0, x1, y1) = bbox
|
||||
LTCurve.__init__(
|
||||
|
@ -290,6 +307,8 @@ class LTRect(LTCurve):
|
|||
evenodd,
|
||||
stroking_color,
|
||||
non_stroking_color,
|
||||
original_path,
|
||||
dashing_style,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -216,6 +216,45 @@ class TestPaintPath:
|
|||
(71.41, 434.89),
|
||||
]
|
||||
|
||||
def test_paint_path_beziers_check_raw(self):
|
||||
"""See section 4.4, table 4.9 of the PDF reference manual"""
|
||||
|
||||
def parse(path):
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||
analyzer.paint_path(PDFGraphicState(), False, False, False, path)
|
||||
return analyzer.cur_item._objs
|
||||
|
||||
# "c" operator
|
||||
assert parse(
|
||||
[
|
||||
("m", 72.41, 433.89),
|
||||
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
|
||||
]
|
||||
)[0].original_path == [
|
||||
("m", (72.41, 433.89)),
|
||||
("c", (72.41, 434.45), (71.96, 434.89), (71.41, 434.89)),
|
||||
]
|
||||
|
||||
def test_paint_path_dashed(self):
|
||||
"""See section 4.4, table 4.9 of the PDF reference manual"""
|
||||
|
||||
def parse(path):
|
||||
analyzer = self._get_analyzer()
|
||||
analyzer.cur_item = LTContainer([0, 1000, 0, 1000])
|
||||
graphicstate = PDFGraphicState()
|
||||
graphicstate.dash = ([1, 1], 0)
|
||||
analyzer.paint_path(graphicstate, False, False, False, path)
|
||||
return analyzer.cur_item._objs
|
||||
|
||||
# "c" operator
|
||||
assert parse(
|
||||
[
|
||||
("m", 72.41, 433.89),
|
||||
("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
|
||||
]
|
||||
)[0].dashing_style == ([1, 1], 0)
|
||||
|
||||
def test_paint_path_without_starting_m(self):
|
||||
gs = PDFGraphicState()
|
||||
analyzer = self._get_analyzer()
|
||||
|
|
Loading…
Reference in New Issue