2020-10-25 13:37:12 +00:00
|
|
|
import io
|
|
|
|
from tempfile import TemporaryFile
|
2020-07-11 15:34:38 +00:00
|
|
|
|
2020-10-25 13:37:12 +00:00
|
|
|
from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter
|
2021-07-27 16:27:32 +00:00
|
|
|
from pdfminer.high_level import extract_pages
|
|
|
|
from pdfminer.layout import LTContainer, LTRect, LTLine, LTCurve
|
2020-07-11 15:34:38 +00:00
|
|
|
from pdfminer.pdfinterp import PDFGraphicState
|
|
|
|
|
|
|
|
|
2022-02-02 21:24:32 +00:00
|
|
|
class TestPaintPath:
    """Tests for the layout objects ``PDFLayoutAnalyzer.paint_path`` creates."""

    def _get_analyzer(self):
        """Return a ``PDFLayoutAnalyzer`` whose CTM is ``[1, 0, 0, 1, 0, 0]``.

        The analyzer is created with ``None`` as its only constructor
        argument, so it is only suitable for calling ``paint_path`` directly.
        """
        analyzer = PDFLayoutAnalyzer(None)
        analyzer.set_ctm([1, 0, 0, 1, 0, 0])
        return analyzer

    def _paint(self, path, extent):
        """Paint ``path`` into a fresh container and return that container.

        :param path: list of path-operator tuples, e.g. ``("m", 6, 7)``.
        :param extent: upper value used to build the container's bounding
            box as ``[0, extent, 0, extent]``.
        :return: the ``LTContainer`` holding whatever ``paint_path`` added.
        """
        analyzer = self._get_analyzer()
        # A new list is built on every call so no bbox object is shared
        # between tests.
        analyzer.cur_item = LTContainer([0, extent, 0, extent])
        analyzer.paint_path(PDFGraphicState(), False, False, False, path)
        return analyzer.cur_item

    def test_paint_path(self):
        """A single move/line pair yields exactly one layout object."""
        path = [("m", 6, 7), ("l", 7, 7)]
        assert len(self._paint(path, 100)) == 1

    def test_paint_path_mlllh(self):
        """A closed four-point subpath yields exactly one layout object."""
        path = [("m", 6, 7), ("l", 7, 7), ("l", 7, 91), ("l", 6, 91), ("h",)]
        assert len(self._paint(path, 100)) == 1

    def test_paint_path_multiple_mlllh(self):
        """Path from samples/contrib/issue-00369-excel.pdf

        Three closed subpaths in one path must yield three layout objects.
        """
        path = [
            ("m", 6, 7),
            ("l", 7, 7),
            ("l", 7, 91),
            ("l", 6, 91),
            ("h",),
            ("m", 4, 7),
            ("l", 6, 7),
            ("l", 6, 91),
            ("l", 4, 91),
            ("h",),
            ("m", 67, 2),
            ("l", 68, 2),
            ("l", 68, 3),
            ("l", 67, 3),
            ("h",),
        ]
        assert len(self._paint(path, 100)) == 3

    def test_paint_path_quadrilaterals(self):
        """via https://github.com/pdfminer/pdfminer.six/issues/473

        Checks which layout class (``LTRect``, ``LTCurve`` or ``LTLine``)
        is produced for a range of path shapes.
        """

        def get_types(path):
            # Exact classes, in the order the objects were added.
            return [type(obj) for obj in self._paint(path, 1000)]

        # Standard rect
        assert get_types(
            [
                ("m", 10, 90),
                ("l", 90, 90),
                ("l", 90, 10),
                ("l", 10, 10),
                ("h",),
            ]
        ) == [LTRect]

        # Same but mllll variation
        assert get_types(
            [
                ("m", 10, 90),
                ("l", 90, 90),
                ("l", 90, 10),
                ("l", 10, 10),
                ("l", 10, 90),
            ]
        ) == [LTRect]

        # Bowtie shape
        assert get_types(
            [
                ("m", 110, 90),
                ("l", 190, 10),
                ("l", 190, 90),
                ("l", 110, 10),
                ("h",),
            ]
        ) == [LTCurve]

        # Quadrilateral with one slanted side
        assert get_types(
            [
                ("m", 210, 90),
                ("l", 290, 60),
                ("l", 290, 10),
                ("l", 210, 10),
                ("h",),
            ]
        ) == [LTCurve]

        # Path with two rect subpaths
        assert get_types(
            [
                ("m", 310, 90),
                ("l", 350, 90),
                ("l", 350, 10),
                ("l", 310, 10),
                ("h",),
                ("m", 350, 90),
                ("l", 390, 90),
                ("l", 390, 10),
                ("l", 350, 10),
                ("h",),
            ]
        ) == [LTRect, LTRect]

        # Path with one rect subpath and one pentagon
        assert get_types(
            [
                ("m", 410, 90),
                ("l", 445, 90),
                ("l", 445, 10),
                ("l", 410, 10),
                ("h",),
                ("m", 455, 70),
                ("l", 475, 90),
                ("l", 490, 70),
                ("l", 490, 10),
                ("l", 455, 10),
                ("h",),
            ]
        ) == [LTRect, LTCurve]

        # Three types of simple lines
        assert get_types(
            [
                # Vertical line
                ("m", 10, 30),
                ("l", 10, 40),
                ("h",),
                # Horizontal line
                ("m", 10, 50),
                ("l", 70, 50),
                ("h",),
                # Diagonal line
                ("m", 10, 10),
                ("l", 30, 30),
                ("h",),
            ]
        ) == [LTLine, LTLine, LTLine]

        # Same as above, but 'ml' variation
        assert get_types(
            [
                # Vertical line
                ("m", 10, 30),
                ("l", 10, 40),
                # Horizontal line
                ("m", 10, 50),
                ("l", 70, 50),
                # Diagonal line
                ("m", 10, 10),
                ("l", 30, 30),
            ]
        ) == [LTLine, LTLine, LTLine]

        # There are six lines in this one-page PDF;
        # they all have shape 'ml' not 'mlh'
        # NOTE: the sample path is relative to the current working directory.
        ml_pdf = extract_pages("samples/contrib/pr-00530-ml-lines.pdf")
        ml_pdf_page = list(ml_pdf)[0]
        # Exact-type check on purpose: only objects that are precisely LTLine
        # are counted.
        assert sum(type(item) is LTLine for item in ml_pdf_page) == 6

    def test_paint_path_beziers(self):
        """See section 4.4, table 4.9 of the PDF reference manual

        For each curve operator the resulting object's ``pts`` is expected
        to hold only the starting point and the final end point.
        """

        def first_pts(path):
            # ``pts`` of the first object painted for ``path``.
            return list(self._paint(path, 1000))[0].pts

        # "c" operator
        assert first_pts(
            [
                ("m", 72.41, 433.89),
                ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
            ]
        ) == [
            (72.41, 433.89),
            (71.41, 434.89),
        ]

        # "v" operator
        assert first_pts(
            [("m", 72.41, 433.89), ("v", 71.96, 434.89, 71.41, 434.89)]
        ) == [
            (72.41, 433.89),
            (71.41, 434.89),
        ]

        # "y" operator
        assert first_pts(
            [("m", 72.41, 433.89), ("y", 72.41, 434.45, 71.41, 434.89)]
        ) == [
            (72.41, 433.89),
            (71.41, 434.89),
        ]
|
|
|
|
|
2020-10-25 13:37:12 +00:00
|
|
|
|
2022-02-11 21:46:51 +00:00
|
|
|
class TestBinaryDetector:
    """Tests for ``PDFConverter._is_binary_stream`` on different output targets."""

    def test_stringio(self):
        """An in-memory text buffer is not reported as binary."""
        assert not PDFConverter._is_binary_stream(io.StringIO())

    def test_bytesio(self):
        """An in-memory bytes buffer is reported as binary."""
        assert PDFConverter._is_binary_stream(io.BytesIO())

    def test_tmpfile(self):
        """A temporary file opened with mode ``"w"`` is not reported as binary."""
        with TemporaryFile(mode="w") as f:
            assert not PDFConverter._is_binary_stream(f)

    def test_binary_tmpfile(self):
        """A temporary file opened with mode ``"wb"`` is reported as binary."""
        with TemporaryFile(mode="wb") as f:
            assert PDFConverter._is_binary_stream(f)

    def test_non_file_like_object_defaults_to_binary(self):
        """An object with no file-like attributes is treated as binary."""
        assert PDFConverter._is_binary_stream(object())

    def test_textiowrapper(self):
        """Text I/O wrappers are not reported as binary.

        Covers the fix referenced as #616: ``_is_binary_stream`` should
        recognize ``TextIOWrapper`` as non-binary.
        """
        # The abstract text base class, as originally asserted.
        assert not PDFConverter._is_binary_stream(io.TextIOBase())
        # A real TextIOWrapper, which is what the test name promises; the
        # ``with`` block closes the wrapper and its underlying buffer.
        with io.TextIOWrapper(io.BytesIO()) as wrapper:
            assert not PDFConverter._is_binary_stream(wrapper)
|