pdfminer.six/tests/test_converter.py

210 lines
6.4 KiB
Python
Raw Normal View History

import io
from tempfile import TemporaryFile
from nose.tools import assert_equal, assert_false, assert_true
from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter
Fix .paint_path handling of single line segments (#530) * Fix .paint_path handling of single line segments - Fixes typo ("ml" should have been "mlh") - Removes if-statement that required individual line segments to be strictly horizontal or vertical. * Treat 'ml'-shape paths as lines not curves Although 'mlh' is the canonical implementation for a single line segment, 'ml' is fairly common. Adds tests and sample PDF. * Fix trailing whitespace * Fix point-extraction from Bézier path commands This commit corrects the manner in which "pts" are extracted from Bézier path commands. See Table 4.9 of PDF reference manual, and new comments in code for details. Previously, depending on the command (c, v, or y), the code was extracting some combination of control points (not on curve) and the actual points-on-curve. This commit also refactors .paint_path, so that apply_matrix_pt is only called in one place, and to treat the "h" command in a manner more consistent with other path commands. * Add comments to test_paint_path_quadrilaterals * Parse rect-forming mllll paths as rects not curves Now that .paint_path has been refactored, adding support for rect-forming mllll paths requires no extra code, beyond a minor tweak to the relevant elif statement. * One changelog line with ref to mr * Remove PDFLayoutAnalyzer._create_curve because implementation has become trivial due to refactoring * Extract variables from if statement to make it easier to read * Optimize imports order * Trigger travis build * Revert "Trigger travis build" This reverts commit 41c05184 * Update travis badge * Update travis badge Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2021-07-27 16:27:32 +00:00
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTContainer, LTRect, LTLine, LTCurve
from pdfminer.pdfinterp import PDFGraphicState
class TestPaintPath():
    """Check which layout objects ``PDFLayoutAnalyzer.paint_path`` produces.

    Each test hands a hand-written list of path commands (``m``, ``l``,
    ``h``, ``c``, ``v``, ``y``) to a fresh analyzer and inspects the objects
    that end up in the analyzer's current container.
    """

    def _get_analyzer(self):
        """Return a ``PDFLayoutAnalyzer`` built without a resource manager.

        The transformation matrix is set to ``[1, 0, 0, 1, 0, 0]`` so that
        the coordinates asserted on in the tests are the same numbers that
        were written in the path commands.
        """
        analyzer = PDFLayoutAnalyzer(None)
        analyzer.set_ctm([1, 0, 0, 1, 0, 0])
        return analyzer

    def _parse(self, path, bbox=(0, 1000, 0, 1000)):
        """Paint ``path`` into an empty container and return its objects.

        :param path: list of path-command tuples, e.g. ``('m', 6, 7)``.
        :param bbox: four numbers handed to ``LTContainer`` for the
            container that receives the painted objects.
        :return: the container's ``_objs`` list after painting.
        """
        analyzer = self._get_analyzer()
        analyzer.cur_item = LTContainer(list(bbox))
        # All three boolean paint flags are switched off, as in every
        # test of this class.
        analyzer.paint_path(PDFGraphicState(), False, False, False, path)
        return analyzer.cur_item._objs

    def test_paint_path(self):
        """A bare 'ml' path results in exactly one object."""
        path = [('m', 6, 7), ('l', 7, 7)]
        objs = self._parse(path, bbox=(0, 100, 0, 100))
        assert_equal(len(objs), 1)

    def test_paint_path_mlllh(self):
        """A single closed 'mlllh' path results in exactly one object."""
        path = [('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',)]
        objs = self._parse(path, bbox=(0, 100, 0, 100))
        assert_equal(len(objs), 1)

    def test_paint_path_multiple_mlllh(self):
        """Path from samples/contrib/issue-00369-excel.pdf"""
        path = [
            ('m', 6, 7), ('l', 7, 7), ('l', 7, 91), ('l', 6, 91), ('h',),
            ('m', 4, 7), ('l', 6, 7), ('l', 6, 91), ('l', 4, 91), ('h',),
            ('m', 67, 2), ('l', 68, 2), ('l', 68, 3), ('l', 67, 3), ('h',)
        ]
        objs = self._parse(path, bbox=(0, 100, 0, 100))
        # One object is expected per 'm ... h' subpath.
        assert_equal(len(objs), 3)

    def test_paint_path_quadrilaterals(self):
        """via https://github.com/pdfminer/pdfminer.six/issues/473"""
        # Imported locally so the test class does not depend on the
        # module-level import list.
        import os

        def get_types(path):
            return [type(obj) for obj in self._parse(path)]

        # Standard rect
        assert_equal(get_types([
            ("m", 10, 90),
            ("l", 90, 90),
            ("l", 90, 10),
            ("l", 10, 10),
            ("h",),
        ]), [LTRect])

        # Same but mllll variation
        assert_equal(get_types([
            ("m", 10, 90),
            ("l", 90, 90),
            ("l", 90, 10),
            ("l", 10, 10),
            ("l", 10, 90),
        ]), [LTRect])

        # Bowtie shape
        assert_equal(get_types([
            ("m", 110, 90),
            ("l", 190, 10),
            ("l", 190, 90),
            ("l", 110, 10),
            ("h",),
        ]), [LTCurve])

        # Quadrilateral with one slanted side
        assert_equal(get_types([
            ("m", 210, 90),
            ("l", 290, 60),
            ("l", 290, 10),
            ("l", 210, 10),
            ("h",),
        ]), [LTCurve])

        # Path with two rect subpaths
        assert_equal(get_types([
            ("m", 310, 90),
            ("l", 350, 90),
            ("l", 350, 10),
            ("l", 310, 10),
            ("h",),
            ("m", 350, 90),
            ("l", 390, 90),
            ("l", 390, 10),
            ("l", 350, 10),
            ("h",),
        ]), [LTRect, LTRect])

        # Path with one rect subpath and one pentagon
        assert_equal(get_types([
            ("m", 410, 90),
            ("l", 445, 90),
            ("l", 445, 10),
            ("l", 410, 10),
            ("h",),
            ("m", 455, 70),
            ("l", 475, 90),
            ("l", 490, 70),
            ("l", 490, 10),
            ("l", 455, 10),
            ("h",),
        ]), [LTRect, LTCurve])

        # Three types of simple lines
        assert_equal(get_types([
            # Vertical line
            ("m", 10, 30),
            ("l", 10, 40),
            ("h",),
            # Horizontal line
            ("m", 10, 50),
            ("l", 70, 50),
            ("h",),
            # Diagonal line
            ("m", 10, 10),
            ("l", 30, 30),
            ("h",),
        ]), [LTLine, LTLine, LTLine])

        # Same as above, but 'ml' variation
        assert_equal(get_types([
            # Vertical line
            ("m", 10, 30),
            ("l", 10, 40),
            # Horizontal line
            ("m", 10, 50),
            ("l", 70, 50),
            # Diagonal line
            ("m", 10, 10),
            ("l", 30, 30),
        ]), [LTLine, LTLine, LTLine])

        # There are six lines in this one-page PDF;
        # they all have shape 'ml' not 'mlh'
        relative_path = "samples/contrib/pr-00530-ml-lines.pdf"
        # Prefer a location anchored at this test file so the test does
        # not depend on the current working directory.
        # NOTE(review): assumes samples/ sits next to the directory that
        # holds this file; if it is not found there, the original
        # cwd-relative path is used unchanged.
        anchored_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir, relative_path)
        if os.path.exists(anchored_path):
            sample_path = anchored_path
        else:
            sample_path = relative_path
        ml_pdf_page = list(extract_pages(sample_path))[0]
        # Exact type comparison on purpose: only objects whose class is
        # precisely LTLine are counted.
        line_count = sum(type(item) is LTLine for item in ml_pdf_page)
        assert_equal(line_count, 6)

    def test_paint_path_beziers(self):
        """See section 4.4, table 4.9 of the PDF reference manual"""
        start = (72.41, 433.89)
        end = (71.41, 434.89)
        curve_commands = [
            # "c" operator
            ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89),
            # "v" operator
            ("v", 71.96, 434.89, 71.41, 434.89),
            # "y" operator
            ("y", 72.41, 434.45, 71.41, 434.89),
        ]
        for command in curve_commands:
            objs = self._parse([("m",) + start, command])
            # Only the start point and the final point of the curve
            # command are expected in pts; the intermediate coordinates
            # of the command must not appear.
            assert_equal(
                objs[0].pts, [start, end],
                "unexpected pts for operator %r" % command[0])
class TestBinaryDetector:
    """Exercise ``PDFConverter._is_binary_stream`` with several streams."""

    def test_stringio(self):
        """An ``io.StringIO`` must not be reported as binary."""
        text_stream = io.StringIO()
        assert_false(PDFConverter._is_binary_stream(text_stream))

    def test_bytesio(self):
        """An ``io.BytesIO`` must be reported as binary."""
        byte_stream = io.BytesIO()
        assert_true(PDFConverter._is_binary_stream(byte_stream))

    def test_tmpfile(self):
        """A temporary file opened in text mode must not be binary."""
        with TemporaryFile(mode='w') as text_file:
            detected = PDFConverter._is_binary_stream(text_file)
            assert_false(detected)

    def test_binary_tmpfile(self):
        """A temporary file opened in binary mode must be binary."""
        with TemporaryFile(mode='wb') as binary_file:
            detected = PDFConverter._is_binary_stream(binary_file)
            assert_true(detected)

    def test_non_file_like_object_defaults_to_binary(self):
        """An object that is not a stream at all must be reported binary."""
        not_a_stream = object()
        assert_true(PDFConverter._is_binary_stream(not_a_stream))