Fix bug in computing character bounding box (#348)

* Remove scaling font height/width with size of font bounding box

* Refactor LTChar bounding box computation

* Change expected outcome of `python tools/pdf2txt.py samples/simple3.pdf`, because it looks like an improvement. However, when I view `samples/simple3.pdf` I don't see any text at all. The change in expected outcome is explained by the fact that the bounding boxes of characters can be different, depending on the `/FontBBox` parameter of the font.

* Add test for font sizes, and for this a high-level function that returns an iterator of LTPage objects

* Add line to CHANGELOG
pull/357/head^2
Pieter Marsman 2020-01-16 22:15:50 +01:00 committed by GitHub
parent 2f7f5d2667
commit fff3ac2ba6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 69 additions and 24 deletions

View File

@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
## [20200104] - 2020-01-04

View File

@ -1,15 +1,16 @@
"""Functions that can be used for the most common use-cases for pdfminer.six"""
import sys
import logging
import sys
from io import StringIO
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfdevice import TagExtractor
from .pdfpage import PDFPage
from .converter import XMLConverter, HTMLConverter, TextConverter
from .converter import XMLConverter, HTMLConverter, TextConverter, \
PDFPageAggregator
from .image import ImageWriter
from .layout import LAParams
from io import StringIO
from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@ -88,10 +89,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, codec='utf-8', laparams=None):
"""
Parses and returns the text contained in a PDF file.
Takes loads of optional arguments but the defaults are somewhat sane.
Returns a string containing all of the text extracted.
"""Parse and return the text contained in a PDF file.
:param pdf_file: Path to the PDF file to be worked on
:param password: For encrypted PDFs, the password to decrypt.
@ -99,7 +97,9 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
:param maxpages: The maximum number of pages to parse
:param caching: If resources should be cached
:param codec: Text decoding codec
:param laparams: LAParams object from pdfminer.layout.
:param laparams: An LAParams object from pdfminer.layout. If None, uses
some default settings that often work well.
:return: a string containing all of the text extracted.
"""
if laparams is None:
laparams = LAParams()
@ -121,3 +121,30 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
interpreter.process_page(page)
return output_string.getvalue()
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Extract and yield LTPage objects

    This is a generator: nothing is opened or parsed until the first item
    is requested, and the PDF file stays open until the generator is
    exhausted or closed, because pages are yielded from inside the
    ``with`` block.

    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a generator yielding one LTPage layout object per processed
        page
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp:
        # Forward `caching` to the resource manager as well; previously it
        # only reached PDFPage.get_pages, so the resource manager always
        # used its own default regardless of what the caller asked for.
        resource_manager = PDFResourceManager(caching=caching)
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
                                      password=password, caching=caching):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            yield device.get_result()

View File

@ -280,28 +280,23 @@ class LTChar(LTComponent, LTText):
# compute the boundary rectangle.
if font.is_vertical():
# vertical
width = font.get_width() * fontsize
(vx, vy) = textdisp
if vx is None:
vx = width * 0.5
vx = fontsize * 0.5
else:
vx = vx * fontsize * .001
vy = (1000 - vy) * fontsize * .001
tx = -vx
ty = vy + rise
bll = (tx, ty+self.adv)
bur = (tx+width, ty)
bbox_lower_left = (-vx, vy + rise + self.adv)
bbox_upper_right = (-vx + fontsize, vy + rise)
else:
# horizontal
height = font.get_height() * fontsize
descent = font.get_descent() * fontsize
ty = descent + rise
bll = (0, ty)
bur = (self.adv, ty+height)
bbox_lower_left = (0, descent + rise)
bbox_upper_right = (self.adv, descent + rise + fontsize)
(a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0)
(x0, y0) = apply_matrix_pt(self.matrix, bll)
(x1, y1) = apply_matrix_pt(self.matrix, bur)
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
if x1 < x0:
(x0, x1) = (x1, x0)
if y1 < y0:

BIN
samples/font-size-test.pdf Normal file

Binary file not shown.

22
tests/test_font_size.py Normal file
View File

@ -0,0 +1,22 @@
from helpers import absolute_sample_path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTTextBox
def test_font_size():
    """Check that character sizes match the number written on their line.

    Every text line of the sample whose stripped text consists only of
    digits is read as the expected size; each LTChar on that line must
    then have a rounded ``size`` equal to it. Lines that are not purely
    numeric are printed and otherwise ignored.
    """
    sample = absolute_sample_path('font-size-test.pdf')
    for page in extract_pages(sample):
        for element in page:
            if not isinstance(element, LTTextBox):
                continue
            for line in element:
                label = line.get_text().strip()
                if not label.isdigit():
                    # Not a size label: show it for debugging and move on.
                    print(repr(line.get_text()))
                    continue
                expected_size = int(label)
                for item in line:
                    if not isinstance(item, LTChar):
                        continue
                    actual_size = int(round(item.size))
                    print(item, actual_size, expected_size)
                    assert expected_size == actual_size

View File

@ -15,7 +15,7 @@ test_strings = {
"H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f",
"simple2.pdf": "\f",
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
}