Fix bug in computing character bounding box (#348)

* Remove scaling font height/width with size of font bounding box

* Refactor LTChar bounding box computation

* Change expected outcome of `python tools/pdf2txt.py samples/simple3.pdf`, because it looks like an improvement. However, when I view `samples/simple3.pdf` I don't see any text at all. The change in expected outcome is explained by the fact that the bounding boxes of characters can be different, depending on the `/FontBBox` parameter of the font.

* Add a test for font sizes, together with a new high-level function that the test relies on, which returns an iterator of LTPage objects

* Add line to CHANGELOG
pull/357/head^2
Pieter Marsman 2020-01-16 22:15:50 +01:00 committed by GitHub
parent 2f7f5d2667
commit fff3ac2ba6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 69 additions and 24 deletions

View File

@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Fixed ### Fixed
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352)) - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338)) - KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
## [20200104] - 2020-01-04 ## [20200104] - 2020-01-04

View File

@ -1,15 +1,16 @@
"""Functions that can be used for the most common use-cases for pdfminer.six""" """Functions that can be used for the most common use-cases for pdfminer.six"""
import sys
import logging import logging
import sys
from io import StringIO
from .pdfinterp import PDFResourceManager, PDFPageInterpreter from .converter import XMLConverter, HTMLConverter, TextConverter, \
from .pdfdevice import TagExtractor PDFPageAggregator
from .pdfpage import PDFPage
from .converter import XMLConverter, HTMLConverter, TextConverter
from .image import ImageWriter from .image import ImageWriter
from .layout import LAParams from .layout import LAParams
from io import StringIO from .pdfdevice import TagExtractor
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
from .pdfpage import PDFPage
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@ -88,10 +89,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0, def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
caching=True, codec='utf-8', laparams=None): caching=True, codec='utf-8', laparams=None):
""" """Parse and return the text contained in a PDF file.
Parses and returns the text contained in a PDF file.
Takes loads of optional arguments but the defaults are somewhat sane.
Returns a string containing all of the text extracted.
:param pdf_file: Path to the PDF file to be worked on :param pdf_file: Path to the PDF file to be worked on
:param password: For encrypted PDFs, the password to decrypt. :param password: For encrypted PDFs, the password to decrypt.
@ -99,7 +97,9 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
:param maxpages: The maximum number of pages to parse :param maxpages: The maximum number of pages to parse
:param caching: If resources should be cached :param caching: If resources should be cached
:param codec: Text decoding codec :param codec: Text decoding codec
:param laparams: LAParams object from pdfminer.layout. :param laparams: An LAParams object from pdfminer.layout. If None, uses
some default settings that often work well.
:return: a string containing all of the text extracted.
""" """
if laparams is None: if laparams is None:
laparams = LAParams() laparams = LAParams()
@ -121,3 +121,30 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
interpreter.process_page(page) interpreter.process_page(page)
return output_string.getvalue() return output_string.getvalue()
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Extract and yield LTPage objects.

    This is a generator function: nothing is read until iteration starts,
    and the PDF file stays open until the generator is exhausted or closed.

    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a generator yielding one layout object (the result of the
        page aggregator) for every processed page, in document order.
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp:
        # Forward `caching` so that disabling it also affects the resource
        # manager, not only the page iteration below.
        resource_manager = PDFResourceManager(caching=caching)
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
                                      password=password, caching=caching):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            yield device.get_result()

View File

@ -280,28 +280,23 @@ class LTChar(LTComponent, LTText):
# compute the boundary rectangle. # compute the boundary rectangle.
if font.is_vertical(): if font.is_vertical():
# vertical # vertical
width = font.get_width() * fontsize
(vx, vy) = textdisp (vx, vy) = textdisp
if vx is None: if vx is None:
vx = width * 0.5 vx = fontsize * 0.5
else: else:
vx = vx * fontsize * .001 vx = vx * fontsize * .001
vy = (1000 - vy) * fontsize * .001 vy = (1000 - vy) * fontsize * .001
tx = -vx bbox_lower_left = (-vx, vy + rise + self.adv)
ty = vy + rise bbox_upper_right = (-vx + fontsize, vy + rise)
bll = (tx, ty+self.adv)
bur = (tx+width, ty)
else: else:
# horizontal # horizontal
height = font.get_height() * fontsize
descent = font.get_descent() * fontsize descent = font.get_descent() * fontsize
ty = descent + rise bbox_lower_left = (0, descent + rise)
bll = (0, ty) bbox_upper_right = (self.adv, descent + rise + fontsize)
bur = (self.adv, ty+height)
(a, b, c, d, e, f) = self.matrix (a, b, c, d, e, f) = self.matrix
self.upright = (0 < a*d*scaling and b*c <= 0) self.upright = (0 < a*d*scaling and b*c <= 0)
(x0, y0) = apply_matrix_pt(self.matrix, bll) (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
(x1, y1) = apply_matrix_pt(self.matrix, bur) (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
if x1 < x0: if x1 < x0:
(x0, x1) = (x1, x0) (x0, x1) = (x1, x0)
if y1 < y0: if y1 < y0:

BIN
samples/font-size-test.pdf Normal file

Binary file not shown.

22
tests/test_font_size.py Normal file
View File

@ -0,0 +1,22 @@
from helpers import absolute_sample_path
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTChar, LTTextBox
def test_font_size():
    """Check that character sizes match the number printed on their line.

    Every line of the sample PDF whose stripped text consists only of
    digits is treated as the expected (rounded) size of each LTChar on
    that line. Any other line is merely printed.
    """
    sample = absolute_sample_path('font-size-test.pdf')
    for layout in extract_pages(sample):
        for element in layout:
            if not isinstance(element, LTTextBox):
                continue
            for text_line in element:
                label = text_line.get_text().strip()
                if not label.isdigit():
                    # Not a size label: show the raw text and move on.
                    print(repr(text_line.get_text()))
                    continue

                expected_size = int(label)
                for glyph in text_line:
                    if not isinstance(glyph, LTChar):
                        continue
                    actual_size = int(round(glyph.size))
                    print(glyph, actual_size, expected_size)
                    assert expected_size == actual_size

View File

@ -15,7 +15,7 @@ test_strings = {
"H e l l o \n\nW o r l d\n\n" "H e l l o \n\nW o r l d\n\n"
"H e l l o \n\nW o r l d\n\n\f", "H e l l o \n\nW o r l d\n\n\f",
"simple2.pdf": "\f", "simple2.pdf": "\f",
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f", "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
} }