Fix bug in computing character bounding box (#348)
* Remove scaling font height/width with size of font bounding box
* Refactor LTChar bounding box computation
* Change expected outcome of `python tools/pdf2txt.py samples/simple3.pdf`, because it looks like an improvement. However, when I view `samples/simple3.pdf` I don't see any text at all. The change in expected outcome is explained by the fact that the bounding boxes of characters can be different, depending on the `/FontBBox` parameter of the font.
* Add test for font sizes, and for this a high-level function that returns an iterator of LTPage objects
* Add line to CHANGELOG
pull/357/head^2
parent
2f7f5d2667
commit
fff3ac2ba6
|
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
||||||
|
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
|
||||||
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
||||||
|
|
||||||
## [20200104] - 2019-01-04
|
## [20200104] - 2019-01-04
|
||||||
|
|
|
@ -1,15 +1,16 @@
|
||||||
"""Functions that can be used for the most common use-cases for pdfminer.six"""
|
"""Functions that can be used for the most common use-cases for pdfminer.six"""
|
||||||
|
|
||||||
import sys
|
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
from .converter import XMLConverter, HTMLConverter, TextConverter, \
|
||||||
from .pdfdevice import TagExtractor
|
PDFPageAggregator
|
||||||
from .pdfpage import PDFPage
|
|
||||||
from .converter import XMLConverter, HTMLConverter, TextConverter
|
|
||||||
from .image import ImageWriter
|
from .image import ImageWriter
|
||||||
from .layout import LAParams
|
from .layout import LAParams
|
||||||
from io import StringIO
|
from .pdfdevice import TagExtractor
|
||||||
|
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
|
from .pdfpage import PDFPage
|
||||||
|
|
||||||
|
|
||||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
|
@ -88,10 +89,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
|
|
||||||
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
caching=True, codec='utf-8', laparams=None):
|
caching=True, codec='utf-8', laparams=None):
|
||||||
"""
|
"""Parse and return the text contained in a PDF file.
|
||||||
Parses and returns the text contained in a PDF file.
|
|
||||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
|
||||||
Returns a string containing all of the text extracted.
|
|
||||||
|
|
||||||
:param pdf_file: Path to the PDF file to be worked on
|
:param pdf_file: Path to the PDF file to be worked on
|
||||||
:param password: For encrypted PDFs, the password to decrypt.
|
:param password: For encrypted PDFs, the password to decrypt.
|
||||||
|
@ -99,7 +97,9 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
:param maxpages: The maximum number of pages to parse
|
:param maxpages: The maximum number of pages to parse
|
||||||
:param caching: If resources should be cached
|
:param caching: If resources should be cached
|
||||||
:param codec: Text decoding codec
|
:param codec: Text decoding codec
|
||||||
:param laparams: LAParams object from pdfminer.layout.
|
:param laparams: An LAParams object from pdfminer.layout. If None, uses
|
||||||
|
some default settings that often work well.
|
||||||
|
:return: a string containing all of the text extracted.
|
||||||
"""
|
"""
|
||||||
if laparams is None:
|
if laparams is None:
|
||||||
laparams = LAParams()
|
laparams = LAParams()
|
||||||
|
@ -121,3 +121,30 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
|
||||||
return output_string.getvalue()
|
return output_string.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Extract and yield LTPage objects, one per processed page.

    This is a generator: the PDF file is opened when iteration starts and
    stays open until the generator is exhausted or closed.

    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: an iterator over the layout result (LTPage) of each page, as
        returned by PDFPageAggregator.get_result().
    """
    if laparams is None:
        laparams = LAParams()

    with open(pdf_file, "rb") as fp:
        # Honour ``caching`` in the resource manager as well; previously it
        # was only forwarded to PDFPage.get_pages, so caching=False still
        # left the resource manager on its default setting.
        resource_manager = PDFResourceManager(caching=caching)
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
                                      password=password, caching=caching):
            interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            yield device.get_result()
|
||||||
|
|
|
@ -280,28 +280,23 @@ class LTChar(LTComponent, LTText):
|
||||||
# compute the boundary rectangle.
|
# compute the boundary rectangle.
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
# vertical
|
# vertical
|
||||||
width = font.get_width() * fontsize
|
|
||||||
(vx, vy) = textdisp
|
(vx, vy) = textdisp
|
||||||
if vx is None:
|
if vx is None:
|
||||||
vx = width * 0.5
|
vx = fontsize * 0.5
|
||||||
else:
|
else:
|
||||||
vx = vx * fontsize * .001
|
vx = vx * fontsize * .001
|
||||||
vy = (1000 - vy) * fontsize * .001
|
vy = (1000 - vy) * fontsize * .001
|
||||||
tx = -vx
|
bbox_lower_left = (-vx, vy + rise + self.adv)
|
||||||
ty = vy + rise
|
bbox_upper_right = (-vx + fontsize, vy + rise)
|
||||||
bll = (tx, ty+self.adv)
|
|
||||||
bur = (tx+width, ty)
|
|
||||||
else:
|
else:
|
||||||
# horizontal
|
# horizontal
|
||||||
height = font.get_height() * fontsize
|
|
||||||
descent = font.get_descent() * fontsize
|
descent = font.get_descent() * fontsize
|
||||||
ty = descent + rise
|
bbox_lower_left = (0, descent + rise)
|
||||||
bll = (0, ty)
|
bbox_upper_right = (self.adv, descent + rise + fontsize)
|
||||||
bur = (self.adv, ty+height)
|
|
||||||
(a, b, c, d, e, f) = self.matrix
|
(a, b, c, d, e, f) = self.matrix
|
||||||
self.upright = (0 < a*d*scaling and b*c <= 0)
|
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||||
(x0, y0) = apply_matrix_pt(self.matrix, bll)
|
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
|
||||||
(x1, y1) = apply_matrix_pt(self.matrix, bur)
|
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
|
||||||
if x1 < x0:
|
if x1 < x0:
|
||||||
(x0, x1) = (x1, x0)
|
(x0, x1) = (x1, x0)
|
||||||
if y1 < y0:
|
if y1 < y0:
|
||||||
|
|
Binary file not shown.
|
@ -0,0 +1,22 @@
|
||||||
|
from helpers import absolute_sample_path
|
||||||
|
from pdfminer.high_level import extract_pages
|
||||||
|
from pdfminer.layout import LTChar, LTTextBox
|
||||||
|
|
||||||
|
|
||||||
|
def test_font_size():
    """Check that each character's size matches the number its line spells.

    Lines of the sample whose stripped text consists only of digits give the
    expected size for every LTChar on that line; any other line is only
    printed for inspection.
    """
    sample = absolute_sample_path('font-size-test.pdf')
    for page in extract_pages(sample):
        text_boxes = (item for item in page if isinstance(item, LTTextBox))
        for text_box in text_boxes:
            for line in text_box:
                label = line.get_text().strip()
                if not label.isdigit():
                    print(repr(line.get_text()))
                    continue

                expected_size = int(label)
                for glyph in line:
                    # Lines may also contain non-character layout items.
                    if not isinstance(glyph, LTChar):
                        continue
                    actual_size = int(round(glyph.size))
                    print(glyph, actual_size, expected_size)
                    assert expected_size == actual_size
|
|
@ -15,7 +15,7 @@ test_strings = {
|
||||||
"H e l l o \n\nW o r l d\n\n"
|
"H e l l o \n\nW o r l d\n\n"
|
||||||
"H e l l o \n\nW o r l d\n\n\f",
|
"H e l l o \n\nW o r l d\n\n\f",
|
||||||
"simple2.pdf": "\f",
|
"simple2.pdf": "\f",
|
||||||
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
|
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue