Fix bug in computing character bounding box (#348)
* Remove the scaling of font height/width by the size of the font bounding box
* Refactor the LTChar bounding box computation
* Change the expected outcome of `python tools/pdf2txt.py samples/simple3.pdf`, because it looks like an improvement. However, when I view `samples/simple3.pdf` I don't see any text at all. The change in expected outcome is explained by the fact that the bounding boxes of characters can be different, depending on the `/FontBBox` parameter of the font.
* Add a test for font sizes, and for this a high-level function that returns an iterator of LTPage objects
* Add a line to the CHANGELOG

pull/357/head^2
parent
2f7f5d2667
commit
fff3ac2ba6
|
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|||
|
||||
### Fixed
|
||||
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
|
||||
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
|
||||
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
|
||||
|
||||
## [20200104] - 2020-01-04
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
"""Functions that can be used for the most common use-cases for pdfminer.six"""
|
||||
|
||||
import sys
|
||||
import logging
|
||||
import sys
|
||||
from io import StringIO
|
||||
|
||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from .pdfdevice import TagExtractor
|
||||
from .pdfpage import PDFPage
|
||||
from .converter import XMLConverter, HTMLConverter, TextConverter
|
||||
from .converter import XMLConverter, HTMLConverter, TextConverter, \
|
||||
PDFPageAggregator
|
||||
from .image import ImageWriter
|
||||
from .layout import LAParams
|
||||
from io import StringIO
|
||||
from .pdfdevice import TagExtractor
|
||||
from .pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from .pdfpage import PDFPage
|
||||
|
||||
|
||||
def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||
|
@ -88,10 +89,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
|||
|
||||
def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
||||
caching=True, codec='utf-8', laparams=None):
|
||||
"""
|
||||
Parses and returns the text contained in a PDF file.
|
||||
Takes loads of optional arguments but the defaults are somewhat sane.
|
||||
Returns a string containing all of the text extracted.
|
||||
"""Parse and return the text contained in a PDF file.
|
||||
|
||||
:param pdf_file: Path to the PDF file to be worked on
|
||||
:param password: For encrypted PDFs, the password to decrypt.
|
||||
|
@ -99,7 +97,9 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
:param maxpages: The maximum number of pages to parse
|
||||
:param caching: If resources should be cached
|
||||
:param codec: Text decoding codec
|
||||
:param laparams: LAParams object from pdfminer.layout.
|
||||
:param laparams: An LAParams object from pdfminer.layout. If None, uses
|
||||
some default settings that often work well.
|
||||
:return: a string containing all of the text extracted.
|
||||
"""
|
||||
if laparams is None:
|
||||
laparams = LAParams()
|
||||
|
@ -121,3 +121,30 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
|
|||
interpreter.process_page(page)
|
||||
|
||||
return output_string.getvalue()
|
||||
|
||||
|
||||
def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
                  caching=True, laparams=None):
    """Lazily analyse a PDF file and yield the layout of each page.

    This is a generator: nothing is opened or parsed until iteration starts,
    and the file stays open while pages are still being consumed.

    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a generator yielding one LTPage object per processed page, as
        collected by the PDFPageAggregator device.
    """
    layout_params = LAParams() if laparams is None else laparams

    with open(pdf_file, "rb") as pdf_stream:
        rsrcmgr = PDFResourceManager()
        aggregator = PDFPageAggregator(rsrcmgr, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(rsrcmgr, aggregator)
        pages = PDFPage.get_pages(pdf_stream, page_numbers,
                                  maxpages=maxpages, password=password,
                                  caching=caching)
        for pdf_page in pages:
            # Interpreting a page fills the aggregator; its result is the
            # layout of the page that was just processed.
            page_interpreter.process_page(pdf_page)
            yield aggregator.get_result()
|
||||
|
|
|
@ -280,28 +280,23 @@ class LTChar(LTComponent, LTText):
|
|||
# compute the boundary rectangle.
|
||||
if font.is_vertical():
|
||||
# vertical
|
||||
width = font.get_width() * fontsize
|
||||
(vx, vy) = textdisp
|
||||
if vx is None:
|
||||
vx = width * 0.5
|
||||
vx = fontsize * 0.5
|
||||
else:
|
||||
vx = vx * fontsize * .001
|
||||
vy = (1000 - vy) * fontsize * .001
|
||||
tx = -vx
|
||||
ty = vy + rise
|
||||
bll = (tx, ty+self.adv)
|
||||
bur = (tx+width, ty)
|
||||
bbox_lower_left = (-vx, vy + rise + self.adv)
|
||||
bbox_upper_right = (-vx + fontsize, vy + rise)
|
||||
else:
|
||||
# horizontal
|
||||
height = font.get_height() * fontsize
|
||||
descent = font.get_descent() * fontsize
|
||||
ty = descent + rise
|
||||
bll = (0, ty)
|
||||
bur = (self.adv, ty+height)
|
||||
bbox_lower_left = (0, descent + rise)
|
||||
bbox_upper_right = (self.adv, descent + rise + fontsize)
|
||||
(a, b, c, d, e, f) = self.matrix
|
||||
self.upright = (0 < a*d*scaling and b*c <= 0)
|
||||
(x0, y0) = apply_matrix_pt(self.matrix, bll)
|
||||
(x1, y1) = apply_matrix_pt(self.matrix, bur)
|
||||
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
|
||||
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
|
||||
if x1 < x0:
|
||||
(x0, x1) = (x1, x0)
|
||||
if y1 < y0:
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,22 @@
|
|||
from helpers import absolute_sample_path
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTChar, LTTextBox
|
||||
|
||||
|
||||
def test_font_size():
    """Check that characters are reported at the size their line names.

    Every text line of ``font-size-test.pdf`` whose stripped text consists
    only of digits is read as the expected size of the characters on that
    line. All other lines are merely printed to ease debugging.
    """
    sample = absolute_sample_path('font-size-test.pdf')
    for page in extract_pages(sample):
        text_boxes = (item for item in page if isinstance(item, LTTextBox))
        for text_box in text_boxes:
            for line in text_box:
                label = line.get_text().strip()
                if not label.isdigit():
                    # Not a size label: show it, but there is nothing to check.
                    print(repr(line.get_text()))
                    continue

                expected_size = int(label)
                for glyph in line:
                    # Lines may also contain non-character layout items.
                    if not isinstance(glyph, LTChar):
                        continue
                    actual_size = int(round(glyph.size))
                    print(glyph, actual_size, expected_size)
                    assert expected_size == actual_size
|
|
@ -15,7 +15,7 @@ test_strings = {
|
|||
"H e l l o \n\nW o r l d\n\n"
|
||||
"H e l l o \n\nW o r l d\n\n\f",
|
||||
"simple2.pdf": "\f",
|
||||
"simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
|
||||
"simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue