Fix bug in computing character bounding box (#348)

* Remove scaling font height/width with size of font bounding box
* Refactor LTChar bounding box computation
* Change expected outcome of `python tools/pdf2txt.py samples/simple3.pdf`, because it looks like an improvement. However, when I view `samples/simple3.pdf` I don't see any text at all. The change in expected outcome is explained by the fact that the bounding boxes of characters can be different, depending on the `/FontBBox` parameter of the font.
* Add test for font sizes, and for this a high-level function that returns an iterator of LTPage objects
* Add line to CHANGELOG
2020-01-16 22:15:50 +01:00 · 2020-01-16 22:15:50 +01:00 · fff3ac2ba6
parent 2f7f5d2667
commit fff3ac2ba6
6 changed files with 69 additions and 24 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ### Fixed
 - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
+- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
 - KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))

 ## [20200104] - 2019-01-04
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -1,15 +1,16 @@
 """Functions that can be used for the most common use-cases for pdfminer.six"""

-import sys
 import logging
+import sys
+from io import StringIO

-from .pdfinterp import PDFResourceManager, PDFPageInterpreter
-from .pdfdevice import TagExtractor
-from .pdfpage import PDFPage
-from .converter import XMLConverter, HTMLConverter, TextConverter
+from .converter import XMLConverter, HTMLConverter, TextConverter, \
+    PDFPageAggregator
 from .image import ImageWriter
 from .layout import LAParams
-from io import StringIO
+from .pdfdevice import TagExtractor
+from .pdfinterp import PDFResourceManager, PDFPageInterpreter
+from .pdfpage import PDFPage


 def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
@ -88,10 +89,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',

 def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 caching=True, codec='utf-8', laparams=None):
-    """
-    Parses and returns the text contained in a PDF file.
-    Takes loads of optional arguments but the defaults are somewhat sane.
-    Returns a string containing all of the text extracted.
+    """Parse and return the text contained in a PDF file.

    :param pdf_file: Path to the PDF file to be worked on
    :param password: For encrypted PDFs, the password to decrypt.
@ -99,7 +97,9 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
-    :param laparams: LAParams object from pdfminer.layout.
+    :param laparams: An LAParams object from pdfminer.layout. If None, uses
+        some default settings that often work well.
+    :return: a string containing all of the text extracted.
    """
    if laparams is None:
        laparams = LAParams()
@ -121,3 +121,30 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
            interpreter.process_page(page)

        return output_string.getvalue()
+
+
+def extract_pages(pdf_file, password='', page_numbers=None, maxpages=0,
+                  caching=True, laparams=None):
+    """Extract and yield LTPage objects
+
+    :param pdf_file: Path to the PDF file to be worked on
+    :param password: For encrypted PDFs, the password to decrypt.
+    :param page_numbers: List of zero-indexed page numbers to extract.
+    :param maxpages: The maximum number of pages to parse
+    :param caching: If resources should be cached
+    :param laparams: An LAParams object from pdfminer.layout. If None, uses
+        some default settings that often work well.
+    :return: a generator yielding one LTPage object per processed page.
+    """
+    if laparams is None:
+        laparams = LAParams()
+
+    with open(pdf_file, "rb") as fp:
+        resource_manager = PDFResourceManager()
+        device = PDFPageAggregator(resource_manager, laparams=laparams)
+        interpreter = PDFPageInterpreter(resource_manager, device)
+        for page in PDFPage.get_pages(fp, page_numbers, maxpages=maxpages,
+                                      password=password, caching=caching):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            yield layout
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@ -280,28 +280,23 @@ class LTChar(LTComponent, LTText):
        # compute the boundary rectangle.
        if font.is_vertical():
            # vertical
-            width = font.get_width() * fontsize
            (vx, vy) = textdisp
            if vx is None:
-                vx = width * 0.5
+                vx = fontsize * 0.5
            else:
                vx = vx * fontsize * .001
            vy = (1000 - vy) * fontsize * .001
-            tx = -vx
-            ty = vy + rise
-            bll = (tx, ty+self.adv)
-            bur = (tx+width, ty)
+            bbox_lower_left = (-vx, vy + rise + self.adv)
+            bbox_upper_right = (-vx + fontsize, vy + rise)
        else:
            # horizontal
-            height = font.get_height() * fontsize
            descent = font.get_descent() * fontsize
-            ty = descent + rise
-            bll = (0, ty)
-            bur = (self.adv, ty+height)
+            bbox_lower_left = (0, descent + rise)
+            bbox_upper_right = (self.adv, descent + rise + fontsize)
        (a, b, c, d, e, f) = self.matrix
        self.upright = (0 < a*d*scaling and b*c <= 0)
-        (x0, y0) = apply_matrix_pt(self.matrix, bll)
-        (x1, y1) = apply_matrix_pt(self.matrix, bur)
+        (x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
+        (x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
        if x1 < x0:
            (x0, x1) = (x1, x0)
        if y1 < y0:
--- a/samples/font-size-test.pdf
+++ b/samples/font-size-test.pdf
--- a/tests/test_font_size.py
+++ b/tests/test_font_size.py
@ -0,0 +1,22 @@
+from helpers import absolute_sample_path
+from pdfminer.high_level import extract_pages
+from pdfminer.layout import LTChar, LTTextBox
+
+
+def test_font_size():
+    path = absolute_sample_path('font-size-test.pdf')
+    for page in extract_pages(path):
+        for text_box in page:
+            if isinstance(text_box, LTTextBox):
+                for line in text_box:
+                    possible_number = line.get_text().strip()
+                    if possible_number.isdigit():
+                        expected_size = int(possible_number)
+
+                        for char in line:
+                            if isinstance(char, LTChar):
+                                actual_size = int(round(char.size))
+                                print(char, actual_size, expected_size)
+                                assert expected_size == actual_size
+                    else:
+                        print(repr(line.get_text()))
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@ -15,7 +15,7 @@ test_strings = {
                   "H e l l o  \n\nW o r l d\n\n"
                   "H e l l o  \n\nW o r l d\n\n\f",
    "simple2.pdf": "\f",
-    "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
+    "simple3.pdf": "Hello\n\nHello\n\nWorld\n\nWorld\n\n\f",
 }