Add HOCRConverter (fixes #650) (#651)

* Add HOCRConverter
* Add line to README.md
* Test cicd
* Test cicd 2
* Changes based on review comments
* Remove whitespace changes to CHANGELOG.md
* Remove duplicated html output
* Add link to hocr wiki
* Add tests for extracting hocr and html

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
2022-08-14 10:52:50 +01:00 · 2022-08-14 10:52:50 +01:00 · 77df431871
parent f79ad56f48
commit 77df431871
5 changed files with 200 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,11 +1,12 @@
 # Changelog
 All notable changes in pdfminer.six will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ## [Unreleased]
 - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
 ### Fixed
 - 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
--- a/README.md
+++ b/README.md
@ -23,6 +23,7 @@ Features
 * Written entirely in Python.
 * Parse, analyze, and convert PDF documents.
 * Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
 * PDF-1.7 specification support. (well, almost).
 * CJK languages and vertical writing scripts support.
 * Various font types (Type1, TrueType, Type3, and CID) support.
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -19,6 +19,7 @@ from pdfminer.pdfcolor import PDFColorSpace
 from . import utils
 from .image import ImageWriter
 from .layout import LAParams, LTComponent, TextGroupElement
 from .layout import LTAnno
 from .layout import LTChar
 from .layout import LTContainer
 from .layout import LTCurve
@ -821,3 +822,179 @@ class XMLConverter(PDFConverter[AnyIO]):
    def close(self) -> None:
        """Finish the output by writing the document footer."""
        self.write_footer()
 class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF.

    Each page becomes an ``ocr_page`` div, text boxes become ``ocr_block``
    divs, text lines become ``ocr_line`` spans and runs of characters that
    share baseline, font and size become ``ocrx_word`` spans. Bounding boxes
    are truncated to integers and flipped vertically relative to the page.
    """

    #   Where text is being extracted from a variety of types of PDF within a
    #   business process, those PDFs where the text is only present in image
    #   form will need to be analysed using an OCR tool which will typically
    #   output hOCR. This converter extracts the explicit text information from
    #   those PDFs that do have it and uses it to generate a basic hOCR
    #   representation that is designed to be used in conjunction with the image
    #   of the PDF in the same way as genuine OCR output would be, but without the
    #   inevitable OCR errors.
    #   The converter does not handle images, diagrams or text colors.
    #   In the examples processed by the contributor it was necessary to set
    #   LAParams.all_texts to True.

    # Control characters removed when ``stripcontrol`` is set; tab, line feed
    # and carriage return are deliberately not part of the class.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ):
        """Set up the converter and immediately write the hOCR header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: output stream; written as bytes when ``codec`` is set,
            as text otherwise.
        :param codec: encoding used for the output, forwarded to
            ``PDFConverter``.
        :param pageno: forwarded to ``PDFConverter``.
        :param laparams: layout parameters forwarded to ``PDFConverter``.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from the emitted text.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
        )
        self.stripcontrol = stripcontrol
        # State of the word currently being assembled from LTChar items.
        # Initialised here so write_word() never reads an unset attribute.
        self.within_chars = False
        self.working_text = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font = ""
        self.working_size = 0.0
        # Replaced for every page in receive_layout(); bbox_repr() needs the
        # page height to flip the y axis.
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` property string.

        Coordinates are truncated with ``int()``. The y values are measured
        from ``self.page_bbox[3]`` so that the origin ends up at the top.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write ``text`` to the output, encoding it first if a codec is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``, the ``<head>`` metadata and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n"
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): this always declares utf-8 even when self.codec names
        # a different encoding - confirm whether it should follow the codec.
        self.write(
            "<meta http-equiv='Content-Type' " "content='text/html;charset=utf-8' />\n"
        )
        self.write(
            "<meta name='ocr-system' " "content='pdfminer.six HOCR Converter' />\n"
        )
        self.write(
            "  <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n"
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write a commented-out hocrjs script tag and close body and html."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'>" "</script--></body></html>\n"
        )

    def write_text(self, text: str) -> None:
        """Write ``text`` verbatim, minus control characters if requested."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the pending word as an ``ocrx_word`` span and reset word state.

        The text is stripped of surrounding whitespace, optionally stripped of
        control characters, and HTML-escaped together with the font name so
        that characters such as ``<``, ``&`` or quotes cannot break the
        markup. Nothing is written when no visible text remains. Bold and
        italic styling is guessed from the substrings "Bold" / "Italic" in
        the font name.
        """
        from html import escape

        word = self.working_text
        if self.stripcontrol:
            # Remove control characters before strip() so that whitespace they
            # were hiding at the edges is trimmed as well.
            word = self.CONTROL.sub("", word)
        word = word.strip()
        if word:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            # escape() also encodes both quote characters, which matters
            # because the font name is placed inside attribute values.
            font = escape(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    font,
                    self.working_size,
                    escape(word),
                )
            )
        # Clear the accumulator so text already written can never leak into
        # the next word.
        self.working_text = ""
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Walk the layout tree of ``ltpage`` and write it as hOCR markup."""

        def flush_word() -> None:
            # Write the word being assembled, if there is one.
            if self.within_chars:
                self.write_word()

        def start_word(char: LTChar) -> None:
            # Begin a new word whose box and font come from ``char``.
            self.within_chars = True
            self.working_text = char.get_text()
            self.working_bbox = char.bbox
            self.working_font = char.fontname
            self.working_size = char.size

        def render(item: LTItem) -> None:
            if isinstance(item, LTAnno):
                # Annotations carry no geometry; they only end the word.
                flush_word()
            elif isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                # Characters not followed by an LTAnno would otherwise stay
                # pending past the end of the page.
                flush_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % self.bbox_repr(item.bbox)
                )
                for child_line in item:
                    render(child_line)
                flush_word()
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                text = item.get_text()
                if not text.strip():
                    # Whitespace separates words and is written between the
                    # spans; it never starts a word, which would yield empty
                    # spans or boxes widened by leading blanks.
                    flush_word()
                    self.write_text(text)
                elif (
                    not self.within_chars
                    or self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # First character, or a change of baseline/font/size:
                    # finish the previous word and start afresh from this
                    # character so no text is duplicated or dropped.
                    flush_word()
                    start_word(item)
                else:
                    self.working_text += text
                    # Grow the box to the right; the vertical extent stays
                    # that of the first character of the word.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@ -5,7 +5,13 @@ import sys
 from io import StringIO
 from typing import Any, BinaryIO, Container, Iterator, Optional, cast
-from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
+from .converter import (
    XMLConverter,
    HTMLConverter,
    TextConverter,
    PDFPageAggregator,
    HOCRConverter,
 )
 from .image import ImageWriter
 from .layout import LAParams, LTPage
 from .pdfdevice import PDFDevice, TagExtractor
@ -41,8 +47,8 @@ def extract_text_to_fp(
    :param inf: a file-like object to read PDF structure from, such as a
        file handler (using the builtin `open()` function) or a `BytesIO`.
    :param outfp: a file-like object to write the text to.
-    :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
+    :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
-        properly.
+        Only 'text' works properly.
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. Default is None
        but may not layout correctly.
@ -100,6 +106,11 @@ def extract_text_to_fp(
            imagewriter=imagewriter,
        )
    elif output_type == "hocr":
        device = HOCRConverter(
            rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
        )
    elif output_type == "tag":
        # Binary I/O is required, but we have no good way to test it here.
        device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@ -111,6 +111,12 @@ class TestPdf2Txt:
    def test_encryption_rc4_128(self):
        # Passes the RC4-128 encrypted sample and the option string "-P foo"
        # to the `run` helper (defined outside this view).
        run("encryption/rc4-128.pdf", "-P foo")
    def test_html_simple1(self):
        # Passes simple1.pdf with "-t html" to the `run` helper; no output
        # content is asserted here.
        run("simple1.pdf", "-t html")
    def test_hocr_simple1(self):
        # Passes simple1.pdf with "-t hocr" to the `run` helper; no output
        # content is asserted here.
        run("simple1.pdf", "-t hocr")
 class TestDumpImages:
    @staticmethod