From 77df431871c6cfe32f8cc794e138a7aec4e3e098 Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Sun, 14 Aug 2022 10:52:50 +0100 Subject: [PATCH] Add HOCRConverter (fixes #650) (#651) * Add HOCRConverter * Add line to README.md * Test cicd * Test cicd 2 * Changes based on review comments * Remove whitespace changes to CHANGELOG.md * Remove duplicated html output * Add link to hocr wiki * Add tests for extracting hocr and html Co-authored-by: Pieter Marsman --- CHANGELOG.md | 3 +- README.md | 1 + pdfminer/converter.py | 177 ++++++++++++++++++++++++++++++++++++ pdfminer/high_level.py | 17 +++- tests/test_tools_pdf2txt.py | 6 ++ 5 files changed, 200 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0702a1..5a488d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,12 @@ # Changelog - All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651)) + ### Fixed - 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773)) diff --git a/README.md b/README.md index e211c43..b8c2542 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Features * Written entirely in Python. * Parse, analyze, and convert PDF documents. +* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR). * PDF-1.7 specification support. (well, almost). * CJK languages and vertical writing scripts support. * Various font types (Type1, TrueType, Type3, and CID) support. diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 3da2fcb..6b367aa 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -19,6 +19,7 @@ from pdfminer.pdfcolor import PDFColorSpace from . 
import utils from .image import ImageWriter from .layout import LAParams, LTComponent, TextGroupElement +from .layout import LTAnno from .layout import LTChar from .layout import LTContainer from .layout import LTCurve @@ -821,3 +822,179 @@ class XMLConverter(PDFConverter[AnyIO]): def close(self) -> None: self.write_footer() return + + +class HOCRConverter(PDFConverter[AnyIO]): + """Extract an hOCR representation from explicit text information within a PDF.""" + + # Where text is being extracted from a variety of types of PDF within a + # business process, those PDFs where the text is only present in image + # form will need to be analysed using an OCR tool which will typically + # output hOCR. This converter extracts the explicit text information from + # those PDFs that do have it and uses it to generate a basic hOCR + # representation that is designed to be used in conjunction with the image + # of the PDF in the same way as genuine OCR output would be, but without the + # inevitable OCR errors. + + # The converter does not handle images, diagrams or text colors. + + # In the examples processed by the contributor it was necessary to set + # LAParams.all_texts to True. 
+ + CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]") + + def __init__( + self, + rsrcmgr: PDFResourceManager, + outfp: AnyIO, + codec: str = "utf8", + pageno: int = 1, + laparams: Optional[LAParams] = None, + stripcontrol: bool = False, + ): + PDFConverter.__init__( + self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams + ) + self.stripcontrol = stripcontrol + self.within_chars = False + self.write_header() + + def bbox_repr(self, bbox: Rect) -> str: + (in_x0, in_y0, in_x1, in_y1) = bbox + # PDF y-coordinates are the other way round from hOCR coordinates + out_x0 = int(in_x0) + out_y0 = int(self.page_bbox[3] - in_y1) + out_x1 = int(in_x1) + out_y1 = int(self.page_bbox[3] - in_y0) + return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}" + + def write(self, text: str) -> None: + if self.codec: + encoded_text = text.encode(self.codec) + cast(BinaryIO, self.outfp).write(encoded_text) + else: + cast(TextIO, self.outfp).write(text) + + def write_header(self) -> None: + if self.codec: + self.write( + "\n" % self.codec + ) + else: + self.write( + "\n" + ) + self.write("\n") + self.write("\n") + self.write( + "\n" + ) + self.write( + "\n" + ) + self.write( + " \n" + ) + self.write("\n") + self.write("\n") + + def write_footer(self) -> None: + self.write("\n") + self.write( + "\n" + ) + + def write_text(self, text: str) -> None: + if self.stripcontrol: + text = self.CONTROL.sub("", text) + self.write(text) + + def write_word(self) -> None: + if len(self.working_text) > 0: + bold_and_italic_styles = "" + if "Italic" in self.working_font: + bold_and_italic_styles = "font-style: italic; " + if "Bold" in self.working_font: + bold_and_italic_styles += "font-weight: bold; " + self.write( + "%s" + % ( + ( + self.working_font, + self.working_size, + bold_and_italic_styles, + self.bbox_repr(self.working_bbox), + self.working_font, + self.working_size, + self.working_text.strip(), + ) + ) + ) + self.within_chars = False + + def receive_layout(self, ltpage: LTPage) -> 
None: + def render(item: LTItem) -> None: + if self.within_chars and isinstance(item, LTAnno): + self.write_word() + if isinstance(item, LTPage): + self.page_bbox = item.bbox + self.write( + "
\n" + % (item.pageid, self.bbox_repr(item.bbox)) + ) + for child in item: + render(child) + self.write("
\n") + elif isinstance(item, LTTextLine): + self.write( + "" % ((self.bbox_repr(item.bbox))) + ) + for child_line in item: + render(child_line) + self.write("\n") + elif isinstance(item, LTTextBox): + self.write( + "
\n" + % (item.index, self.bbox_repr(item.bbox)) + ) + for child in item: + render(child) + self.write("
\n") + elif isinstance(item, LTChar): + if not self.within_chars: + self.within_chars = True + self.working_text = item.get_text() + self.working_bbox = item.bbox + self.working_font = item.fontname + self.working_size = item.size + else: + if len(item.get_text().strip()) == 0: + self.write_word() + self.write(item.get_text()) + else: + if ( + self.working_bbox[1] != item.bbox[1] + or self.working_font != item.fontname + or self.working_size != item.size + ): + self.write_word() + self.working_bbox = item.bbox + self.working_font = item.fontname + self.working_size = item.size + self.working_text += item.get_text() + self.working_bbox = ( + self.working_bbox[0], + self.working_bbox[1], + item.bbox[2], + self.working_bbox[3], + ) + + render(ltpage) + + def close(self) -> None: + self.write_footer() diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 29a985c..94be9d4 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -5,7 +5,13 @@ import sys from io import StringIO from typing import Any, BinaryIO, Container, Iterator, Optional, cast -from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator +from .converter import ( + XMLConverter, + HTMLConverter, + TextConverter, + PDFPageAggregator, + HOCRConverter, +) from .image import ImageWriter from .layout import LAParams, LTPage from .pdfdevice import PDFDevice, TagExtractor @@ -41,8 +47,8 @@ def extract_text_to_fp( :param inf: a file-like object to read PDF structure from, such as a file handler (using the builtin `open()` function) or a `BytesIO`. :param outfp: a file-like object to write the text to. - :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works - properly. + :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'. + Only 'text' works properly. :param codec: Text decoding codec :param laparams: An LAParams object from pdfminer.layout. Default is None but may not layout correctly. 
@@ -100,6 +106,11 @@ def extract_text_to_fp( imagewriter=imagewriter, ) + elif output_type == "hocr": + device = HOCRConverter( + rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control + ) + elif output_type == "tag": # Binary I/O is required, but we have no good way to test it here. device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 5e2b75a..abd5307 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -111,6 +111,12 @@ class TestPdf2Txt: def test_encryption_rc4_128(self): run("encryption/rc4-128.pdf", "-P foo") + def test_html_simple1(self): + run("simple1.pdf", "-t html") + + def test_hocr_simple1(self): + run("simple1.pdf", "-t hocr") + class TestDumpImages: @staticmethod