diff --git a/CHANGELOG.md b/CHANGELOG.md
index d0702a1..5a488d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,12 @@
# Changelog
-
All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
+- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
+
### Fixed
- 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
diff --git a/README.md b/README.md
index e211c43..b8c2542 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,7 @@ Features
* Written entirely in Python.
* Parse, analyze, and convert PDF documents.
+* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
* PDF-1.7 specification support. (well, almost).
* CJK languages and vertical writing scripts support.
* Various font types (Type1, TrueType, Type3, and CID) support.
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 3da2fcb..6b367aa 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -19,6 +19,7 @@ from pdfminer.pdfcolor import PDFColorSpace
from . import utils
from .image import ImageWriter
from .layout import LAParams, LTComponent, TextGroupElement
+from .layout import LTAnno
from .layout import LTChar
from .layout import LTContainer
from .layout import LTCurve
@@ -821,3 +822,179 @@ class XMLConverter(PDFConverter[AnyIO]):
def close(self) -> None:
self.write_footer()
return
+
+
+class HOCRConverter(PDFConverter[AnyIO]):
+ """Extract an hOCR representation from explicit text information within a PDF."""
+
+ # Where text is being extracted from a variety of types of PDF within a
+ # business process, those PDFs where the text is only present in image
+ # form will need to be analysed using an OCR tool which will typically
+ # output hOCR. This converter extracts the explicit text information from
+ # those PDFs that do have it and uses it to generate a basic hOCR
+ # representation that is designed to be used in conjunction with the image
+ # of the PDF in the same way as genuine OCR output would be, but without the
+ # inevitable OCR errors.
+
+ # The converter does not handle images, diagrams or text colors.
+
+ # In the examples processed by the contributor it was necessary to set
+ # LAParams.all_texts to True.
+
+ CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")
+
+ def __init__(
+ self,
+ rsrcmgr: PDFResourceManager,
+ outfp: AnyIO,
+ codec: str = "utf8",
+ pageno: int = 1,
+ laparams: Optional[LAParams] = None,
+ stripcontrol: bool = False,
+ ):
+ PDFConverter.__init__(
+ self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
+ )
+ self.stripcontrol = stripcontrol
+ self.within_chars = False
+ self.write_header()
+
+ def bbox_repr(self, bbox: Rect) -> str:
+ (in_x0, in_y0, in_x1, in_y1) = bbox
+ # PDF y-coordinates are the other way round from hOCR coordinates
+ out_x0 = int(in_x0)
+ out_y0 = int(self.page_bbox[3] - in_y1)
+ out_x1 = int(in_x1)
+ out_y1 = int(self.page_bbox[3] - in_y0)
+ return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"
+
+ def write(self, text: str) -> None:
+ if self.codec:
+ encoded_text = text.encode(self.codec)
+ cast(BinaryIO, self.outfp).write(encoded_text)
+ else:
+ cast(TextIO, self.outfp).write(text)
+
+ def write_header(self) -> None:
+ if self.codec:
+ self.write(
+ "\n" % self.codec
+ )
+ else:
+ self.write(
+ "\n"
+ )
+ self.write("
\n")
+ self.write("\n")
+ self.write(
+ "\n"
+ )
+ self.write(
+ "\n"
+ )
+ self.write(
+ " \n"
+ )
+ self.write("\n")
+ self.write("\n")
+
+ def write_footer(self) -> None:
+ self.write("\n")
+ self.write(
+ "\n"
+ )
+
+ def write_text(self, text: str) -> None:
+ if self.stripcontrol:
+ text = self.CONTROL.sub("", text)
+ self.write(text)
+
+ def write_word(self) -> None:
+ if len(self.working_text) > 0:
+ bold_and_italic_styles = ""
+ if "Italic" in self.working_font:
+ bold_and_italic_styles = "font-style: italic; "
+ if "Bold" in self.working_font:
+ bold_and_italic_styles += "font-weight: bold; "
+ self.write(
+ "%s"
+ % (
+ (
+ self.working_font,
+ self.working_size,
+ bold_and_italic_styles,
+ self.bbox_repr(self.working_bbox),
+ self.working_font,
+ self.working_size,
+ self.working_text.strip(),
+ )
+ )
+ )
+ self.within_chars = False
+
+ def receive_layout(self, ltpage: LTPage) -> None:
+ def render(item: LTItem) -> None:
+ if self.within_chars and isinstance(item, LTAnno):
+ self.write_word()
+ if isinstance(item, LTPage):
+ self.page_bbox = item.bbox
+ self.write(
+ "\n"
+ % (item.pageid, self.bbox_repr(item.bbox))
+ )
+ for child in item:
+ render(child)
+ self.write("
\n")
+ elif isinstance(item, LTTextLine):
+ self.write(
+ "" % ((self.bbox_repr(item.bbox)))
+ )
+ for child_line in item:
+ render(child_line)
+ self.write("\n")
+ elif isinstance(item, LTTextBox):
+ self.write(
+ "\n"
+ % (item.index, self.bbox_repr(item.bbox))
+ )
+ for child in item:
+ render(child)
+ self.write("
\n")
+ elif isinstance(item, LTChar):
+ if not self.within_chars:
+ self.within_chars = True
+ self.working_text = item.get_text()
+ self.working_bbox = item.bbox
+ self.working_font = item.fontname
+ self.working_size = item.size
+ else:
+ if len(item.get_text().strip()) == 0:
+ self.write_word()
+ self.write(item.get_text())
+ else:
+ if (
+ self.working_bbox[1] != item.bbox[1]
+ or self.working_font != item.fontname
+ or self.working_size != item.size
+ ):
+ self.write_word()
+ self.working_bbox = item.bbox
+ self.working_font = item.fontname
+ self.working_size = item.size
+ self.working_text += item.get_text()
+ self.working_bbox = (
+ self.working_bbox[0],
+ self.working_bbox[1],
+ item.bbox[2],
+ self.working_bbox[3],
+ )
+
+ render(ltpage)
+
+ def close(self) -> None:
+ self.write_footer()
diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
index 29a985c..94be9d4 100644
--- a/pdfminer/high_level.py
+++ b/pdfminer/high_level.py
@@ -5,7 +5,13 @@ import sys
from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
-from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
+from .converter import (
+ XMLConverter,
+ HTMLConverter,
+ TextConverter,
+ PDFPageAggregator,
+ HOCRConverter,
+)
from .image import ImageWriter
from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor
@@ -41,8 +47,8 @@ def extract_text_to_fp(
:param inf: a file-like object to read PDF structure from, such as a
file handler (using the builtin `open()` function) or a `BytesIO`.
:param outfp: a file-like object to write the text to.
- :param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
- properly.
+ :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
+ Only 'text' works properly.
:param codec: Text decoding codec
:param laparams: An LAParams object from pdfminer.layout. Default is None
but may not layout correctly.
@@ -100,6 +106,11 @@ def extract_text_to_fp(
imagewriter=imagewriter,
)
+ elif output_type == "hocr":
+ device = HOCRConverter(
+ rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
+ )
+
elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index 5e2b75a..abd5307 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -111,6 +111,12 @@ class TestPdf2Txt:
def test_encryption_rc4_128(self):
run("encryption/rc4-128.pdf", "-P foo")
+ def test_html_simple1(self):
+ run("simple1.pdf", "-t html")
+
+ def test_hocr_simple1(self):
+ run("simple1.pdf", "-t hocr")
+
class TestDumpImages:
@staticmethod