Add HOCRConverter (fixes #650) (#651)

* Add HOCRConverter

* Add line to README.md

* Test cicd

* Test cicd 2

* Changes based on review comments

* Remove whitespace changes to CHANGELOG.md

* Remove duplicated html output

* Add link to hocr wiki

* Add tests for extracting hocr and html

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/790/head
Richard Hudson 2022-08-14 10:52:50 +01:00 committed by GitHub
parent f79ad56f48
commit 77df431871
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 200 additions and 4 deletions

View File

@ -1,11 +1,12 @@
# Changelog
All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased]
### Added
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
### Fixed
- `ValueError` when BMP images with a 1-bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))

View File

@ -23,6 +23,7 @@ Features
* Written entirely in Python.
* Parse, analyze, and convert PDF documents.
* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
* PDF-1.7 specification support (well, almost).
* CJK languages and vertical writing scripts support.
* Various font types (Type1, TrueType, Type3, and CID) support.

View File

@ -19,6 +19,7 @@ from pdfminer.pdfcolor import PDFColorSpace
from . import utils
from .image import ImageWriter
from .layout import LAParams, LTComponent, TextGroupElement
from .layout import LTAnno
from .layout import LTChar
from .layout import LTContainer
from .layout import LTCurve
@ -821,3 +822,179 @@ class XMLConverter(PDFConverter[AnyIO]):
def close(self) -> None:
    """Finish the output by emitting the footer."""
    self.write_footer()
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF."""

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters that are removed from the output when `stripcontrol`
    # is enabled (tab, line feed and carriage return are kept).
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ):
        """Set up the converter and immediately write the hOCR header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: output stream; written as bytes when ``codec`` is
            truthy, as text otherwise.
        :param codec: encoding used for the output; a falsy value means that
            text is written to ``outfp`` unencoded.
        :param pageno: forwarded to ``PDFConverter``.
        :param laparams: forwarded to ``PDFConverter``.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from the extracted text.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
        )
        self.stripcontrol = stripcontrol
        # State of the word currently being accumulated from LTChar items.
        self.within_chars = False
        self.working_text = ""
        self.working_font = ""
        self.working_size = 0.0
        self.working_bbox: Rect = (0.0, 0.0, 0.0, 0.0)
        # Bounding box of the page being rendered; set in receive_layout and
        # needed by bbox_repr to flip the y axis.
        self.page_bbox: Rect = (0.0, 0.0, 0.0, 0.0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` property string.

        Coordinates are truncated to integers and the y values are measured
        from the top edge of the current page (``self.page_bbox[3]``).
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write ``text`` to the output, encoding it first if a codec is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>`` tag, the ``<head>`` and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n"
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        self.write(
            "<meta http-equiv='Content-Type' " "content='text/html;charset=utf-8' />\n"
        )
        self.write(
            "<meta name='ocr-system' " "content='pdfminer.six HOCR Converter' />\n"
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n"
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the commented-out debug script and close body and html."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'>" "</script--></body></html>\n"
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, removing control characters if ``stripcontrol``."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the accumulated word as an ``ocrx_word`` span and reset state.

        The word text and the font name are HTML-escaped so that characters
        such as ``<``, ``&`` or quotes cannot break the generated markup.
        Nothing is written when no text has been accumulated.
        """
        import html

        if len(self.working_text) > 0:
            text = self.working_text
            if self.stripcontrol:
                # Strip control characters before trimming so that whitespace
                # exposed by their removal is trimmed as well.
                text = self.CONTROL.sub("", text)
            font = html.escape(self.working_font)
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    font,
                    self.working_size,
                    html.escape(text.strip()),
                )
            )
        # Clear the buffer so a word can never be emitted twice.
        self.working_text = ""
        self.within_chars = False

    def _begin_word(self, item: LTChar) -> None:
        """Start accumulating a new word whose first glyph is ``item``."""
        self.within_chars = True
        self.working_text = item.get_text()
        self.working_bbox = item.bbox
        self.working_font = item.fontname
        self.working_size = item.size

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page as hOCR.

        Pages become ``ocr_page`` divs, text boxes ``ocr_block`` divs, text
        lines ``ocr_line`` spans, and runs of characters sharing baseline,
        font and size become ``ocrx_word`` spans.
        """

        def render(item: LTItem) -> None:
            # An LTAnno terminates the word currently being built.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % ((self.bbox_repr(item.bbox)))
                )
                for child_line in item:
                    render(child_line)
                # Flush a word that was not terminated inside the line so it
                # cannot leak into the following line's span.
                if self.within_chars:
                    self.write_word()
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                char_text = item.get_text()
                if not self.within_chars:
                    self._begin_word(item)
                elif len(char_text.strip()) == 0:
                    # Whitespace ends the word and is passed through as-is
                    # (subject to control-character stripping).
                    self.write_word()
                    self.write_text(char_text)
                elif (
                    self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # Baseline, font or size changed: close the current word
                    # and start a new one with this glyph, so the glyph that
                    # triggered the break is not lost.
                    self.write_word()
                    self._begin_word(item)
                else:
                    self.working_text += char_text
                    # Extend the word's box to the right edge of this glyph.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()

View File

@ -5,7 +5,13 @@ import sys
from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
from .converter import (
XMLConverter,
HTMLConverter,
TextConverter,
PDFPageAggregator,
HOCRConverter,
)
from .image import ImageWriter
from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor
@ -41,8 +47,8 @@ def extract_text_to_fp(
:param inf: a file-like object to read PDF structure from, such as a
file handler (using the builtin `open()` function) or a `BytesIO`.
:param outfp: a file-like object to write the text to.
:param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
properly.
:param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
Only 'text' works properly.
:param codec: Text decoding codec
:param laparams: An LAParams object from pdfminer.layout. Default is None
but may not layout correctly.
@ -100,6 +106,11 @@ def extract_text_to_fp(
imagewriter=imagewriter,
)
elif output_type == "hocr":
device = HOCRConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
)
elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

View File

@ -111,6 +111,12 @@ class TestPdf2Txt:
def test_encryption_rc4_128(self):
    """Call the shared ``run`` helper on the RC4-128 sample with ``-P foo``."""
    sample = "encryption/rc4-128.pdf"
    options = "-P foo"
    run(sample, options)
def test_html_simple1(self):
    """Call the shared ``run`` helper on simple1.pdf with ``-t html``."""
    sample = "simple1.pdf"
    options = "-t html"
    run(sample, options)
def test_hocr_simple1(self):
    """Call the shared ``run`` helper on simple1.pdf with ``-t hocr``."""
    sample = "simple1.pdf"
    options = "-t hocr"
    run(sample, options)
class TestDumpImages:
@staticmethod