Add HOCRConverter (fixes #650) (#651)

* Add HOCRConverter

* Add line to README.md

* Test cicd

* Test cicd 2

* Changes based on review comments

* Remove whitespace changes to CHANGELOG.md

* Remove duplicated html output

* Add link to hocr wiki

* Add tests for extracting hocr and html

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/790/head
Richard Hudson 2022-08-14 10:52:50 +01:00 committed by GitHub
parent f79ad56f48
commit 77df431871
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 200 additions and 4 deletions

View File

@ -1,11 +1,12 @@
# Changelog # Changelog
All notable changes in pdfminer.six will be documented in this file. All notable changes in pdfminer.six will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [Unreleased] ## [Unreleased]
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
### Fixed ### Fixed
- 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773)) - 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))

View File

@ -23,6 +23,7 @@ Features
* Written entirely in Python. * Written entirely in Python.
* Parse, analyze, and convert PDF documents. * Parse, analyze, and convert PDF documents.
* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
* PDF-1.7 specification support. (well, almost). * PDF-1.7 specification support. (well, almost).
* CJK languages and vertical writing scripts support. * CJK languages and vertical writing scripts support.
* Various font types (Type1, TrueType, Type3, and CID) support. * Various font types (Type1, TrueType, Type3, and CID) support.

View File

@ -19,6 +19,7 @@ from pdfminer.pdfcolor import PDFColorSpace
from . import utils from . import utils
from .image import ImageWriter from .image import ImageWriter
from .layout import LAParams, LTComponent, TextGroupElement from .layout import LAParams, LTComponent, TextGroupElement
from .layout import LTAnno
from .layout import LTChar from .layout import LTChar
from .layout import LTContainer from .layout import LTContainer
from .layout import LTCurve from .layout import LTCurve
@ -821,3 +822,179 @@ class XMLConverter(PDFConverter[AnyIO]):
def close(self) -> None: def close(self) -> None:
self.write_footer() self.write_footer()
return return
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF.

    The converter walks the analysed layout of each page and emits nested
    ``ocr_page`` / ``ocr_block`` / ``ocr_line`` / ``ocrx_word`` elements whose
    ``title`` attributes carry bounding boxes in hOCR orientation (origin at
    the top of the page).
    """

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from the output when ``stripcontrol`` is set
    # (tab, newline and carriage return are deliberately not matched).
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ):
        """Set up the converter and immediately write the hOCR document header.

        :param rsrcmgr: resource manager forwarded to ``PDFConverter``.
        :param outfp: output stream; written as bytes when ``codec`` is truthy,
            as text otherwise.
        :param codec: encoding applied to everything written to ``outfp``.
        :param pageno: forwarded to ``PDFConverter``.
        :param laparams: layout analysis parameters forwarded to
            ``PDFConverter``.
        :param stripcontrol: when true, characters matching ``CONTROL`` are
            removed from the emitted text.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
        )
        self.stripcontrol = stripcontrol
        # State of the word currently being accumulated from LTChar items.
        self.within_chars = False
        self.working_text = ""
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.working_font = ""
        self.working_size = 0.0
        # Bounding box of the page being rendered; set by receive_layout.
        self.page_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    @staticmethod
    def _escape(text: str) -> str:
        """Return ``text`` escaped for use in HTML content and attributes."""
        # Local import so the class does not rely on a module-level import
        # that may not exist in the enclosing file.
        from html import escape

        return escape(text, quote=True)

    def bbox_repr(self, bbox: Rect) -> str:
        """Format ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` property.

        Coordinates are truncated to integers and the y axis is flipped
        against the top edge of the current page (``self.page_bbox[3]``).
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        out_x0 = int(in_x0)
        out_y0 = int(self.page_bbox[3] - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(self.page_bbox[3] - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write ``text`` to the output, encoding it first when a codec is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>``, the ``<head>`` metadata and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n"
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): the declared charset is fixed to utf-8 even when a
        # different ``codec`` is used to encode the output - confirm intent.
        self.write(
            "<meta http-equiv='Content-Type' " "content='text/html;charset=utf-8' />\n"
        )
        self.write(
            "<meta name='ocr-system' " "content='pdfminer.six HOCR Converter' />\n"
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n"
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write a commented-out hocrjs debug script and close the document."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'>" "</script--></body></html>\n"
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, removing control characters if ``stripcontrol`` is set."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the accumulated word as an ``ocrx_word`` span and end the word.

        The word text and the font name are HTML-escaped so that characters
        such as ``<``, ``&`` or quotes coming from the PDF cannot break the
        markup, and ``stripcontrol`` is honoured for the word text.
        """
        if self.working_text:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "
            word = self.working_text
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            font = self._escape(self.working_font)
            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    font,
                    self.working_size,
                    self._escape(word.strip()),
                )
            )
        self.within_chars = False

    def _start_word(self, item: LTChar) -> None:
        """Begin accumulating a new word whose first character is ``item``."""
        self.within_chars = True
        self.working_text = item.get_text()
        self.working_bbox = item.bbox
        self.working_font = item.fontname
        self.working_size = item.size

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page as nested hOCR elements."""

        def render(item: LTItem) -> None:
            # An LTAnno following characters terminates the word in progress.
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % ((self.bbox_repr(item.bbox)))
                )
                for child_line in item:
                    render(child_line)
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                char_text = item.get_text()
                if not self.within_chars:
                    self._start_word(item)
                elif len(char_text.strip()) == 0:
                    # Whitespace ends the word and is passed through between
                    # the word spans.
                    self.write_word()
                    self.write_text(char_text)
                elif (
                    self.working_bbox[1] != item.bbox[1]
                    or self.working_font != item.fontname
                    or self.working_size != item.size
                ):
                    # Baseline, font or size changed: flush the finished word
                    # and start a fresh one with this character. Restarting
                    # (rather than appending to the stale text with
                    # within_chars left False) keeps this character from
                    # being dropped.
                    self.write_word()
                    self._start_word(item)
                else:
                    self.working_text += char_text
                    # Extend the word's box to this character's right edge.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()

View File

@ -5,7 +5,13 @@ import sys
from io import StringIO from io import StringIO
from typing import Any, BinaryIO, Container, Iterator, Optional, cast from typing import Any, BinaryIO, Container, Iterator, Optional, cast
from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator from .converter import (
XMLConverter,
HTMLConverter,
TextConverter,
PDFPageAggregator,
HOCRConverter,
)
from .image import ImageWriter from .image import ImageWriter
from .layout import LAParams, LTPage from .layout import LAParams, LTPage
from .pdfdevice import PDFDevice, TagExtractor from .pdfdevice import PDFDevice, TagExtractor
@ -41,8 +47,8 @@ def extract_text_to_fp(
:param inf: a file-like object to read PDF structure from, such as a :param inf: a file-like object to read PDF structure from, such as a
file handler (using the builtin `open()` function) or a `BytesIO`. file handler (using the builtin `open()` function) or a `BytesIO`.
:param outfp: a file-like object to write the text to. :param outfp: a file-like object to write the text to.
:param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works :param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
properly. Only 'text' works properly.
:param codec: Text decoding codec :param codec: Text decoding codec
:param laparams: An LAParams object from pdfminer.layout. Default is None :param laparams: An LAParams object from pdfminer.layout. Default is None
but may not layout correctly. but may not layout correctly.
@ -100,6 +106,11 @@ def extract_text_to_fp(
imagewriter=imagewriter, imagewriter=imagewriter,
) )
elif output_type == "hocr":
device = HOCRConverter(
rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
)
elif output_type == "tag": elif output_type == "tag":
# Binary I/O is required, but we have no good way to test it here. # Binary I/O is required, but we have no good way to test it here.
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec) device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)

View File

@ -111,6 +111,12 @@ class TestPdf2Txt:
def test_encryption_rc4_128(self): def test_encryption_rc4_128(self):
run("encryption/rc4-128.pdf", "-P foo") run("encryption/rc4-128.pdf", "-P foo")
def test_html_simple1(self):
    """Run the tool on simple1.pdf with the html output type selected."""
    sample = "simple1.pdf"
    options = "-t html"
    run(sample, options)
def test_hocr_simple1(self):
    """Run the tool on simple1.pdf with the hocr output type selected."""
    sample = "simple1.pdf"
    options = "-t hocr"
    run(sample, options)
class TestDumpImages: class TestDumpImages:
@staticmethod @staticmethod