* Add HOCRConverter * Add line to README.md * Test cicd * Test cicd 2 * Changes based on review comments * Remove whitespace changes to CHANGELOG.md * Remove duplicated html output * Add link to hocr wiki * Add tests for extracting hocr and html Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>pull/790/head
parent
f79ad56f48
commit
77df431871
|
@ -1,11 +1,12 @@
|
|||
# Changelog
|
||||
|
||||
All notable changes in pdfminer.six will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
|
||||
|
||||
### Fixed
|
||||
|
||||
- 'ValueError': when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
|
||||
|
|
|
@ -23,6 +23,7 @@ Features
|
|||
|
||||
* Written entirely in Python.
|
||||
* Parse, analyze, and convert PDF documents.
|
||||
* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
|
||||
* PDF-1.7 specification support (well, almost).
|
||||
* CJK languages and vertical writing scripts support.
|
||||
* Various font types (Type1, TrueType, Type3, and CID) support.
|
||||
|
|
|
@ -19,6 +19,7 @@ from pdfminer.pdfcolor import PDFColorSpace
|
|||
from . import utils
|
||||
from .image import ImageWriter
|
||||
from .layout import LAParams, LTComponent, TextGroupElement
|
||||
from .layout import LTAnno
|
||||
from .layout import LTChar
|
||||
from .layout import LTContainer
|
||||
from .layout import LTCurve
|
||||
|
@ -821,3 +822,179 @@ class XMLConverter(PDFConverter[AnyIO]):
|
|||
def close(self) -> None:
    """Finish the output by writing the document footer."""
    self.write_footer()
|
||||
|
||||
|
||||
class HOCRConverter(PDFConverter[AnyIO]):
    """Extract an hOCR representation from explicit text information within a PDF.

    The output is an XHTML document made of ``ocr_page`` divs containing
    ``ocr_block`` divs, ``ocr_line`` spans and ``ocrx_word`` spans. Every
    element carries a ``bbox`` in its ``title`` attribute; words additionally
    carry the font name and size.
    """

    # Where text is being extracted from a variety of types of PDF within a
    # business process, those PDFs where the text is only present in image
    # form will need to be analysed using an OCR tool which will typically
    # output hOCR. This converter extracts the explicit text information from
    # those PDFs that do have it and uses it to generate a basic hOCR
    # representation that is designed to be used in conjunction with the image
    # of the PDF in the same way as genuine OCR output would be, but without the
    # inevitable OCR errors.

    # The converter does not handle images, diagrams or text colors.

    # In the examples processed by the contributor it was necessary to set
    # LAParams.all_texts to True.

    # Control characters removed from word text when ``stripcontrol`` is set.
    CONTROL = re.compile(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]")

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
        outfp: AnyIO,
        codec: str = "utf8",
        pageno: int = 1,
        laparams: Optional[LAParams] = None,
        stripcontrol: bool = False,
    ):
        """Set up the converter and immediately write the document header.

        :param rsrcmgr: resource manager, forwarded to ``PDFConverter``.
        :param outfp: output stream. It receives encoded bytes when ``codec``
            is truthy and ``str`` otherwise (see :meth:`write`).
        :param codec: codec used to encode the output; also echoed in the
            ``<html>`` tag written by :meth:`write_header`.
        :param pageno: forwarded to ``PDFConverter``.
        :param laparams: layout parameters, forwarded to ``PDFConverter``.
        :param stripcontrol: when true, control characters matched by
            ``CONTROL`` are removed from the emitted text.
        """
        PDFConverter.__init__(
            self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams
        )
        self.stripcontrol = stripcontrol
        # State of the word currently being accumulated from LTChar items.
        self.within_chars = False
        self.working_text = ""
        self.working_font = ""
        self.working_size: float = 0
        self.working_bbox: Rect = (0, 0, 0, 0)
        self.write_header()

    def bbox_repr(self, bbox: Rect) -> str:
        """Return ``bbox`` as an hOCR ``bbox x0 y0 x1 y1`` string.

        Coordinates are truncated to integers and the y axis is flipped
        against the top of the current page (``self.page_bbox[3]``), which is
        set by :meth:`receive_layout` when a page is entered.
        """
        (in_x0, in_y0, in_x1, in_y1) = bbox
        # PDF y-coordinates are the other way round from hOCR coordinates
        page_top = self.page_bbox[3]
        out_x0 = int(in_x0)
        out_y0 = int(page_top - in_y1)
        out_x1 = int(in_x1)
        out_y1 = int(page_top - in_y0)
        return f"bbox {out_x0} {out_y0} {out_x1} {out_y1}"

    def write(self, text: str) -> None:
        """Write ``text`` to the output, encoding it first when a codec is set."""
        if self.codec:
            encoded_text = text.encode(self.codec)
            cast(BinaryIO, self.outfp).write(encoded_text)
        else:
            cast(TextIO, self.outfp).write(text)

    def write_header(self) -> None:
        """Write the opening ``<html>`` tag, the ``<head>`` and ``<body>``."""
        if self.codec:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en' charset='%s'>\n" % self.codec
            )
        else:
            self.write(
                "<html xmlns='http://www.w3.org/1999/xhtml' "
                "xml:lang='en' lang='en'>\n"
            )
        self.write("<head>\n")
        self.write("<title></title>\n")
        # NOTE(review): this meta tag always declares utf-8, even when a
        # different ``codec`` is used to encode the output -- confirm whether
        # it should follow ``self.codec``.
        self.write(
            "<meta http-equiv='Content-Type' " "content='text/html;charset=utf-8' />\n"
        )
        self.write(
            "<meta name='ocr-system' " "content='pdfminer.six HOCR Converter' />\n"
        )
        self.write(
            " <meta name='ocr-capabilities'"
            " content='ocr_page ocr_block ocr_line ocrx_word'/>\n"
        )
        self.write("</head>\n")
        self.write("<body>\n")

    def write_footer(self) -> None:
        """Write the (commented-out) hocrjs debug script and close the document."""
        self.write("<!-- comment in the following line to debug -->\n")
        self.write(
            "<!--script src='https://unpkg.com/hocrjs'>" "</script--></body></html>\n"
        )

    def write_text(self, text: str) -> None:
        """Write ``text``, removing control characters when ``stripcontrol`` is set."""
        if self.stripcontrol:
            text = self.CONTROL.sub("", text)
        self.write(text)

    def write_word(self) -> None:
        """Emit the accumulated word as an ``ocrx_word`` span and end the word.

        Nothing is written when no text has been accumulated. The word text
        and the font name are HTML-escaped so that characters such as ``<``,
        ``&`` or quotes cannot break the generated markup, and control
        characters are removed from the text when ``stripcontrol`` is set.
        """
        from html import escape

        if len(self.working_text) > 0:
            bold_and_italic_styles = ""
            if "Italic" in self.working_font:
                bold_and_italic_styles = "font-style: italic; "
            if "Bold" in self.working_font:
                bold_and_italic_styles += "font-weight: bold; "

            word = self.working_text.strip()
            if self.stripcontrol:
                word = self.CONTROL.sub("", word)
            # The font name ends up inside attribute values, so quotes must
            # be escaped as well; the word itself is element content.
            font = escape(self.working_font, quote=True)
            word = escape(word, quote=False)

            self.write(
                "<span style='font:\"%s\"; font-size:%d; %s' "
                "class='ocrx_word' title='%s; x_font %s; "
                "x_fsize %d'>%s</span>"
                % (
                    font,
                    self.working_size,
                    bold_and_italic_styles,
                    self.bbox_repr(self.working_bbox),
                    font,
                    self.working_size,
                    word,
                )
            )
        self.within_chars = False

    def receive_layout(self, ltpage: LTPage) -> None:
        """Render one analysed page as hOCR.

        Pages, text boxes and text lines become nested elements; consecutive
        ``LTChar`` items are grouped into words. A word ends at an ``LTAnno``,
        at a whitespace character, or when the baseline, font name or font
        size changes.
        """

        def render(item: LTItem) -> None:
            if self.within_chars and isinstance(item, LTAnno):
                self.write_word()
            if isinstance(item, LTPage):
                self.page_bbox = item.bbox
                self.write(
                    "<div class='ocr_page' id='%s' title='%s'>\n"
                    % (item.pageid, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                # Flush a word still pending at the end of the page so it is
                # neither lost nor emitted after the closing tag.
                if self.within_chars:
                    self.write_word()
                self.write("</div>\n")
            elif isinstance(item, LTTextLine):
                self.write(
                    "<span class='ocr_line' title='%s'>" % ((self.bbox_repr(item.bbox)))
                )
                for child_line in item:
                    render(child_line)
                # Keep every word inside the line it belongs to.
                if self.within_chars:
                    self.write_word()
                self.write("</span>\n")
            elif isinstance(item, LTTextBox):
                self.write(
                    "<div class='ocr_block' id='%d' title='%s'>\n"
                    % (item.index, self.bbox_repr(item.bbox))
                )
                for child in item:
                    render(child)
                self.write("</div>\n")
            elif isinstance(item, LTChar):
                char_text = item.get_text()
                if not self.within_chars:
                    # First character of a new word.
                    self.within_chars = True
                    self.working_text = char_text
                    self.working_bbox = item.bbox
                    self.working_font = item.fontname
                    self.working_size = item.size
                elif len(char_text.strip()) == 0:
                    # Whitespace terminates the word and is passed through.
                    self.write_word()
                    self.write(char_text)
                else:
                    if (
                        self.working_bbox[1] != item.bbox[1]
                        or self.working_font != item.fontname
                        or self.working_size != item.size
                    ):
                        # Style or baseline changed: flush the current word
                        # and start a new one with this character.
                        # write_word() clears within_chars and keeps the old
                        # text, so both are reset here; otherwise this
                        # character would be dropped.
                        self.write_word()
                        self.within_chars = True
                        self.working_text = ""
                        self.working_bbox = item.bbox
                        self.working_font = item.fontname
                        self.working_size = item.size
                    self.working_text += char_text
                    # Extend the word's right edge to include this character.
                    self.working_bbox = (
                        self.working_bbox[0],
                        self.working_bbox[1],
                        item.bbox[2],
                        self.working_bbox[3],
                    )

        render(ltpage)

    def close(self) -> None:
        """Finish the document by writing the footer."""
        self.write_footer()
|
||||
|
|
|
@ -5,7 +5,13 @@ import sys
|
|||
from io import StringIO
|
||||
from typing import Any, BinaryIO, Container, Iterator, Optional, cast
|
||||
|
||||
from .converter import XMLConverter, HTMLConverter, TextConverter, PDFPageAggregator
|
||||
from .converter import (
|
||||
XMLConverter,
|
||||
HTMLConverter,
|
||||
TextConverter,
|
||||
PDFPageAggregator,
|
||||
HOCRConverter,
|
||||
)
|
||||
from .image import ImageWriter
|
||||
from .layout import LAParams, LTPage
|
||||
from .pdfdevice import PDFDevice, TagExtractor
|
||||
|
@ -41,8 +47,8 @@ def extract_text_to_fp(
|
|||
:param inf: a file-like object to read PDF structure from, such as a
|
||||
file handler (using the builtin `open()` function) or a `BytesIO`.
|
||||
:param outfp: a file-like object to write the text to.
|
||||
:param output_type: May be 'text', 'xml', 'html', 'tag'. Only 'text' works
|
||||
properly.
|
||||
:param output_type: May be 'text', 'xml', 'html', 'hocr', 'tag'.
|
||||
Only 'text' works properly.
|
||||
:param codec: Text decoding codec
|
||||
:param laparams: An LAParams object from pdfminer.layout. Default is None
|
||||
but may not layout correctly.
|
||||
|
@ -100,6 +106,11 @@ def extract_text_to_fp(
|
|||
imagewriter=imagewriter,
|
||||
)
|
||||
|
||||
elif output_type == "hocr":
|
||||
device = HOCRConverter(
|
||||
rsrcmgr, outfp, codec=codec, laparams=laparams, stripcontrol=strip_control
|
||||
)
|
||||
|
||||
elif output_type == "tag":
|
||||
# Binary I/O is required, but we have no good way to test it here.
|
||||
device = TagExtractor(rsrcmgr, cast(BinaryIO, outfp), codec=codec)
|
||||
|
|
|
@ -111,6 +111,12 @@ class TestPdf2Txt:
|
|||
def test_encryption_rc4_128(self):
    """Run the tool on ``encryption/rc4-128.pdf`` with the ``-P foo`` option."""
    run("encryption/rc4-128.pdf", "-P foo")
|
||||
|
||||
def test_html_simple1(self):
    """Run the tool on ``simple1.pdf`` requesting ``html`` output."""
    run("simple1.pdf", "-t html")
|
||||
|
||||
def test_hocr_simple1(self):
    """Run the tool on ``simple1.pdf`` requesting ``hocr`` output."""
    run("simple1.pdf", "-t hocr")
|
||||
|
||||
|
||||
class TestDumpImages:
|
||||
@staticmethod
|
||||
|
|
Loading…
Reference in New Issue