Raise proper error when bad --output-type is used and fix formatting output of TagExtractor

* high_level: emit diagnostic for bad output_type

* TagExtractor: eliminate runtime error

This does not make it usable, but will satisfy my curiosity.

* Use if-elif-else structure

* Fix pycharm spacing warning

* Rename _write_outfp to _write

* Properly format tag names and tag values, using utils.make_compat_str() so that the tag value is always a string.

* Update CHANGELOG.md

* Fix flake8 errors

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pull/611/head^2
Mingye Wang 2021-09-01 02:46:20 +08:00 committed by GitHub
parent 7f54cefe02
commit 46fa21476a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 33 additions and 19 deletions

View File

@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Fix issue of some Chinese characters can not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593)) - Fix issue of some Chinese characters can not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535)) - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Beziér path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530)) - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Beziér path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
- Raising `UnboundLocalError` when a bad `--output-type` is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
- `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
## Removed ## Removed
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))

View File

@ -56,25 +56,33 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
imagewriter = ImageWriter(output_dir) imagewriter = ImageWriter(output_dir)
rsrcmgr = PDFResourceManager(caching=not disable_caching) rsrcmgr = PDFResourceManager(caching=not disable_caching)
device = None
if output_type != 'text' and outfp == sys.stdout:
outfp = sys.stdout.buffer
if output_type == 'text': if output_type == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter)
if outfp == sys.stdout: elif output_type == 'xml':
outfp = sys.stdout.buffer
if output_type == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter, imagewriter=imagewriter,
stripcontrol=strip_control) stripcontrol=strip_control)
elif output_type == 'html': elif output_type == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter)
elif output_type == 'tag': elif output_type == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else:
msg = f"Output type can be text, html, xml or tag but is " \
f"{output_type}"
raise ValueError(msg)
interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(inf, for page in PDFPage.get_pages(inf,
page_numbers, page_numbers,

View File

@ -154,29 +154,30 @@ class TagExtractor(PDFDevice):
char = font.to_unichr(cid) char = font.to_unichr(cid)
text += char text += char
except PDFUnicodeNotDefined: except PDFUnicodeNotDefined:
print(chars)
pass pass
self.outfp.write(utils.enc(text)) self._write(utils.enc(text))
return return
def begin_page(self, page, ctm): def begin_page(self, page, ctm):
output = '<page id="%s" bbox="%s" rotate="%d">' %\ output = '<page id="%s" bbox="%s" rotate="%d">' %\
(self.pageno, utils.bbox2str(page.mediabox), page.rotate) (self.pageno, utils.bbox2str(page.mediabox), page.rotate)
self.outfp.write(utils.make_compat_bytes(output)) self._write(output)
return return
def end_page(self, page): def end_page(self, page):
self.outfp.write(utils.make_compat_bytes('</page>\n')) self._write('</page>\n')
self.pageno += 1 self.pageno += 1
return return
def begin_tag(self, tag, props=None): def begin_tag(self, tag, props=None):
s = '' s = ''
if isinstance(props, dict): if isinstance(props, dict):
s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v))) s = ''.join([
for (k, v) in sorted(props.items())) ' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
for (k, v) in sorted(props.items())
])
out_s = '<{}{}>'.format(utils.enc(tag.name), s) out_s = '<{}{}>'.format(utils.enc(tag.name), s)
self.outfp.write(utils.make_compat_bytes(out_s)) self._write(out_s)
self._stack.append(tag) self._stack.append(tag)
return return
@ -184,10 +185,13 @@ class TagExtractor(PDFDevice):
assert self._stack, str(self.pageno) assert self._stack, str(self.pageno)
tag = self._stack.pop(-1) tag = self._stack.pop(-1)
out_s = '</%s>' % utils.enc(tag.name) out_s = '</%s>' % utils.enc(tag.name)
self.outfp.write(utils.make_compat_bytes(out_s)) self._write(out_s)
return return
def do_tag(self, tag, props=None): def do_tag(self, tag, props=None):
self.begin_tag(tag, props) self.begin_tag(tag, props)
self._stack.pop(-1) self._stack.pop(-1)
return return
def _write(self, s: str):
self.outfp.write(s.encode(self.codec))

View File

@ -46,13 +46,13 @@ def make_compat_bytes(in_str):
return in_str.encode() return in_str.encode()
def make_compat_str(in_str): def make_compat_str(o):
"""Converts to string, guessing encoding.""" """Converts everything to string, if bytes guessing the encoding."""
assert isinstance(in_str, (bytes, str)), str(type(in_str)) if isinstance(o, bytes):
if isinstance(in_str, bytes): enc = chardet.detect(o)
enc = chardet.detect(in_str) return o.decode(enc['encoding'])
in_str = in_str.decode(enc['encoding']) else:
return in_str return str(o)
def shorten_str(s, size): def shorten_str(s, size):