From 46fa21476ad916e6c882697660e0cd5e4d4f6ec4 Mon Sep 17 00:00:00 2001 From: Mingye Wang Date: Wed, 1 Sep 2021 02:46:20 +0800 Subject: [PATCH] Raise proper error when bad --output-type is used and fix formatting output of TagExtractor * high_level: emit diagnostic for bad output_type * TagExtractor: eliminate runtime error This does not make it usable, but will satisfy my curiosity. * Use if-elif-else structure * Fix pycharm spacing warning * Rename _write_outfp to _write * Properly format tag names and tag values. Using utils.make_compat_str() such that the tag value is always a string. * Update CHANGELOG.md * Fix flake8 errors Co-authored-by: Pieter Marsman --- CHANGELOG.md | 2 ++ pdfminer/high_level.py | 16 ++++++++++++---- pdfminer/pdfdevice.py | 20 ++++++++++++-------- pdfminer/utils.py | 14 +++++++------- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c55ccb..8cb40ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- Fix issue of some Chinese characters can not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593)) - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535)) - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Beziér path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530)) +- Raising `UnboundLocalError` when a bad `--output-type` is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610)) +- `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610)) ## Removed - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522)) diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 96911ec..33f661c 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -56,25 +56,33 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8', imagewriter = ImageWriter(output_dir) rsrcmgr = PDFResourceManager(caching=not disable_caching) + device = None + + if output_type != 'text' and outfp == sys.stdout: + outfp = sys.stdout.buffer if output_type == 'text': device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter) - if outfp == sys.stdout: - outfp = sys.stdout.buffer - - if output_type == 'xml': + elif output_type == 'xml': device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter, stripcontrol=strip_control) + elif output_type == 'html': device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, imagewriter=imagewriter) + elif output_type == 'tag': device = TagExtractor(rsrcmgr, outfp, codec=codec) + else: + msg = f"Output type can be text, html, xml or tag but is " \ + f"{output_type}" + raise ValueError(msg) + interpreter = 
PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.get_pages(inf, page_numbers, diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 1f66a31..82ede76 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -154,29 +154,30 @@ class TagExtractor(PDFDevice): char = font.to_unichr(cid) text += char except PDFUnicodeNotDefined: - print(chars) pass - self.outfp.write(utils.enc(text)) + self._write(utils.enc(text)) return def begin_page(self, page, ctm): output = '<page id="%s" bbox="%s" rotate="%d">' %\ (self.pageno, utils.bbox2str(page.mediabox), page.rotate) - self.outfp.write(utils.make_compat_bytes(output)) + self._write(output) return def end_page(self, page): - self.outfp.write(utils.make_compat_bytes('</page>\n')) + self._write('</page>\n') self.pageno += 1 return def begin_tag(self, tag, props=None): s = '' if isinstance(props, dict): - s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v))) - for (k, v) in sorted(props.items())) + s = ''.join([ + ' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v)) + for (k, v) in sorted(props.items()) + ]) out_s = '<{}{}>'.format(utils.enc(tag.name), s) - self.outfp.write(utils.make_compat_bytes(out_s)) + self._write(out_s) self._stack.append(tag) return @@ -184,10 +185,13 @@ class TagExtractor(PDFDevice): assert self._stack, str(self.pageno) tag = self._stack.pop(-1) out_s = '</%s>' % utils.enc(tag.name) - self.outfp.write(utils.make_compat_bytes(out_s)) + self._write(out_s) return def do_tag(self, tag, props=None): self.begin_tag(tag, props) self._stack.pop(-1) return + + def _write(self, s: str): + self.outfp.write(s.encode(self.codec)) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index d6507ef..4aabb52 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -46,13 +46,13 @@ def make_compat_bytes(in_str): return in_str.encode() -def make_compat_str(in_str): - """Converts to string, guessing encoding.""" - assert isinstance(in_str, (bytes, str)), str(type(in_str)) - if isinstance(in_str, bytes): - enc = chardet.detect(in_str) - 
in_str = in_str.decode(enc['encoding']) - return in_str +def make_compat_str(o): + """Converts everything to string, if bytes guessing the encoding.""" + if isinstance(o, bytes): + enc = chardet.detect(o) + return o.decode(enc['encoding']) + else: + return str(o) def shorten_str(s, size):