Raise proper error when bad --output-type is used and fix formatting output of TagExtractor
* high_level: emit diagnostic for bad output_type * TagExtractor: eliminate runtime error This does not make it usable, but will satisfy my curiosity. * Use if-elif-else structure * Fix pycharm spacing warning * Rename _write_outfp to _write * Properly format tag names and tag values. Using utils.make_compat_str() such that the tag value is always a string. * Update CHANGELOG.md * Fix flake8 errors Co-authored-by: Pieter Marsman <pietermarsman@gmail.com> pull/611/head^2
parent
7f54cefe02
commit
46fa21476a
|
@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||||
- Fix issue where some Chinese characters cannot be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
|
- Fix issue where some Chinese characters cannot be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
|
||||||
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
|
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
|
||||||
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
|
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
|
||||||
|
- Raising `UnboundLocalError` when a bad `--output-type` is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
|
||||||
|
- `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
|
||||||
|
|
||||||
## Removed
|
## Removed
|
||||||
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
|
- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
|
||||||
|
|
|
@ -56,25 +56,33 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
|
||||||
imagewriter = ImageWriter(output_dir)
|
imagewriter = ImageWriter(output_dir)
|
||||||
|
|
||||||
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
rsrcmgr = PDFResourceManager(caching=not disable_caching)
|
||||||
|
device = None
|
||||||
|
|
||||||
|
if output_type != 'text' and outfp == sys.stdout:
|
||||||
|
outfp = sys.stdout.buffer
|
||||||
|
|
||||||
if output_type == 'text':
|
if output_type == 'text':
|
||||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter)
|
||||||
|
|
||||||
if outfp == sys.stdout:
|
elif output_type == 'xml':
|
||||||
outfp = sys.stdout.buffer
|
|
||||||
|
|
||||||
if output_type == 'xml':
|
|
||||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||||
imagewriter=imagewriter,
|
imagewriter=imagewriter,
|
||||||
stripcontrol=strip_control)
|
stripcontrol=strip_control)
|
||||||
|
|
||||||
elif output_type == 'html':
|
elif output_type == 'html':
|
||||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||||
layoutmode=layoutmode, laparams=laparams,
|
layoutmode=layoutmode, laparams=laparams,
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter)
|
||||||
|
|
||||||
elif output_type == 'tag':
|
elif output_type == 'tag':
|
||||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||||
|
|
||||||
|
else:
|
||||||
|
msg = f"Output type can be text, html, xml or tag but is " \
|
||||||
|
f"{output_type}"
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||||
for page in PDFPage.get_pages(inf,
|
for page in PDFPage.get_pages(inf,
|
||||||
page_numbers,
|
page_numbers,
|
||||||
|
|
|
@ -154,29 +154,30 @@ class TagExtractor(PDFDevice):
|
||||||
char = font.to_unichr(cid)
|
char = font.to_unichr(cid)
|
||||||
text += char
|
text += char
|
||||||
except PDFUnicodeNotDefined:
|
except PDFUnicodeNotDefined:
|
||||||
print(chars)
|
|
||||||
pass
|
pass
|
||||||
self.outfp.write(utils.enc(text))
|
self._write(utils.enc(text))
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page, ctm):
|
def begin_page(self, page, ctm):
|
||||||
output = '<page id="%s" bbox="%s" rotate="%d">' %\
|
output = '<page id="%s" bbox="%s" rotate="%d">' %\
|
||||||
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
(self.pageno, utils.bbox2str(page.mediabox), page.rotate)
|
||||||
self.outfp.write(utils.make_compat_bytes(output))
|
self._write(output)
|
||||||
return
|
return
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
self.outfp.write(utils.make_compat_bytes('</page>\n'))
|
self._write('</page>\n')
|
||||||
self.pageno += 1
|
self.pageno += 1
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_tag(self, tag, props=None):
|
def begin_tag(self, tag, props=None):
|
||||||
s = ''
|
s = ''
|
||||||
if isinstance(props, dict):
|
if isinstance(props, dict):
|
||||||
s = ''.join(' {}="{}"'.format(utils.enc(k), utils.enc(str(v)))
|
s = ''.join([
|
||||||
for (k, v) in sorted(props.items()))
|
' {}="{}"'.format(utils.enc(k), utils.make_compat_str(v))
|
||||||
|
for (k, v) in sorted(props.items())
|
||||||
|
])
|
||||||
out_s = '<{}{}>'.format(utils.enc(tag.name), s)
|
out_s = '<{}{}>'.format(utils.enc(tag.name), s)
|
||||||
self.outfp.write(utils.make_compat_bytes(out_s))
|
self._write(out_s)
|
||||||
self._stack.append(tag)
|
self._stack.append(tag)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -184,10 +185,13 @@ class TagExtractor(PDFDevice):
|
||||||
assert self._stack, str(self.pageno)
|
assert self._stack, str(self.pageno)
|
||||||
tag = self._stack.pop(-1)
|
tag = self._stack.pop(-1)
|
||||||
out_s = '</%s>' % utils.enc(tag.name)
|
out_s = '</%s>' % utils.enc(tag.name)
|
||||||
self.outfp.write(utils.make_compat_bytes(out_s))
|
self._write(out_s)
|
||||||
return
|
return
|
||||||
|
|
||||||
def do_tag(self, tag, props=None):
|
def do_tag(self, tag, props=None):
|
||||||
self.begin_tag(tag, props)
|
self.begin_tag(tag, props)
|
||||||
self._stack.pop(-1)
|
self._stack.pop(-1)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def _write(self, s: str):
|
||||||
|
self.outfp.write(s.encode(self.codec))
|
||||||
|
|
|
@ -46,13 +46,13 @@ def make_compat_bytes(in_str):
|
||||||
return in_str.encode()
|
return in_str.encode()
|
||||||
|
|
||||||
|
|
||||||
def make_compat_str(in_str):
|
def make_compat_str(o):
|
||||||
"""Converts to string, guessing encoding."""
|
"""Converts everything to string, if bytes guessing the encoding."""
|
||||||
assert isinstance(in_str, (bytes, str)), str(type(in_str))
|
if isinstance(o, bytes):
|
||||||
if isinstance(in_str, bytes):
|
enc = chardet.detect(o)
|
||||||
enc = chardet.detect(in_str)
|
return o.decode(enc['encoding'])
|
||||||
in_str = in_str.decode(enc['encoding'])
|
else:
|
||||||
return in_str
|
return str(o)
|
||||||
|
|
||||||
|
|
||||||
def shorten_str(s, size):
|
def shorten_str(s, size):
|
||||||
|
|
Loading…
Reference in New Issue