From 44074b42eaef5443c916151a142180879d3556df Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Sun, 22 Jun 2014 00:33:00 +0900 Subject: [PATCH] Added: stripcontrol for XMLConverter (-S option) --- pdfminer/converter.py | 8 +++++++- tools/pdf2txt.py | 9 ++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 463433c..e2c24e0 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import sys import logging +import re from pdfdevice import PDFTextDevice from pdffont import PDFUnicodeNotDefined from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve @@ -386,10 +387,13 @@ class HTMLConverter(PDFConverter): ## class XMLConverter(PDFConverter): + CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]') + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, - laparams=None, imagewriter=None): + laparams=None, imagewriter=None, stripcontrol=False): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.imagewriter = imagewriter + self.stripcontrol = stripcontrol self.write_header() return @@ -403,6 +407,8 @@ class XMLConverter(PDFConverter): return def write_text(self, text): + if self.stripcontrol: + text = self.CONTROL.sub(u'', text) self.outfp.write(enc(text, self.codec)) return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 27d85d5..5eb24bf 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -16,12 +16,12 @@ def main(argv): def usage(): print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]' ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]' - ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]' + ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]' ' [-t text|html|xml|tag] [-c codec] [-s scale]' ' file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:') + (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:') except getopt.GetoptError: return usage() if not args: return usage() @@ -36,6 +36,7 @@ def main(argv): outtype = None imagewriter = None rotation = 0 + stripcontrol = False layoutmode = 'normal' codec = 'utf-8' pageno = 1 @@ -60,6 +61,7 @@ def main(argv): elif k == '-Y': layoutmode = v elif k == '-O': imagewriter = ImageWriter(v) elif k == '-R': rotation = int(v) + elif k == '-S': stripcontrol = True elif k == '-t': outtype = v elif k == '-c': codec = v elif k == '-s': scale = float(v) @@ -88,7 +90,8 @@ def main(argv): imagewriter=imagewriter) elif outtype == 'xml': device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, - imagewriter=imagewriter) + imagewriter=imagewriter, + stripcontrol=stripcontrol) elif outtype == 'html': device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams,