Added: stripcontrol for XMLConverter (-S option)

pull/1/head
Yusuke Shinyama 2014-06-22 00:33:00 +09:00
parent 81391c09f4
commit 44074b42ea
2 changed files with 13 additions and 4 deletions

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys
import logging import logging
import re
from pdfdevice import PDFTextDevice from pdfdevice import PDFTextDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
@@ -386,10 +387,13 @@ class HTMLConverter(PDFConverter):
## ##
class XMLConverter(PDFConverter): class XMLConverter(PDFConverter):
CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None): laparams=None, imagewriter=None, stripcontrol=False):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.imagewriter = imagewriter self.imagewriter = imagewriter
self.stripcontrol = stripcontrol
self.write_header() self.write_header()
return return
@@ -403,6 +407,8 @@ class XMLConverter(PDFConverter):
return return
def write_text(self, text): def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub(u'', text)
self.outfp.write(enc(text, self.codec)) self.outfp.write(enc(text, self.codec))
return return

View File

@@ -16,12 +16,12 @@ def main(argv):
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]' print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]' ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]' ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
' [-t text|html|xml|tag] [-c codec] [-s scale]' ' [-t text|html|xml|tag] [-c codec] [-s scale]'
' file ...' % argv[0]) ' file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:') (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@@ -36,6 +36,7 @@ def main(argv):
outtype = None outtype = None
imagewriter = None imagewriter = None
rotation = 0 rotation = 0
stripcontrol = False
layoutmode = 'normal' layoutmode = 'normal'
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
@@ -60,6 +61,7 @@ def main(argv):
elif k == '-Y': layoutmode = v elif k == '-Y': layoutmode = v
elif k == '-O': imagewriter = ImageWriter(v) elif k == '-O': imagewriter = ImageWriter(v)
elif k == '-R': rotation = int(v) elif k == '-R': rotation = int(v)
elif k == '-S': stripcontrol = True
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
@@ -88,7 +90,8 @@ def main(argv):
imagewriter=imagewriter) imagewriter=imagewriter)
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter,
stripcontrol=stripcontrol)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, layoutmode=layoutmode, laparams=laparams,