Added: stripcontrol option for XMLConverter (-S flag), which strips control characters from the XML text output
parent
81391c09f4
commit
44074b42ea
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from pdfdevice import PDFTextDevice
|
from pdfdevice import PDFTextDevice
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
|
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
|
||||||
|
@ -386,10 +387,13 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class XMLConverter(PDFConverter):
|
class XMLConverter(PDFConverter):
|
||||||
|
|
||||||
|
CONTROL = re.compile(ur'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||||
laparams=None, imagewriter=None):
|
laparams=None, imagewriter=None, stripcontrol=False):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.imagewriter = imagewriter
|
self.imagewriter = imagewriter
|
||||||
|
self.stripcontrol = stripcontrol
|
||||||
self.write_header()
|
self.write_header()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -403,6 +407,8 @@ class XMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
|
if self.stripcontrol:
|
||||||
|
text = self.CONTROL.sub(u'', text)
|
||||||
self.outfp.write(enc(text, self.codec))
|
self.outfp.write(enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
@ -16,12 +16,12 @@ def main(argv):
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
|
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
|
||||||
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
|
' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
|
||||||
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
|
' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
|
||||||
' [-t text|html|xml|tag] [-c codec] [-s scale]'
|
' [-t text|html|xml|tag] [-c codec] [-s scale]'
|
||||||
' file ...' % argv[0])
|
' file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -36,6 +36,7 @@ def main(argv):
|
||||||
outtype = None
|
outtype = None
|
||||||
imagewriter = None
|
imagewriter = None
|
||||||
rotation = 0
|
rotation = 0
|
||||||
|
stripcontrol = False
|
||||||
layoutmode = 'normal'
|
layoutmode = 'normal'
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
pageno = 1
|
pageno = 1
|
||||||
|
@ -60,6 +61,7 @@ def main(argv):
|
||||||
elif k == '-Y': layoutmode = v
|
elif k == '-Y': layoutmode = v
|
||||||
elif k == '-O': imagewriter = ImageWriter(v)
|
elif k == '-O': imagewriter = ImageWriter(v)
|
||||||
elif k == '-R': rotation = int(v)
|
elif k == '-R': rotation = int(v)
|
||||||
|
elif k == '-S': stripcontrol = True
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
|
@ -88,7 +90,8 @@ def main(argv):
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter)
|
||||||
elif outtype == 'xml':
|
elif outtype == 'xml':
|
||||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter,
|
||||||
|
stripcontrol=stripcontrol)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||||
layoutmode=layoutmode, laparams=laparams,
|
layoutmode=layoutmode, laparams=laparams,
|
||||||
|
|
Loading…
Reference in New Issue