htmlconverter improved
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@274 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
fb4ce96309
commit
7374b81383
|
@ -9,7 +9,7 @@
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sun Oct 17 09:25:27 UTC 2010
|
Last Modified: Sun Nov 14 15:03:59 UTC 2010
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -265,6 +265,14 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
|
||||||
<dd> Forces layout analysis to be performed on all text strings,
|
<dd> Forces layout analysis to be performed on all text strings,
|
||||||
including text contained in figures.
|
including text contained in figures.
|
||||||
<p>
|
<p>
|
||||||
|
<dt> <code>-Y <em>layout_mode</em></code>
|
||||||
|
<dd> Specifies how the page layout should be preserved. (Currently only applies to HTML format.)
|
||||||
|
<ul>
|
||||||
|
<li> <code>exact</code> : preserve the exact location of each individual character (produces a large and messy HTML file).
|
||||||
|
<li> <code>normal</code> : preserve the location and line breaks in each text block. (Default)
|
||||||
|
<li> <code>loose</code> : preserve the overall location of each text block.
|
||||||
|
</ul>
|
||||||
|
<p>
|
||||||
<dt> <code>-s <em>scale</em></code>
|
<dt> <code>-s <em>scale</em></code>
|
||||||
<dd> Specifies the output scale. Can be used in HTML format only.
|
<dd> Specifies the output scale. Can be used in HTML format only.
|
||||||
<p>
|
<p>
|
||||||
|
|
|
@ -209,27 +209,38 @@ class HTMLConverter(PDFConverter):
|
||||||
'char': 'black',
|
'char': 'black',
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
scale=1, exact=False, showpageno=True, pagepad=50, outdir=None):
|
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, pagemargin=50,
|
||||||
|
outdir=None):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.exact = exact
|
|
||||||
self.showpageno = showpageno
|
|
||||||
self.pagepad = pagepad
|
|
||||||
self.outdir = outdir
|
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.fontscale = 0.7
|
self.fontscale = fontscale
|
||||||
self.write('<html><head>\n')
|
self.layoutmode = layoutmode
|
||||||
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
|
self.showpageno = showpageno
|
||||||
self.write('</head><body>\n')
|
self.pagemargin = pagemargin
|
||||||
self.yoffset = self.pagepad
|
self.outdir = outdir
|
||||||
|
self.yoffset = self.pagemargin
|
||||||
self._font = None
|
self._font = None
|
||||||
self._fontstack = []
|
self._fontstack = []
|
||||||
|
self.write_header()
|
||||||
return
|
return
|
||||||
|
|
||||||
def write(self, text):
|
def write(self, text):
|
||||||
self.outfp.write(text)
|
self.outfp.write(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write_header(self):
|
||||||
|
self.write('<html><head>\n')
|
||||||
|
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
|
||||||
|
self.write('</head><body>\n')
|
||||||
|
return
|
||||||
|
|
||||||
|
def write_footer(self):
|
||||||
|
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
||||||
|
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
||||||
|
self.write('</body></html>\n')
|
||||||
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
self.write(enc(text, self.codec))
|
self.write(enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
@ -258,7 +269,7 @@ class HTMLConverter(PDFConverter):
|
||||||
color = self.TEXT_COLORS.get(color)
|
color = self.TEXT_COLORS.get(color)
|
||||||
if color is not None:
|
if color is not None:
|
||||||
self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||||
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
|
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale*self.fontscale))
|
||||||
self.write_text(text)
|
self.write_text(text)
|
||||||
self.write('</span>\n')
|
self.write('</span>\n')
|
||||||
return
|
return
|
||||||
|
@ -282,7 +293,11 @@ class HTMLConverter(PDFConverter):
|
||||||
(fontname, fontsize * self.scale * self.fontscale))
|
(fontname, fontsize * self.scale * self.fontscale))
|
||||||
self._font = font
|
self._font = font
|
||||||
self.write_text(text)
|
self.write_text(text)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def put_newline(self):
|
||||||
|
self.write('<br>')
|
||||||
|
return
|
||||||
|
|
||||||
def end_textbox(self, color):
|
def end_textbox(self, color):
|
||||||
if self._font is not None:
|
if self._font is not None:
|
||||||
|
@ -311,7 +326,7 @@ class HTMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
|
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
|
||||||
else:
|
else:
|
||||||
if self.exact:
|
if self.layoutmode == 'exact':
|
||||||
if isinstance(item, LTTextLine):
|
if isinstance(item, LTTextLine):
|
||||||
self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
|
self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
|
@ -328,7 +343,8 @@ class HTMLConverter(PDFConverter):
|
||||||
if isinstance(item, LTTextLine):
|
if isinstance(item, LTTextLine):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
self.write('<br>')
|
if self.layoutmode != 'loose':
|
||||||
|
self.put_newline()
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
|
self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
|
||||||
item.get_writing_mode())
|
item.get_writing_mode())
|
||||||
|
@ -349,13 +365,11 @@ class HTMLConverter(PDFConverter):
|
||||||
show_layout(child)
|
show_layout(child)
|
||||||
return
|
return
|
||||||
show_layout(ltpage.layout)
|
show_layout(ltpage.layout)
|
||||||
self.yoffset += self.pagepad
|
self.yoffset += self.pagemargin
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
self.write_footer()
|
||||||
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
|
|
||||||
self.write('</body></html>\n')
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@ -366,10 +380,18 @@ class XMLConverter(PDFConverter):
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.outdir = outdir
|
self.outdir = outdir
|
||||||
|
self.write_header()
|
||||||
|
return
|
||||||
|
|
||||||
|
def write_header(self):
|
||||||
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
|
||||||
self.outfp.write('<pages>\n')
|
self.outfp.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write_footer(self):
|
||||||
|
self.outfp.write('</pages>\n')
|
||||||
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
self.outfp.write(enc(text, self.codec))
|
self.outfp.write(enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
@ -445,5 +467,5 @@ class XMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.outfp.write('</pages>\n')
|
self.write_footer()
|
||||||
return
|
return
|
||||||
|
|
|
@ -12,11 +12,11 @@ def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
||||||
'[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
|
'[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-Y layout_mode] '
|
||||||
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
|
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:Y:O:t:c:s:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
|
@ -30,6 +30,7 @@ def main(argv):
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
outdir = None
|
outdir = None
|
||||||
|
layoutmode = 'normal'
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
pageno = 1
|
pageno = 1
|
||||||
scale = 1
|
scale = 1
|
||||||
|
@ -43,10 +44,10 @@ def main(argv):
|
||||||
elif k == '-o': outfile = v
|
elif k == '-o': outfile = v
|
||||||
elif k == '-n': laparams = None
|
elif k == '-n': laparams = None
|
||||||
elif k == '-A': laparams.all_texts = True
|
elif k == '-A': laparams.all_texts = True
|
||||||
elif k == '-D': laparams.writing_mode = v
|
|
||||||
elif k == '-M': laparams.char_margin = float(v)
|
elif k == '-M': laparams.char_margin = float(v)
|
||||||
elif k == '-L': laparams.line_margin = float(v)
|
elif k == '-L': laparams.line_margin = float(v)
|
||||||
elif k == '-W': laparams.word_margin = float(v)
|
elif k == '-W': laparams.word_margin = float(v)
|
||||||
|
elif k == '-Y': layoutmode = v
|
||||||
elif k == '-O': outdir = v
|
elif k == '-O': outdir = v
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
|
@ -78,7 +79,8 @@ def main(argv):
|
||||||
elif outtype == 'xml':
|
elif outtype == 'xml':
|
||||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
|
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||||
|
layoutmode=layoutmode, laparams=laparams, outdir=outdir)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue