htmlconverter improved

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@274 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-11-14 15:04:28 +00:00
parent fb4ce96309
commit 7374b81383
3 changed files with 57 additions and 25 deletions

View File

@ -9,7 +9,7 @@
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Oct 17 09:25:27 UTC 2010 Last Modified: Sun Nov 14 15:03:59 UTC 2010
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -265,6 +265,14 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
<dd> Forces to perform layout analysis for all the text strings, <dd> Forces to perform layout analysis for all the text strings,
including texts contained in figures. including texts contained in figures.
<p> <p>
<dt> <code>-Y <em>layout_mode</em></code>
<dd> Specifies how the page layout should be preserved. (Currently only applies to HTML format.)
<ul>
<li> <code>exact</code> : preserve the exact location of each individual character (this produces large and messy HTML output).
<li> <code>normal</code> : preserve the location and line breaks in each text block. (Default)
<li> <code>loose</code> : preserve the overall location of each text block.
</ul>
<p>
<dt> <code>-s <em>scale</em></code> <dt> <code>-s <em>scale</em></code>
<dd> Specifies the output scale. Can be used in HTML format only. <dd> Specifies the output scale. Can be used in HTML format only.
<p> <p>

View File

@ -210,26 +210,37 @@ class HTMLConverter(PDFConverter):
} }
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, exact=False, showpageno=True, pagepad=50, outdir=None): scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, pagemargin=50,
outdir=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.exact = exact
self.showpageno = showpageno
self.pagepad = pagepad
self.outdir = outdir
self.scale = scale self.scale = scale
self.fontscale = 0.7 self.fontscale = fontscale
self.write('<html><head>\n') self.layoutmode = layoutmode
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec) self.showpageno = showpageno
self.write('</head><body>\n') self.pagemargin = pagemargin
self.yoffset = self.pagepad self.outdir = outdir
self.yoffset = self.pagemargin
self._font = None self._font = None
self._fontstack = [] self._fontstack = []
self.write_header()
return return
def write(self, text): def write(self, text):
self.outfp.write(text) self.outfp.write(text)
return return
def write_header(self):
self.write('<html><head>\n')
self.write('<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
self.write('</head><body>\n')
return
def write_footer(self):
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
self.write('</body></html>\n')
return
def write_text(self, text): def write_text(self, text):
self.write(enc(text, self.codec)) self.write(enc(text, self.codec))
return return
@ -258,7 +269,7 @@ class HTMLConverter(PDFConverter):
color = self.TEXT_COLORS.get(color) color = self.TEXT_COLORS.get(color)
if color is not None: if color is not None:
self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' % self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale*self.fontscale))
self.write_text(text) self.write_text(text)
self.write('</span>\n') self.write('</span>\n')
return return
@ -284,6 +295,10 @@ class HTMLConverter(PDFConverter):
self.write_text(text) self.write_text(text)
return return
def put_newline(self):
self.write('<br>')
return
def end_textbox(self, color): def end_textbox(self, color):
if self._font is not None: if self._font is not None:
self.write('</span>') self.write('</span>')
@ -311,7 +326,7 @@ class HTMLConverter(PDFConverter):
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
self.place_image(item, 1, item.x0, item.y1, item.width, item.height) self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
else: else:
if self.exact: if self.layoutmode == 'exact':
if isinstance(item, LTTextLine): if isinstance(item, LTTextLine):
self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height) self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
for child in item: for child in item:
@ -328,7 +343,8 @@ class HTMLConverter(PDFConverter):
if isinstance(item, LTTextLine): if isinstance(item, LTTextLine):
for child in item: for child in item:
render(child) render(child)
self.write('<br>') if self.layoutmode != 'loose':
self.put_newline()
elif isinstance(item, LTTextBox): elif isinstance(item, LTTextBox):
self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height, self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
item.get_writing_mode()) item.get_writing_mode())
@ -349,13 +365,11 @@ class HTMLConverter(PDFConverter):
show_layout(child) show_layout(child)
return return
show_layout(ltpage.layout) show_layout(ltpage.layout)
self.yoffset += self.pagepad self.yoffset += self.pagemargin
return return
def close(self): def close(self):
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % self.write_footer()
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(1,self.pageno)))
self.write('</body></html>\n')
return return
@ -366,10 +380,18 @@ class XMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outdir = outdir self.outdir = outdir
self.write_header()
return
def write_header(self):
self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec) self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % codec)
self.outfp.write('<pages>\n') self.outfp.write('<pages>\n')
return return
def write_footer(self):
self.outfp.write('</pages>\n')
return
def write_text(self, text): def write_text(self, text):
self.outfp.write(enc(text, self.codec)) self.outfp.write(enc(text, self.codec))
return return
@ -445,5 +467,5 @@ class XMLConverter(PDFConverter):
return return
def close(self): def close(self):
self.outfp.write('</pages>\n') self.write_footer()
return return

View File

@ -12,11 +12,11 @@ def main(argv):
import getopt import getopt
def usage(): def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] ' print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
'[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] ' '[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-Y layout_mode] '
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:') (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:Y:O:t:c:s:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
@ -30,6 +30,7 @@ def main(argv):
outfile = None outfile = None
outtype = None outtype = None
outdir = None outdir = None
layoutmode = 'normal'
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
scale = 1 scale = 1
@ -43,10 +44,10 @@ def main(argv):
elif k == '-o': outfile = v elif k == '-o': outfile = v
elif k == '-n': laparams = None elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True elif k == '-A': laparams.all_texts = True
elif k == '-D': laparams.writing_mode = v
elif k == '-M': laparams.char_margin = float(v) elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v) elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v) elif k == '-W': laparams.word_margin = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': outdir = v elif k == '-O': outdir = v
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v elif k == '-c': codec = v
@ -78,7 +79,8 @@ def main(argv):
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir) device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir) device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, outdir=outdir)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: else: