htmlconverter improved
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@274 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
fb4ce96309
commit
7374b81383
|
@ -9,7 +9,7 @@
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun Oct 17 09:25:27 UTC 2010
|
||||
Last Modified: Sun Nov 14 15:03:59 UTC 2010
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -265,6 +265,14 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
|
|||
<dd> Forces layout analysis to be performed on all the text strings,
|
||||
including texts contained in figures.
|
||||
<p>
|
||||
<dt> <code>-Y <em>layout_mode</em></code>
|
||||
<dd> Specifies how the page layout should be preserved. (Currently only applies to HTML format.)
|
||||
<ul>
|
||||
<li> <code>exact</code> : preserve the exact location of each individual character (produces large and messy HTML).
|
||||
<li> <code>normal</code> : preserve the location and line breaks in each text block. (Default)
|
||||
<li> <code>loose</code> : preserve the overall location of each text block.
|
||||
</ul>
|
||||
<p>
|
||||
<dt> <code>-s <em>scale</em></code>
|
||||
<dd> Specifies the output scale. Can be used in HTML format only.
|
||||
<p>
|
||||
|
|
|
@ -210,26 +210,37 @@ class HTMLConverter(PDFConverter):
|
|||
}
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
             scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, pagemargin=50,
             outdir=None):
    """Set up the HTML converter and write the document header.

    rsrcmgr, outfp, codec, pageno and laparams are forwarded unchanged to
    PDFConverter.__init__.

    scale      -- multiplier applied to positions and font sizes in the output.
    fontscale  -- extra multiplier applied to font sizes only.
    layoutmode -- 'exact', 'normal' or 'loose'; selects how much of the
                  original layout the rendering code preserves.
    showpageno -- stored on the instance for the rendering code.
    pagemargin -- initial vertical offset, also added after every page.
    outdir     -- stored on the instance for the rendering code.
    """
    PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
    self.scale = scale
    # Honour the caller's value instead of a hard-coded 0.7.
    self.fontscale = fontscale
    self.layoutmode = layoutmode
    self.showpageno = showpageno
    self.pagemargin = pagemargin
    self.outdir = outdir
    # Running vertical offset; starts one margin down from the top.
    self.yoffset = self.pagemargin
    self._font = None
    self._fontstack = []
    # The header is emitted exactly once, through write_header().
    self.write_header()
    return
|
||||
|
||||
def write(self, text):
    """Send ``text`` to the output stream as-is (no encoding is applied here)."""
    sink = self.outfp
    sink.write(text)
|
||||
|
||||
def write_header(self):
    """Emit the opening HTML boilerplate, declaring ``self.codec`` as the charset."""
    meta_tag = '<meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec
    for chunk in ('<html><head>\n', meta_tag, '</head><body>\n'):
        self.write(chunk)
|
||||
|
||||
def write_footer(self):
    """Emit the page-index bar and close the HTML document.

    The index links to anchors ``#1`` .. ``#(self.pageno - 1)``; when
    ``self.pageno`` is 1 the list of links is empty.
    """
    # range() iterates the same way on Python 2 and 3; xrange() exists only on 2.
    links = ', '.join('<a href="#%s">%s</a>' % (i, i)
                      for i in range(1, self.pageno))
    self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % links)
    self.write('</body></html>\n')
    return
|
||||
|
||||
def write_text(self, text):
    """Pass ``text`` through ``enc`` with ``self.codec`` and write the result."""
    encoded = enc(text, self.codec)
    self.write(encoded)
|
||||
|
@ -258,7 +269,7 @@ class HTMLConverter(PDFConverter):
|
|||
color = self.TEXT_COLORS.get(color)
|
||||
if color is not None:
|
||||
self.write('<span style="position:absolute; color:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
||||
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
|
||||
(color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale*self.fontscale))
|
||||
self.write_text(text)
|
||||
self.write('</span>\n')
|
||||
return
|
||||
|
@ -284,6 +295,10 @@ class HTMLConverter(PDFConverter):
|
|||
self.write_text(text)
|
||||
return
|
||||
|
||||
def put_newline(self):
    """Write an HTML line break (``<br>``) to the output."""
    line_break = '<br>'
    self.write(line_break)
|
||||
|
||||
def end_textbox(self, color):
|
||||
if self._font is not None:
|
||||
self.write('</span>')
|
||||
|
@ -311,7 +326,7 @@ class HTMLConverter(PDFConverter):
|
|||
elif isinstance(item, LTImage):
|
||||
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
|
||||
else:
|
||||
if self.exact:
|
||||
if self.layoutmode == 'exact':
|
||||
if isinstance(item, LTTextLine):
|
||||
self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
|
||||
for child in item:
|
||||
|
@ -328,7 +343,8 @@ class HTMLConverter(PDFConverter):
|
|||
if isinstance(item, LTTextLine):
|
||||
for child in item:
|
||||
render(child)
|
||||
self.write('<br>')
|
||||
if self.layoutmode != 'loose':
|
||||
self.put_newline()
|
||||
elif isinstance(item, LTTextBox):
|
||||
self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
|
||||
item.get_writing_mode())
|
||||
|
@ -349,13 +365,11 @@ class HTMLConverter(PDFConverter):
|
|||
show_layout(child)
|
||||
return
|
||||
show_layout(ltpage.layout)
|
||||
self.yoffset += self.pagepad
|
||||
self.yoffset += self.pagemargin
|
||||
return
|
||||
|
||||
def close(self):
    """Finish the HTML document by writing the footer exactly once.

    All footer markup is produced by write_footer(); nothing is written
    inline here, so the page index and closing tags are not duplicated.
    """
    self.write_footer()
    return
|
||||
|
||||
|
||||
|
@ -366,10 +380,18 @@ class XMLConverter(PDFConverter):
|
|||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
    """Initialise the base converter, remember ``outdir`` and write the XML header."""
    PDFConverter.__init__(
        self, rsrcmgr, outfp,
        codec=codec, pageno=pageno, laparams=laparams)
    self.outdir = outdir
    # The header goes out as soon as the converter is constructed.
    self.write_header()
|
||||
|
||||
def write_header(self):
    """Write the XML declaration and open the root ``<pages>`` element.

    The declared encoding is ``self.codec``.
    """
    # Must be self.codec: a bare ``codec`` is not defined in this method
    # and would raise NameError when the converter is constructed.
    self.outfp.write('<?xml version="1.0" encoding="%s" ?>\n' % self.codec)
    self.outfp.write('<pages>\n')
    return
|
||||
|
||||
def write_footer(self):
    """Close the root ``<pages>`` element on the output stream."""
    closing_tag = '</pages>\n'
    self.outfp.write(closing_tag)
|
||||
|
||||
def write_text(self, text):
    """Pass ``text`` through ``enc`` with ``self.codec`` and write it to ``self.outfp``."""
    encoded = enc(text, self.codec)
    self.outfp.write(encoded)
|
||||
|
@ -445,5 +467,5 @@ class XMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def close(self):
    """Finish the XML document by writing the footer exactly once.

    The closing ``</pages>`` tag comes from write_footer(); it is not also
    written inline here, which would close the root element twice.
    """
    self.write_footer()
    return
|
||||
|
|
|
@ -12,11 +12,11 @@ def main(argv):
|
|||
import getopt
|
||||
def usage():
|
||||
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
|
||||
'[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
|
||||
'[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-Y layout_mode] '
|
||||
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:Y:O:t:c:s:')
|
||||
except getopt.GetoptError:
|
||||
return usage()
|
||||
if not args: return usage()
|
||||
|
@ -30,6 +30,7 @@ def main(argv):
|
|||
outfile = None
|
||||
outtype = None
|
||||
outdir = None
|
||||
layoutmode = 'normal'
|
||||
codec = 'utf-8'
|
||||
pageno = 1
|
||||
scale = 1
|
||||
|
@ -43,10 +44,10 @@ def main(argv):
|
|||
elif k == '-o': outfile = v
|
||||
elif k == '-n': laparams = None
|
||||
elif k == '-A': laparams.all_texts = True
|
||||
elif k == '-D': laparams.writing_mode = v
|
||||
elif k == '-M': laparams.char_margin = float(v)
|
||||
elif k == '-L': laparams.line_margin = float(v)
|
||||
elif k == '-W': laparams.word_margin = float(v)
|
||||
elif k == '-Y': layoutmode = v
|
||||
elif k == '-O': outdir = v
|
||||
elif k == '-t': outtype = v
|
||||
elif k == '-c': codec = v
|
||||
|
@ -78,7 +79,8 @@ def main(argv):
|
|||
elif outtype == 'xml':
|
||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||
layoutmode=layoutmode, laparams=laparams, outdir=outdir)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue