diff --git a/docs/index.html b/docs/index.html
index 3615953..9098a6c 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -9,7 +9,7 @@
-Last Modified: Sun Oct 17 09:25:27 UTC 2010
+Last Modified: Sun Nov 14 15:03:59 UTC 2010
@@ -265,6 +265,14 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
Specifies the output scale. Can be used in HTML format only.
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index 46043a3..a5476e6 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -209,27 +209,38 @@ class HTMLConverter(PDFConverter):
'char': 'black',
}
- def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
- scale=1, exact=False, showpageno=True, pagepad=50, outdir=None):
+ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
+ scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, pagemargin=50,
+ outdir=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
- self.exact = exact
- self.showpageno = showpageno
- self.pagepad = pagepad
- self.outdir = outdir
self.scale = scale
- self.fontscale = 0.7
- self.write('
\n')
- self.write('\n' % self.codec)
- self.write('\n')
- self.yoffset = self.pagepad
+ self.fontscale = fontscale
+ self.layoutmode = layoutmode
+ self.showpageno = showpageno
+ self.pagemargin = pagemargin
+ self.outdir = outdir
+ self.yoffset = self.pagemargin
self._font = None
self._fontstack = []
+ self.write_header()
return
def write(self, text):
self.outfp.write(text)
return
+ def write_header(self):
+ self.write('\n')
+ self.write('\n' % self.codec)
+ self.write('\n')
+ return
+
+ def write_footer(self):
+ self.write('Page: %s
\n' %
+ ', '.join('%s' % (i,i) for i in xrange(1,self.pageno)))
+ self.write('\n')
+ return
+
def write_text(self, text):
self.write(enc(text, self.codec))
return
@@ -258,7 +269,7 @@ class HTMLConverter(PDFConverter):
color = self.TEXT_COLORS.get(color)
if color is not None:
self.write('' %
- (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale))
+ (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale*self.fontscale))
self.write_text(text)
self.write('\n')
return
@@ -282,7 +293,11 @@ class HTMLConverter(PDFConverter):
(fontname, fontsize * self.scale * self.fontscale))
self._font = font
self.write_text(text)
- return
+ return
+
+ def put_newline(self):
+ self.write('
')
+ return
def end_textbox(self, color):
if self._font is not None:
@@ -311,7 +326,7 @@ class HTMLConverter(PDFConverter):
elif isinstance(item, LTImage):
self.place_image(item, 1, item.x0, item.y1, item.width, item.height)
else:
- if self.exact:
+ if self.layoutmode == 'exact':
if isinstance(item, LTTextLine):
self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height)
for child in item:
@@ -328,7 +343,8 @@ class HTMLConverter(PDFConverter):
if isinstance(item, LTTextLine):
for child in item:
render(child)
- self.write('
')
+ if self.layoutmode != 'loose':
+ self.put_newline()
elif isinstance(item, LTTextBox):
self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height,
item.get_writing_mode())
@@ -349,13 +365,11 @@ class HTMLConverter(PDFConverter):
show_layout(child)
return
show_layout(ltpage.layout)
- self.yoffset += self.pagepad
+ self.yoffset += self.pagemargin
return
def close(self):
- self.write('Page: %s
\n' %
- ', '.join('%s' % (i,i) for i in xrange(1,self.pageno)))
- self.write('\n')
+ self.write_footer()
return
@@ -366,10 +380,18 @@ class XMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outdir = outdir
+ self.write_header()
+ return
+
+ def write_header(self):
- self.outfp.write('\n' % codec)
+ self.outfp.write('\n' % self.codec)  # `codec` was an __init__ parameter and is not in scope here
self.outfp.write('\n')
return
+ def write_footer(self):
+ self.outfp.write('\n')
+ return
+
def write_text(self, text):
self.outfp.write(enc(text, self.codec))
return
@@ -445,5 +467,5 @@ class XMLConverter(PDFConverter):
return
def close(self):
- self.outfp.write('\n')
+ self.write_footer()
return
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index d368398..5960f85 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -12,11 +12,11 @@ def main(argv):
import getopt
def usage():
print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
- '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
+ '[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-Y layout_mode] '
'[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
return 100
try:
- (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
+ (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:Y:O:t:c:s:')
except getopt.GetoptError:
return usage()
if not args: return usage()
@@ -30,6 +30,7 @@ def main(argv):
outfile = None
outtype = None
outdir = None
+ layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
scale = 1
@@ -43,10 +44,10 @@ def main(argv):
elif k == '-o': outfile = v
elif k == '-n': laparams = None
elif k == '-A': laparams.all_texts = True
- elif k == '-D': laparams.writing_mode = v
elif k == '-M': laparams.char_margin = float(v)
elif k == '-L': laparams.line_margin = float(v)
elif k == '-W': laparams.word_margin = float(v)
+ elif k == '-Y': layoutmode = v
elif k == '-O': outdir = v
elif k == '-t': outtype = v
elif k == '-c': codec = v
@@ -78,7 +79,8 @@ def main(argv):
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
elif outtype == 'html':
- device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
+ device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
+ layoutmode=layoutmode, laparams=laparams, outdir=outdir)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: