diff --git a/docs/index.html b/docs/index.html index 3615953..9098a6c 100644 --- a/docs/index.html +++ b/docs/index.html @@ -9,7 +9,7 @@
-Last Modified: Sun Oct 17 09:25:27 UTC 2010 +Last Modified: Sun Nov 14 15:03:59 UTC 2010
@@ -265,6 +265,14 @@ are M = 1.0, L = 0.3, and W = 0.2, respectively.
Forces to perform layout analysis for all the text strings, including texts contained in figures.

+

-Y layout_mode +
Specifies how the page layout should be preserved. (Currently only applies to HTML format.) + +

-s scale
Specifies the output scale. Can be used in HTML format only.

diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 46043a3..a5476e6 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -209,27 +209,38 @@ class HTMLConverter(PDFConverter): 'char': 'black', } - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, exact=False, showpageno=True, pagepad=50, outdir=None): + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, + scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, pagemargin=50, + outdir=None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) - self.exact = exact - self.showpageno = showpageno - self.pagepad = pagepad - self.outdir = outdir self.scale = scale - self.fontscale = 0.7 - self.write('\n') - self.write('\n' % self.codec) - self.write('\n') - self.yoffset = self.pagepad + self.fontscale = fontscale + self.layoutmode = layoutmode + self.showpageno = showpageno + self.pagemargin = pagemargin + self.outdir = outdir + self.yoffset = self.pagemargin self._font = None self._fontstack = [] + self.write_header() return def write(self, text): self.outfp.write(text) return + def write_header(self): + self.write('\n') + self.write('\n' % self.codec) + self.write('\n') + return + + def write_footer(self): + self.write('

Page: %s
\n' % + ', '.join('%s' % (i,i) for i in xrange(1,self.pageno))) + self.write('\n') + return + def write_text(self, text): self.write(enc(text, self.codec)) return @@ -258,7 +269,7 @@ class HTMLConverter(PDFConverter): color = self.TEXT_COLORS.get(color) if color is not None: self.write('' % - (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale)) + (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale*self.fontscale)) self.write_text(text) self.write('\n') return @@ -282,7 +293,11 @@ class HTMLConverter(PDFConverter): (fontname, fontsize * self.scale * self.fontscale)) self._font = font self.write_text(text) - return + return + + def put_newline(self): + self.write('
') + return def end_textbox(self, color): if self._font is not None: @@ -311,7 +326,7 @@ class HTMLConverter(PDFConverter): elif isinstance(item, LTImage): self.place_image(item, 1, item.x0, item.y1, item.width, item.height) else: - if self.exact: + if self.layoutmode == 'exact': if isinstance(item, LTTextLine): self.place_rect('textline', 1, item.x0, item.y1, item.width, item.height) for child in item: @@ -328,7 +343,8 @@ class HTMLConverter(PDFConverter): if isinstance(item, LTTextLine): for child in item: render(child) - self.write('
') + if self.layoutmode != 'loose': + self.put_newline() elif isinstance(item, LTTextBox): self.begin_textbox('textbox', 1, item.x0, item.y1, item.width, item.height, item.get_writing_mode()) @@ -349,13 +365,11 @@ class HTMLConverter(PDFConverter): show_layout(child) return show_layout(ltpage.layout) - self.yoffset += self.pagepad + self.yoffset += self.pagemargin return def close(self): - self.write('
Page: %s
\n' % - ', '.join('%s' % (i,i) for i in xrange(1,self.pageno))) - self.write('\n') + self.write_footer() return @@ -366,10 +380,18 @@ class XMLConverter(PDFConverter): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.outdir = outdir + self.write_header() + return + + def write_header(self): self.outfp.write('\n' % codec) self.outfp.write('\n') return + def write_footer(self): + self.outfp.write('\n') + return + def write_text(self, text): self.outfp.write(enc(text, self.codec)) return @@ -445,5 +467,5 @@ class XMLConverter(PDFConverter): return def close(self): - self.outfp.write('\n') + self.write_footer() return diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index d368398..5960f85 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -12,11 +12,11 @@ def main(argv): import getopt def usage(): print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] ' - '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] ' + '[-n] [-A] [-M char_margin] [-L line_margin] [-W word_margin] [-Y layout_mode] ' '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:') + (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAM:L:W:Y:O:t:c:s:') except getopt.GetoptError: return usage() if not args: return usage() @@ -30,6 +30,7 @@ def main(argv): outfile = None outtype = None outdir = None + layoutmode = 'normal' codec = 'utf-8' pageno = 1 scale = 1 @@ -43,10 +44,10 @@ def main(argv): elif k == '-o': outfile = v elif k == '-n': laparams = None elif k == '-A': laparams.all_texts = True - elif k == '-D': laparams.writing_mode = v elif k == '-M': laparams.char_margin = float(v) elif k == '-L': laparams.line_margin = float(v) elif k == '-W': laparams.word_margin = float(v) + elif k == '-Y': layoutmode = v elif k == '-O': outdir = v elif k == '-t': outtype = v elif k == '-c': codec = v @@ -78,7 +79,8 @@ def main(argv): elif outtype == 'xml': device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir) elif outtype == 'html': - device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir) + device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, + layoutmode=layoutmode, laparams=laparams, outdir=outdir) elif outtype == 'tag': device = TagExtractor(rsrcmgr, outfp, codec=codec) else: