From 7dbb664db3ad033104cc5f7a524ac600f67e8375 Mon Sep 17 00:00:00 2001 From: Yusuke Shinyama Date: Mon, 14 Feb 2011 23:42:05 +0900 Subject: [PATCH] code cleanup and more debugging options --- pdfminer/converter.py | 40 +++++++++++++++++++++++----------------- tools/pdf2txt.py | 4 ++-- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index aa46ad9..f916360 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -199,21 +199,24 @@ class HTMLConverter(PDFConverter): RECT_COLORS = { #'char': 'green', - #'figure': 'yellow', - #'textline': 'magenta', - #'textbox': 'cyan', - #'textgroup': 'red', + 'figure': 'yellow', + 'textline': 'magenta', + 'textbox': 'cyan', + 'textgroup': 'red', 'polygon': 'black', 'page': 'gray', } + TEXT_COLORS = { - #'textbox': 'blue', + 'textbox': 'blue', 'char': 'black', } def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, pagemargin=50, - outdir=None): + scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, + pagemargin=50, outdir=None, + rect_colors={'polygon':'black', 'page':'gray'}, + text_colors={'char':'black'}): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.scale = scale self.fontscale = fontscale @@ -221,9 +224,12 @@ class HTMLConverter(PDFConverter): self.showpageno = showpageno self.pagemargin = pagemargin self.outdir = outdir - self.yoffset = self.pagemargin - self.rect_colors = self.RECT_COLORS - self.text_colors = self.TEXT_COLORS + self.rect_colors = rect_colors + self.text_colors = text_colors + if self.debug: + self.rect_colors.update(self.RECT_COLORS) + self.text_colors.update(self.TEXT_COLORS) + self._yoffset = self.pagemargin self._font = None self._fontstack = [] self.write_header() @@ -255,7 +261,7 @@ class HTMLConverter(PDFConverter): self.write('\n' % (color, borderwidth, - x*self.scale, (self.yoffset-y)*self.scale, + x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) return @@ -269,7 +275,7 @@ class HTMLConverter(PDFConverter): self.write('\n' % (enc(name), borderwidth, - x*self.scale, (self.yoffset-y)*self.scale, + x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) return @@ -277,7 +283,7 @@ class HTMLConverter(PDFConverter): color = self.text_colors.get(color) if color is not None: self.write('' % - (color, x*self.scale, (self.yoffset-y)*self.scale, size*self.scale*self.fontscale)) + (color, x*self.scale, (self._yoffset-y)*self.scale, size*self.scale*self.fontscale)) self.write_text(text) self.write('\n') return @@ -288,7 +294,7 @@ class HTMLConverter(PDFConverter): self.write('
' % (color, borderwidth, writing_mode, - x*self.scale, (self.yoffset-y)*self.scale, + x*self.scale, (self._yoffset-y)*self.scale, w*self.scale, h*self.scale)) return @@ -323,11 +329,11 @@ class HTMLConverter(PDFConverter): return def render(item): if isinstance(item, LTPage): - self.yoffset += item.y1 + self._yoffset += item.y1 self.place_border('page', 1, item) if self.showpageno: self.write('
' % - ((self.yoffset-item.y1)*self.scale)) + ((self._yoffset-item.y1)*self.scale)) self.write('Page %s
\n' % (item.pageid, item.pageid)) for child in item: render(child) @@ -373,7 +379,7 @@ class HTMLConverter(PDFConverter): self.write_text(item.text) return render(ltpage) - self.yoffset += self.pagemargin + self._yoffset += self.pagemargin return def close(self): diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 2aa121d..dc355d5 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -55,10 +55,10 @@ def main(argv): elif k == '-c': codec = v elif k == '-s': scale = float(v) # + #PDFDocument.debug = debug + #PDFParser.debug = debug CMapDB.debug = debug PDFResourceManager.debug = debug - PDFDocument.debug = debug - PDFParser.debug = debug PDFPageInterpreter.debug = debug PDFDevice.debug = debug #