From 173d0955229332b277c5d4de68047b8c6f6b8688 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 16 May 2009 10:42:35 +0000 Subject: [PATCH] text spacing bug fixed git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@106 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 14 +++++++------- pdfminer/pdffont.py | 15 ++++----------- pdfminer/pdfinterp.py | 9 +++------ tools/pdf2txt.py | 2 ++ 4 files changed, 16 insertions(+), 24 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 3776699..4b1b5f9 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -86,12 +86,14 @@ class PDFPageAggregator(PDFDevice): def render_string(self, textstate, textmatrix, seq): font = textstate.font textmatrix = mult_matrix(textmatrix, self.ctm) + scaling = textstate.scaling * .01 + dxscale = scaling / (font.hscale*1000) * .01 + wordspace = textstate.wordspace * scaling chars = [] for x in seq: if isinstance(x, int) or isinstance(x, float): (dx,dy) = self.render_chars(textmatrix, textstate, chars) - dx -= x * textstate.scaling * .0001 - textmatrix = translate_matrix(textmatrix, (dx, dy)) + textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy)) chars = [] else: for cid in font.decode(x): @@ -101,10 +103,9 @@ class PDFPageAggregator(PDFDevice): (cidcoding, cid) = e.args char = self.handle_undefined_char(cidcoding, cid) chars.append((char, cid)) - if textstate.wordspace and not font.is_multibyte() and cid == 32: + if cid == 32 and textstate.wordspace and not font.is_multibyte(): (dx,dy) = self.render_chars(textmatrix, textstate, chars) - dx += textstate.wordspace * textstate.scaling * .01 - textmatrix = translate_matrix(textmatrix, (dx, dy)) + textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy)) chars = [] self.render_chars(textmatrix, textstate, chars) return @@ -238,7 +239,6 @@ class HTMLConverter(PDFConverter): self.codec) self.outfp.write('\n') self.yoffset = self.pagepad - self.show_text_border = False return def write_rect(self, color, width, x, y, w, h): @@ -268,7 +268,7 @@ class HTMLConverter(PDFConverter): item.fontsize*self.scale)) self.write(item.text) self.outfp.write('\n') - if self.show_text_border: + if self.debug: self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LTLine) or isinstance(item, LTRect): self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index a3de399..cc3af4e 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -330,6 +330,7 @@ class PDFFont(object): self.default_width = default_width or descriptor.get('MissingWidth', 0) self.leading = num_value(descriptor.get('Leading', 0)) self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) + self.hscale = self.vscale = .001 return def __repr__(self): @@ -345,12 +346,12 @@ class PDFFont(object): return map(ord, bytes) def get_ascent(self): - return self.ascent * .001 + return self.ascent * self.vscale def get_descent(self): - return self.descent * .001 + return self.descent * self.vscale def char_width(self, cid): - return self.widths.get(cid, self.default_width) * .001 + return self.widths.get(cid, self.default_width) * self.hscale def char_disp(self, cid): return 0 @@ -448,14 +449,6 @@ class PDFType3Font(PDFSimpleFont): def __repr__(self): return '' - def get_ascent(self): - return self.ascent * self.vscale - def get_descent(self): - return self.descent * self.vscale - - def char_width(self, cid): - return self.widths.get(cid, self.default_width) * self.hscale - # PDFCIDFont class PDFCIDFont(PDFFont): diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index a39e354..6ad9883 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -754,18 +754,15 @@ class PDFPageInterpreter(object): ## process_pdf ## -class TextExtractionNotAllowed(RuntimeError): pass +class PDFTextExtractionNotAllowed(PDFInterpreterError): pass def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''): doc = PDFDocument() fp = file(fname, 'rb') parser = PDFParser(doc, fp) - try: - doc.initialize(password) - except PDFPasswordIncorrect: - raise TextExtractionNotAllowed('Incorrect password') + doc.initialize(password) if not doc.is_extractable: - raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) + raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) interpreter = PDFPageInterpreter(rsrc, device) for (pageno,page) in enumerate(doc.get_pages()): if pagenos and (pageno not in pagenos): continue diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 96a7a23..2e8ed18 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -2,6 +2,7 @@ import sys from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf +from pdfminer.pdfdevice import PDFDevice from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.cmap import CMapDB @@ -51,6 +52,7 @@ def main(argv): PDFDocument.debug = debug PDFParser.debug = debug PDFPageInterpreter.debug = debug + PDFDevice.debug = debug # CMapDB.initialize(cmapdir, cdbcmapdir) rsrc = PDFResourceManager()