text spacing bug fixed

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@106 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-05-16 10:42:35 +00:00
parent 3f7f4fe5a5
commit 173d095522
4 changed files with 16 additions and 24 deletions

View File

@ -86,12 +86,14 @@ class PDFPageAggregator(PDFDevice):
def render_string(self, textstate, textmatrix, seq): def render_string(self, textstate, textmatrix, seq):
font = textstate.font font = textstate.font
textmatrix = mult_matrix(textmatrix, self.ctm) textmatrix = mult_matrix(textmatrix, self.ctm)
scaling = textstate.scaling * .01
dxscale = scaling / (font.hscale*1000) * .01
wordspace = textstate.wordspace * scaling
chars = [] chars = []
for x in seq: for x in seq:
if isinstance(x, int) or isinstance(x, float): if isinstance(x, int) or isinstance(x, float):
(dx,dy) = self.render_chars(textmatrix, textstate, chars) (dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx -= x * textstate.scaling * .0001 textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy))
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = [] chars = []
else: else:
for cid in font.decode(x): for cid in font.decode(x):
@ -101,10 +103,9 @@ class PDFPageAggregator(PDFDevice):
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
char = self.handle_undefined_char(cidcoding, cid) char = self.handle_undefined_char(cidcoding, cid)
chars.append((char, cid)) chars.append((char, cid))
if textstate.wordspace and not font.is_multibyte() and cid == 32: if cid == 32 and textstate.wordspace and not font.is_multibyte():
(dx,dy) = self.render_chars(textmatrix, textstate, chars) (dx,dy) = self.render_chars(textmatrix, textstate, chars)
dx += textstate.wordspace * textstate.scaling * .01 textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy))
textmatrix = translate_matrix(textmatrix, (dx, dy))
chars = [] chars = []
self.render_chars(textmatrix, textstate, chars) self.render_chars(textmatrix, textstate, chars)
return return
@ -238,7 +239,6 @@ class HTMLConverter(PDFConverter):
self.codec) self.codec)
self.outfp.write('</head><body>\n') self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad self.yoffset = self.pagepad
self.show_text_border = False
return return
def write_rect(self, color, width, x, y, w, h): def write_rect(self, color, width, x, y, w, h):
@ -268,7 +268,7 @@ class HTMLConverter(PDFConverter):
item.fontsize*self.scale)) item.fontsize*self.scale))
self.write(item.text) self.write(item.text)
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
if self.show_text_border: if self.debug:
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
elif isinstance(item, LTLine) or isinstance(item, LTRect): elif isinstance(item, LTLine) or isinstance(item, LTRect):
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)

View File

@ -330,6 +330,7 @@ class PDFFont(object):
self.default_width = default_width or descriptor.get('MissingWidth', 0) self.default_width = default_width or descriptor.get('MissingWidth', 0)
self.leading = num_value(descriptor.get('Leading', 0)) self.leading = num_value(descriptor.get('Leading', 0))
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
self.hscale = self.vscale = .001
return return
def __repr__(self): def __repr__(self):
@ -345,12 +346,12 @@ class PDFFont(object):
return map(ord, bytes) return map(ord, bytes)
def get_ascent(self): def get_ascent(self):
return self.ascent * .001 return self.ascent * self.vscale
def get_descent(self): def get_descent(self):
return self.descent * .001 return self.descent * self.vscale
def char_width(self, cid): def char_width(self, cid):
return self.widths.get(cid, self.default_width) * .001 return self.widths.get(cid, self.default_width) * self.hscale
def char_disp(self, cid): def char_disp(self, cid):
return 0 return 0
@ -448,14 +449,6 @@ class PDFType3Font(PDFSimpleFont):
def __repr__(self): def __repr__(self):
return '<PDFType3Font>' return '<PDFType3Font>'
def get_ascent(self):
return self.ascent * self.vscale
def get_descent(self):
return self.descent * self.vscale
def char_width(self, cid):
return self.widths.get(cid, self.default_width) * self.hscale
# PDFCIDFont # PDFCIDFont
class PDFCIDFont(PDFFont): class PDFCIDFont(PDFFont):

View File

@ -754,18 +754,15 @@ class PDFPageInterpreter(object):
## process_pdf ## process_pdf
## ##
class TextExtractionNotAllowed(RuntimeError): pass class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''): def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
doc = PDFDocument() doc = PDFDocument()
fp = file(fname, 'rb') fp = file(fname, 'rb')
parser = PDFParser(doc, fp) parser = PDFParser(doc, fp)
try: doc.initialize(password)
doc.initialize(password)
except PDFPasswordIncorrect:
raise TextExtractionNotAllowed('Incorrect password')
if not doc.is_extractable: if not doc.is_extractable:
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname) raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
interpreter = PDFPageInterpreter(rsrc, device) interpreter = PDFPageInterpreter(rsrc, device)
for (pageno,page) in enumerate(doc.get_pages()): for (pageno,page) in enumerate(doc.get_pages()):
if pagenos and (pageno not in pagenos): continue if pagenos and (pageno not in pagenos): continue

View File

@ -2,6 +2,7 @@
import sys import sys
from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
from pdfminer.cmap import CMapDB from pdfminer.cmap import CMapDB
@ -51,6 +52,7 @@ def main(argv):
PDFDocument.debug = debug PDFDocument.debug = debug
PDFParser.debug = debug PDFParser.debug = debug
PDFPageInterpreter.debug = debug PDFPageInterpreter.debug = debug
PDFDevice.debug = debug
# #
CMapDB.initialize(cmapdir, cdbcmapdir) CMapDB.initialize(cmapdir, cdbcmapdir)
rsrc = PDFResourceManager() rsrc = PDFResourceManager()