text spacing bug fixed
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@106 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
3f7f4fe5a5
commit
173d095522
|
@ -86,12 +86,14 @@ class PDFPageAggregator(PDFDevice):
|
|||
def render_string(self, textstate, textmatrix, seq):
|
||||
font = textstate.font
|
||||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||
scaling = textstate.scaling * .01
|
||||
dxscale = scaling / (font.hscale*1000) * .01
|
||||
wordspace = textstate.wordspace * scaling
|
||||
chars = []
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx -= x * textstate.scaling * .0001
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy))
|
||||
chars = []
|
||||
else:
|
||||
for cid in font.decode(x):
|
||||
|
@ -101,10 +103,9 @@ class PDFPageAggregator(PDFDevice):
|
|||
(cidcoding, cid) = e.args
|
||||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
chars.append((char, cid))
|
||||
if textstate.wordspace and not font.is_multibyte() and cid == 32:
|
||||
if cid == 32 and textstate.wordspace and not font.is_multibyte():
|
||||
(dx,dy) = self.render_chars(textmatrix, textstate, chars)
|
||||
dx += textstate.wordspace * textstate.scaling * .01
|
||||
textmatrix = translate_matrix(textmatrix, (dx, dy))
|
||||
textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy))
|
||||
chars = []
|
||||
self.render_chars(textmatrix, textstate, chars)
|
||||
return
|
||||
|
@ -238,7 +239,6 @@ class HTMLConverter(PDFConverter):
|
|||
self.codec)
|
||||
self.outfp.write('</head><body>\n')
|
||||
self.yoffset = self.pagepad
|
||||
self.show_text_border = False
|
||||
return
|
||||
|
||||
def write_rect(self, color, width, x, y, w, h):
|
||||
|
@ -268,7 +268,7 @@ class HTMLConverter(PDFConverter):
|
|||
item.fontsize*self.scale))
|
||||
self.write(item.text)
|
||||
self.outfp.write('</span>\n')
|
||||
if self.show_text_border:
|
||||
if self.debug:
|
||||
self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
elif isinstance(item, LTLine) or isinstance(item, LTRect):
|
||||
self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height)
|
||||
|
|
|
@ -330,6 +330,7 @@ class PDFFont(object):
|
|||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||
self.leading = num_value(descriptor.get('Leading', 0))
|
||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||
self.hscale = self.vscale = .001
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -345,12 +346,12 @@ class PDFFont(object):
|
|||
return map(ord, bytes)
|
||||
|
||||
def get_ascent(self):
|
||||
return self.ascent * .001
|
||||
return self.ascent * self.vscale
|
||||
def get_descent(self):
|
||||
return self.descent * .001
|
||||
return self.descent * self.vscale
|
||||
|
||||
def char_width(self, cid):
|
||||
return self.widths.get(cid, self.default_width) * .001
|
||||
return self.widths.get(cid, self.default_width) * self.hscale
|
||||
|
||||
def char_disp(self, cid):
|
||||
return 0
|
||||
|
@ -448,14 +449,6 @@ class PDFType3Font(PDFSimpleFont):
|
|||
def __repr__(self):
|
||||
return '<PDFType3Font>'
|
||||
|
||||
def get_ascent(self):
|
||||
return self.ascent * self.vscale
|
||||
def get_descent(self):
|
||||
return self.descent * self.vscale
|
||||
|
||||
def char_width(self, cid):
|
||||
return self.widths.get(cid, self.default_width) * self.hscale
|
||||
|
||||
|
||||
# PDFCIDFont
|
||||
class PDFCIDFont(PDFFont):
|
||||
|
|
|
@ -754,18 +754,15 @@ class PDFPageInterpreter(object):
|
|||
|
||||
## process_pdf
|
||||
##
|
||||
class TextExtractionNotAllowed(RuntimeError): pass
|
||||
class PDFTextExtractionNotAllowed(PDFInterpreterError): pass
|
||||
|
||||
def process_pdf(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
||||
doc = PDFDocument()
|
||||
fp = file(fname, 'rb')
|
||||
parser = PDFParser(doc, fp)
|
||||
try:
|
||||
doc.initialize(password)
|
||||
except PDFPasswordIncorrect:
|
||||
raise TextExtractionNotAllowed('Incorrect password')
|
||||
if not doc.is_extractable:
|
||||
raise TextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
||||
raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fname)
|
||||
interpreter = PDFPageInterpreter(rsrc, device)
|
||||
for (pageno,page) in enumerate(doc.get_pages()):
|
||||
if pagenos and (pageno not in pagenos): continue
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
import sys
|
||||
from pdfminer.pdfparser import PDFDocument, PDFParser
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.converter import SGMLConverter, HTMLConverter, TextConverter, TagExtractor
|
||||
from pdfminer.cmap import CMapDB
|
||||
|
||||
|
@ -51,6 +52,7 @@ def main(argv):
|
|||
PDFDocument.debug = debug
|
||||
PDFParser.debug = debug
|
||||
PDFPageInterpreter.debug = debug
|
||||
PDFDevice.debug = debug
|
||||
#
|
||||
CMapDB.initialize(cmapdir, cdbcmapdir)
|
||||
rsrc = PDFResourceManager()
|
||||
|
|
Loading…
Reference in New Issue