Handle Type3 font size correctly.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@60 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
91770edd46
commit
ba277fb5a0
|
@ -90,11 +90,11 @@ class TextItem(object):
|
|||
self.direction = 0
|
||||
self.text = ''
|
||||
scaling *= .01
|
||||
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||
if not self.font.is_vertical():
|
||||
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
|
||||
# horizontal text
|
||||
spwidth = font.char_width(32) * self.SPACE_WIDTH # space width
|
||||
self.direction = 1
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
|
||||
ty += descent
|
||||
w = 0
|
||||
dx = 0
|
||||
prev = ' '
|
||||
|
@ -106,14 +106,18 @@ class TextItem(object):
|
|||
self.text += char
|
||||
prev = char
|
||||
dx = 0
|
||||
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
||||
w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||
else:
|
||||
t *= .001
|
||||
dx -= t
|
||||
w += t * fontsize * .001 * scaling
|
||||
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
|
||||
w += t * fontsize * scaling
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||
ty += descent
|
||||
(w,h) = apply_matrix_norm(self.matrix, (w,size))
|
||||
self.adv = (w, 0)
|
||||
self.bbox = (tx, ty, tx+w, ty+h)
|
||||
else:
|
||||
# vertical text
|
||||
self.direction = 2
|
||||
disp = 0
|
||||
h = 0
|
||||
|
@ -122,19 +126,19 @@ class TextItem(object):
|
|||
(disp,char) = t
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||
self.text += char
|
||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
||||
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||
break
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
(_,char) = t
|
||||
self.text += char
|
||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
||||
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
|
||||
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||
(w,h) = apply_matrix_norm(self.matrix, (size,h))
|
||||
tx -= w/2
|
||||
ty += disp
|
||||
self.adv = (0, h)
|
||||
self.bbox = (tx, ty+h, tx+w, ty)
|
||||
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
|
||||
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
|
|
@ -12,6 +12,7 @@ from pdflib.pdftypes import PDFException, \
|
|||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||
from utils import apply_matrix_norm
|
||||
|
||||
|
||||
## Fonts
|
||||
|
@ -26,7 +27,7 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
|||
# PDFFont
|
||||
class PDFFont(object):
|
||||
|
||||
def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
|
||||
def __init__(self, descriptor, widths, default_width=None):
|
||||
self.descriptor = descriptor
|
||||
self.widths = widths
|
||||
self.fontname = descriptor.get('FontName', 'unknown')
|
||||
|
@ -37,7 +38,6 @@ class PDFFont(object):
|
|||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||
self.leading = num_value(descriptor.get('Leading', 0))
|
||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||
self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
|
@ -52,8 +52,13 @@ class PDFFont(object):
|
|||
def decode(self, bytes):
|
||||
return map(ord, bytes)
|
||||
|
||||
def get_ascent(self):
|
||||
return self.ascent * .001
|
||||
def get_descent(self):
|
||||
return self.descent * .001
|
||||
|
||||
def char_width(self, cid):
|
||||
return self.widths.get(cid, self.default_width)
|
||||
return self.widths.get(cid, self.default_width) * .001
|
||||
|
||||
def char_disp(self, cid):
|
||||
return 0
|
||||
|
@ -61,10 +66,11 @@ class PDFFont(object):
|
|||
def string_width(self, s):
|
||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
|
||||
|
||||
def __init__(self, descriptor, widths, spec, font_matrix=None):
|
||||
def __init__(self, descriptor, widths, spec):
|
||||
# Font encoding is specified either by a name of
|
||||
# built-in encoding or a dictionary that describes
|
||||
# the differences.
|
||||
|
@ -83,7 +89,7 @@ class PDFSimpleFont(PDFFont):
|
|||
strm = stream_value(spec['ToUnicode'])
|
||||
self.ucs2_cmap = CMap()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||
PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
|
||||
PDFFont.__init__(self, descriptor, widths)
|
||||
return
|
||||
|
||||
def to_unicode(self, cid):
|
||||
|
@ -102,7 +108,7 @@ class PDFSimpleFont(PDFFont):
|
|||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, spec):
|
||||
def __init__(self, rsrc, spec):
|
||||
try:
|
||||
self.basefont = literal_name(spec['BaseFont'])
|
||||
except KeyError:
|
||||
|
@ -132,7 +138,7 @@ class PDFTrueTypeFont(PDFType1Font):
|
|||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, spec):
|
||||
def __init__(self, rsrc, spec):
|
||||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
lastchar = int_value(spec.get('LastChar', 0))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
|
@ -143,13 +149,23 @@ class PDFType3Font(PDFSimpleFont):
|
|||
descriptor = {'FontName':spec.get('Name'),
|
||||
'Ascent':0, 'Descent':0,
|
||||
'FontBBox':spec['FontBBox']}
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec,
|
||||
font_matrix=tuple(list_value(spec.get('FontMatrix'))))
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
||||
(_,self.descent,_,self.ascent) = self.bbox
|
||||
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFType3Font>'
|
||||
|
||||
def get_ascent(self):
|
||||
return self.ascent * self.vscale
|
||||
def get_descent(self):
|
||||
return self.descent * self.vscale
|
||||
|
||||
def char_width(self, cid):
|
||||
return self.widths.get(cid, self.default_width) * self.hscale
|
||||
|
||||
|
||||
# PDFCIDFont
|
||||
|
||||
|
@ -229,7 +245,7 @@ class TrueTypeFont(object):
|
|||
|
||||
class PDFCIDFont(PDFFont):
|
||||
|
||||
def __init__(self, spec):
|
||||
def __init__(self, rsrc, spec):
|
||||
try:
|
||||
self.basefont = literal_name(spec['BaseFont'])
|
||||
except KeyError:
|
||||
|
@ -246,7 +262,7 @@ class PDFCIDFont(PDFFont):
|
|||
raise PDFFontError('Encoding is unspecified')
|
||||
name = 'unknown'
|
||||
try:
|
||||
self.cmap = CMapDB.get_cmap(name, strict=STRICT)
|
||||
self.cmap = rsrc.get_cmap(name, strict=STRICT)
|
||||
except CMapDB.CMapNotFound, e:
|
||||
raise PDFFontError(e)
|
||||
try:
|
||||
|
@ -273,7 +289,7 @@ class PDFCIDFont(PDFFont):
|
|||
pass
|
||||
else:
|
||||
try:
|
||||
self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
|
||||
self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
|
||||
strict=STRICT)
|
||||
except CMapDB.CMapNotFound, e:
|
||||
raise PDFFontError(e)
|
||||
|
@ -336,5 +352,3 @@ class PDFCIDFont(PDFFont):
|
|||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
chars = unpack('>%dH' % (len(code)/2), code)
|
||||
return ''.join( unichr(c) for c in chars )
|
||||
|
||||
|
||||
|
|
|
@ -12,10 +12,11 @@ from pdflib.psparser import PSException, PSTypeError, PSEOF, \
|
|||
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
|
||||
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
|
||||
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||
from pdflib.cmap import CMapDB
|
||||
|
||||
|
||||
## Exceptions
|
||||
|
@ -59,6 +60,9 @@ class PDFResourceManager(object):
|
|||
pass
|
||||
return
|
||||
|
||||
def get_cmap(self, cmapname, strict=False):
|
||||
return CMapDB.get_cmap(cmapname, strict=strict)
|
||||
|
||||
def get_font(self, objid, spec):
|
||||
if objid and objid in self.fonts:
|
||||
font = self.fonts[objid]
|
||||
|
@ -75,16 +79,16 @@ class PDFResourceManager(object):
|
|||
subtype = 'Type1'
|
||||
if subtype in ('Type1', 'MMType1'):
|
||||
# Type1 Font
|
||||
font = PDFType1Font(spec)
|
||||
font = PDFType1Font(self, spec)
|
||||
elif subtype == 'TrueType':
|
||||
# TrueType Font
|
||||
font = PDFTrueTypeFont(spec)
|
||||
font = PDFTrueTypeFont(self, spec)
|
||||
elif subtype == 'Type3':
|
||||
# Type3 Font
|
||||
font = PDFType3Font(spec)
|
||||
font = PDFType3Font(self, spec)
|
||||
elif subtype in ('CIDFontType0', 'CIDFontType2'):
|
||||
# CID Font
|
||||
font = PDFCIDFont(spec)
|
||||
font = PDFCIDFont(self, spec)
|
||||
elif subtype == 'Type0':
|
||||
# Type0 Font
|
||||
dfonts = list_value(spec['DescendantFonts'])
|
||||
|
@ -535,16 +539,17 @@ class PDFPageInterpreter(object):
|
|||
self.device.render_string(textstate, textmatrix, seq)
|
||||
font = textstate.font
|
||||
s = ''.join( x for x in seq if isinstance(x, str) )
|
||||
n = sum( x for x in seq if not isinstance(x, str) )
|
||||
w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
|
||||
if not font.is_multibyte():
|
||||
w += s.count(' ')*textstate.wordspace
|
||||
w *= (textstate.scaling * .01)
|
||||
w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
|
||||
len(s) * textstate.charspace)
|
||||
(lx,ly) = textstate.linematrix
|
||||
if font.is_vertical():
|
||||
ly += w
|
||||
# advance vertically
|
||||
ly += w * (textstate.scaling * .01)
|
||||
else:
|
||||
lx += w
|
||||
# advance horizontally
|
||||
if not font.is_multibyte():
|
||||
w += s.count(' ')*textstate.wordspace
|
||||
lx += w * (textstate.scaling * .01)
|
||||
textstate.linematrix = (lx,ly)
|
||||
return
|
||||
# show
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
PYTHON=python
|
||||
CDBCMAPDIR=../CDBCMap
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
|
||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m pdflib.pdf2txt
|
||||
|
||||
HTMLS= \
|
||||
simple1.html \
|
||||
|
@ -22,4 +22,4 @@ clean:
|
|||
|
||||
.SUFFIXES: .pdf .html
|
||||
.pdf.html:
|
||||
$(PDF2TXT) -D$(CDBCMAPDIR) -H -o $@ $<
|
||||
$(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $<
|
||||
|
|
Loading…
Reference in New Issue