Handle Type3 font size correctly.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@60 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-01-10 10:45:49 +00:00
parent 91770edd46
commit ba277fb5a0
4 changed files with 62 additions and 39 deletions

View File

@ -90,11 +90,11 @@ class TextItem(object):
self.direction = 0 self.direction = 0
self.text = '' self.text = ''
scaling *= .01 scaling *= .01
size = (font.get_ascent() - font.get_descent()) * fontsize
if not self.font.is_vertical(): if not self.font.is_vertical():
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width # horizontal text
spwidth = font.char_width(32) * self.SPACE_WIDTH # space width
self.direction = 1 self.direction = 1
(_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
ty += descent
w = 0 w = 0
dx = 0 dx = 0
prev = ' ' prev = ' '
@ -106,14 +106,18 @@ class TextItem(object):
self.text += char self.text += char
prev = char prev = char
dx = 0 dx = 0
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
else: else:
t *= .001
dx -= t dx -= t
w += t * fontsize * .001 * scaling w += t * fontsize * scaling
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize)) (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
(w,h) = apply_matrix_norm(self.matrix, (w,size))
self.adv = (w, 0) self.adv = (w, 0)
self.bbox = (tx, ty, tx+w, ty+h) self.bbox = (tx, ty, tx+w, ty+h)
else: else:
# vertical text
self.direction = 2 self.direction = 2
disp = 0 disp = 0
h = 0 h = 0
@ -122,19 +126,19 @@ class TextItem(object):
(disp,char) = t (disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
break break
for t in text: for t in text:
if isinstance(t, tuple): if isinstance(t, tuple):
(_,char) = t (_,char) = t
self.text += char self.text += char
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h)) (w,h) = apply_matrix_norm(self.matrix, (size,h))
tx -= w/2 tx -= w/2
ty += disp ty += disp
self.adv = (0, h) self.adv = (0, h)
self.bbox = (tx, ty+h, tx+w, ty) self.bbox = (tx, ty+h, tx+w, ty)
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize))) self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
return return
def __repr__(self): def __repr__(self):

View File

@ -12,6 +12,7 @@ from pdflib.pdftypes import PDFException, \
resolve1, int_value, float_value, num_value, \ resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import apply_matrix_norm
## Fonts ## Fonts
@ -26,7 +27,7 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
# PDFFont # PDFFont
class PDFFont(object): class PDFFont(object):
def __init__(self, descriptor, widths, default_width=None, font_matrix=None): def __init__(self, descriptor, widths, default_width=None):
self.descriptor = descriptor self.descriptor = descriptor
self.widths = widths self.widths = widths
self.fontname = descriptor.get('FontName', 'unknown') self.fontname = descriptor.get('FontName', 'unknown')
@ -37,7 +38,6 @@ class PDFFont(object):
self.default_width = default_width or descriptor.get('MissingWidth', 0) self.default_width = default_width or descriptor.get('MissingWidth', 0)
self.leading = num_value(descriptor.get('Leading', 0)) self.leading = num_value(descriptor.get('Leading', 0))
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
return return
def __repr__(self): def __repr__(self):
@ -52,8 +52,13 @@ class PDFFont(object):
def decode(self, bytes): def decode(self, bytes):
return map(ord, bytes) return map(ord, bytes)
def get_ascent(self):
return self.ascent * .001
def get_descent(self):
return self.descent * .001
def char_width(self, cid): def char_width(self, cid):
return self.widths.get(cid, self.default_width) return self.widths.get(cid, self.default_width) * .001
def char_disp(self, cid): def char_disp(self, cid):
return 0 return 0
@ -61,10 +66,11 @@ class PDFFont(object):
def string_width(self, s): def string_width(self, s):
return sum( self.char_width(cid) for cid in self.decode(s) ) return sum( self.char_width(cid) for cid in self.decode(s) )
# PDFSimpleFont # PDFSimpleFont
class PDFSimpleFont(PDFFont): class PDFSimpleFont(PDFFont):
def __init__(self, descriptor, widths, spec, font_matrix=None): def __init__(self, descriptor, widths, spec):
# Font encoding is specified either by a name of # Font encoding is specified either by a name of
# built-in encoding or a dictionary that describes # built-in encoding or a dictionary that describes
# the differences. # the differences.
@ -83,7 +89,7 @@ class PDFSimpleFont(PDFFont):
strm = stream_value(spec['ToUnicode']) strm = stream_value(spec['ToUnicode'])
self.ucs2_cmap = CMap() self.ucs2_cmap = CMap()
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix) PDFFont.__init__(self, descriptor, widths)
return return
def to_unicode(self, cid): def to_unicode(self, cid):
@ -102,7 +108,7 @@ class PDFSimpleFont(PDFFont):
# PDFType1Font # PDFType1Font
class PDFType1Font(PDFSimpleFont): class PDFType1Font(PDFSimpleFont):
def __init__(self, spec): def __init__(self, rsrc, spec):
try: try:
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
except KeyError: except KeyError:
@ -132,7 +138,7 @@ class PDFTrueTypeFont(PDFType1Font):
# PDFType3Font # PDFType3Font
class PDFType3Font(PDFSimpleFont): class PDFType3Font(PDFSimpleFont):
def __init__(self, spec): def __init__(self, rsrc, spec):
firstchar = int_value(spec.get('FirstChar', 0)) firstchar = int_value(spec.get('FirstChar', 0))
lastchar = int_value(spec.get('LastChar', 0)) lastchar = int_value(spec.get('LastChar', 0))
widths = list_value(spec.get('Widths', [0]*256)) widths = list_value(spec.get('Widths', [0]*256))
@ -143,13 +149,23 @@ class PDFType3Font(PDFSimpleFont):
descriptor = {'FontName':spec.get('Name'), descriptor = {'FontName':spec.get('Name'),
'Ascent':0, 'Descent':0, 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']} 'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec, PDFSimpleFont.__init__(self, descriptor, widths, spec)
font_matrix=tuple(list_value(spec.get('FontMatrix')))) self.matrix = tuple(list_value(spec.get('FontMatrix')))
(_,self.descent,_,self.ascent) = self.bbox
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
return return
def __repr__(self): def __repr__(self):
return '<PDFType3Font>' return '<PDFType3Font>'
def get_ascent(self):
return self.ascent * self.vscale
def get_descent(self):
return self.descent * self.vscale
def char_width(self, cid):
return self.widths.get(cid, self.default_width) * self.hscale
# PDFCIDFont # PDFCIDFont
@ -229,7 +245,7 @@ class TrueTypeFont(object):
class PDFCIDFont(PDFFont): class PDFCIDFont(PDFFont):
def __init__(self, spec): def __init__(self, rsrc, spec):
try: try:
self.basefont = literal_name(spec['BaseFont']) self.basefont = literal_name(spec['BaseFont'])
except KeyError: except KeyError:
@ -246,7 +262,7 @@ class PDFCIDFont(PDFFont):
raise PDFFontError('Encoding is unspecified') raise PDFFontError('Encoding is unspecified')
name = 'unknown' name = 'unknown'
try: try:
self.cmap = CMapDB.get_cmap(name, strict=STRICT) self.cmap = rsrc.get_cmap(name, strict=STRICT)
except CMapDB.CMapNotFound, e: except CMapDB.CMapNotFound, e:
raise PDFFontError(e) raise PDFFontError(e)
try: try:
@ -273,7 +289,7 @@ class PDFCIDFont(PDFFont):
pass pass
else: else:
try: try:
self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding, self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
strict=STRICT) strict=STRICT)
except CMapDB.CMapNotFound, e: except CMapDB.CMapNotFound, e:
raise PDFFontError(e) raise PDFFontError(e)
@ -336,5 +352,3 @@ class PDFCIDFont(PDFFont):
raise PDFUnicodeNotDefined(self.cidcoding, cid) raise PDFUnicodeNotDefined(self.cidcoding, cid)
chars = unpack('>%dH' % (len(code)/2), code) chars = unpack('>%dH' % (len(code)/2), code)
return ''.join( unichr(c) for c in chars ) return ''.join( unichr(c) for c in chars )

View File

@ -12,10 +12,11 @@ from pdflib.psparser import PSException, PSTypeError, PSEOF, \
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \ from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
resolve1, int_value, float_value, num_value, \ resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
from pdflib.cmap import CMapDB
## Exceptions ## Exceptions
@ -59,6 +60,9 @@ class PDFResourceManager(object):
pass pass
return return
def get_cmap(self, cmapname, strict=False):
return CMapDB.get_cmap(cmapname, strict=strict)
def get_font(self, objid, spec): def get_font(self, objid, spec):
if objid and objid in self.fonts: if objid and objid in self.fonts:
font = self.fonts[objid] font = self.fonts[objid]
@ -75,16 +79,16 @@ class PDFResourceManager(object):
subtype = 'Type1' subtype = 'Type1'
if subtype in ('Type1', 'MMType1'): if subtype in ('Type1', 'MMType1'):
# Type1 Font # Type1 Font
font = PDFType1Font(spec) font = PDFType1Font(self, spec)
elif subtype == 'TrueType': elif subtype == 'TrueType':
# TrueType Font # TrueType Font
font = PDFTrueTypeFont(spec) font = PDFTrueTypeFont(self, spec)
elif subtype == 'Type3': elif subtype == 'Type3':
# Type3 Font # Type3 Font
font = PDFType3Font(spec) font = PDFType3Font(self, spec)
elif subtype in ('CIDFontType0', 'CIDFontType2'): elif subtype in ('CIDFontType0', 'CIDFontType2'):
# CID Font # CID Font
font = PDFCIDFont(spec) font = PDFCIDFont(self, spec)
elif subtype == 'Type0': elif subtype == 'Type0':
# Type0 Font # Type0 Font
dfonts = list_value(spec['DescendantFonts']) dfonts = list_value(spec['DescendantFonts'])
@ -535,16 +539,17 @@ class PDFPageInterpreter(object):
self.device.render_string(textstate, textmatrix, seq) self.device.render_string(textstate, textmatrix, seq)
font = textstate.font font = textstate.font
s = ''.join( x for x in seq if isinstance(x, str) ) s = ''.join( x for x in seq if isinstance(x, str) )
n = sum( x for x in seq if not isinstance(x, str) ) w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace len(s) * textstate.charspace)
if not font.is_multibyte():
w += s.count(' ')*textstate.wordspace
w *= (textstate.scaling * .01)
(lx,ly) = textstate.linematrix (lx,ly) = textstate.linematrix
if font.is_vertical(): if font.is_vertical():
ly += w # advance vertically
ly += w * (textstate.scaling * .01)
else: else:
lx += w # advance horizontally
if not font.is_multibyte():
w += s.count(' ')*textstate.wordspace
lx += w * (textstate.scaling * .01)
textstate.linematrix = (lx,ly) textstate.linematrix = (lx,ly)
return return
# show # show

View File

@ -2,7 +2,7 @@
PYTHON=python PYTHON=python
CDBCMAPDIR=../CDBCMap CDBCMAPDIR=../CDBCMap
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt PDF2TXT=PYTHONPATH=.. $(PYTHON) -m pdflib.pdf2txt
HTMLS= \ HTMLS= \
simple1.html \ simple1.html \
@ -22,4 +22,4 @@ clean:
.SUFFIXES: .pdf .html .SUFFIXES: .pdf .html
.pdf.html: .pdf.html:
$(PDF2TXT) -D$(CDBCMAPDIR) -H -o $@ $< $(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $<