Handle Type3 font size correctly.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@60 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
91770edd46
commit
ba277fb5a0
|
@ -90,11 +90,11 @@ class TextItem(object):
|
||||||
self.direction = 0
|
self.direction = 0
|
||||||
self.text = ''
|
self.text = ''
|
||||||
scaling *= .01
|
scaling *= .01
|
||||||
|
size = (font.get_ascent() - font.get_descent()) * fontsize
|
||||||
if not self.font.is_vertical():
|
if not self.font.is_vertical():
|
||||||
spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width
|
# horizontal text
|
||||||
|
spwidth = font.char_width(32) * self.SPACE_WIDTH # space width
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001))
|
|
||||||
ty += descent
|
|
||||||
w = 0
|
w = 0
|
||||||
dx = 0
|
dx = 0
|
||||||
prev = ' '
|
prev = ' '
|
||||||
|
@ -106,14 +106,18 @@ class TextItem(object):
|
||||||
self.text += char
|
self.text += char
|
||||||
prev = char
|
prev = char
|
||||||
dx = 0
|
dx = 0
|
||||||
w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||||
else:
|
else:
|
||||||
|
t *= .001
|
||||||
dx -= t
|
dx -= t
|
||||||
w += t * fontsize * .001 * scaling
|
w += t * fontsize * scaling
|
||||||
(w,h) = apply_matrix_norm(self.matrix, (w,fontsize))
|
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||||
|
ty += descent
|
||||||
|
(w,h) = apply_matrix_norm(self.matrix, (w,size))
|
||||||
self.adv = (w, 0)
|
self.adv = (w, 0)
|
||||||
self.bbox = (tx, ty, tx+w, ty+h)
|
self.bbox = (tx, ty, tx+w, ty+h)
|
||||||
else:
|
else:
|
||||||
|
# vertical text
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
disp = 0
|
disp = 0
|
||||||
h = 0
|
h = 0
|
||||||
|
@ -122,19 +126,19 @@ class TextItem(object):
|
||||||
(disp,char) = t
|
(disp,char) = t
|
||||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||||
self.text += char
|
self.text += char
|
||||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||||
break
|
break
|
||||||
for t in text:
|
for t in text:
|
||||||
if isinstance(t, tuple):
|
if isinstance(t, tuple):
|
||||||
(_,char) = t
|
(_,char) = t
|
||||||
self.text += char
|
self.text += char
|
||||||
h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling
|
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||||
(w,h) = apply_matrix_norm(self.matrix, (fontsize,h))
|
(w,h) = apply_matrix_norm(self.matrix, (size,h))
|
||||||
tx -= w/2
|
tx -= w/2
|
||||||
ty += disp
|
ty += disp
|
||||||
self.adv = (0, h)
|
self.adv = (0, h)
|
||||||
self.bbox = (tx, ty+h, tx+w, ty)
|
self.bbox = (tx, ty+h, tx+w, ty)
|
||||||
self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize)))
|
self.fontsize = max(apply_matrix_norm(self.matrix, (size,size)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
|
|
@ -12,6 +12,7 @@ from pdflib.pdftypes import PDFException, \
|
||||||
resolve1, int_value, float_value, num_value, \
|
resolve1, int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||||
|
from utils import apply_matrix_norm
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
## Fonts
|
||||||
|
@ -26,7 +27,7 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
||||||
# PDFFont
|
# PDFFont
|
||||||
class PDFFont(object):
|
class PDFFont(object):
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
|
def __init__(self, descriptor, widths, default_width=None):
|
||||||
self.descriptor = descriptor
|
self.descriptor = descriptor
|
||||||
self.widths = widths
|
self.widths = widths
|
||||||
self.fontname = descriptor.get('FontName', 'unknown')
|
self.fontname = descriptor.get('FontName', 'unknown')
|
||||||
|
@ -37,7 +38,6 @@ class PDFFont(object):
|
||||||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||||
self.leading = num_value(descriptor.get('Leading', 0))
|
self.leading = num_value(descriptor.get('Leading', 0))
|
||||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||||
self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -52,8 +52,13 @@ class PDFFont(object):
|
||||||
def decode(self, bytes):
|
def decode(self, bytes):
|
||||||
return map(ord, bytes)
|
return map(ord, bytes)
|
||||||
|
|
||||||
|
def get_ascent(self):
|
||||||
|
return self.ascent * .001
|
||||||
|
def get_descent(self):
|
||||||
|
return self.descent * .001
|
||||||
|
|
||||||
def char_width(self, cid):
|
def char_width(self, cid):
|
||||||
return self.widths.get(cid, self.default_width)
|
return self.widths.get(cid, self.default_width) * .001
|
||||||
|
|
||||||
def char_disp(self, cid):
|
def char_disp(self, cid):
|
||||||
return 0
|
return 0
|
||||||
|
@ -61,10 +66,11 @@ class PDFFont(object):
|
||||||
def string_width(self, s):
|
def string_width(self, s):
|
||||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||||
|
|
||||||
|
|
||||||
# PDFSimpleFont
|
# PDFSimpleFont
|
||||||
class PDFSimpleFont(PDFFont):
|
class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, spec, font_matrix=None):
|
def __init__(self, descriptor, widths, spec):
|
||||||
# Font encoding is specified either by a name of
|
# Font encoding is specified either by a name of
|
||||||
# built-in encoding or a dictionary that describes
|
# built-in encoding or a dictionary that describes
|
||||||
# the differences.
|
# the differences.
|
||||||
|
@ -83,7 +89,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
self.ucs2_cmap = CMap()
|
self.ucs2_cmap = CMap()
|
||||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||||
PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
|
PDFFont.__init__(self, descriptor, widths)
|
||||||
return
|
return
|
||||||
|
|
||||||
def to_unicode(self, cid):
|
def to_unicode(self, cid):
|
||||||
|
@ -102,7 +108,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
# PDFType1Font
|
# PDFType1Font
|
||||||
class PDFType1Font(PDFSimpleFont):
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, spec):
|
def __init__(self, rsrc, spec):
|
||||||
try:
|
try:
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -132,7 +138,7 @@ class PDFTrueTypeFont(PDFType1Font):
|
||||||
# PDFType3Font
|
# PDFType3Font
|
||||||
class PDFType3Font(PDFSimpleFont):
|
class PDFType3Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, spec):
|
def __init__(self, rsrc, spec):
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
lastchar = int_value(spec.get('LastChar', 0))
|
lastchar = int_value(spec.get('LastChar', 0))
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
|
@ -143,13 +149,23 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
descriptor = {'FontName':spec.get('Name'),
|
descriptor = {'FontName':spec.get('Name'),
|
||||||
'Ascent':0, 'Descent':0,
|
'Ascent':0, 'Descent':0,
|
||||||
'FontBBox':spec['FontBBox']}
|
'FontBBox':spec['FontBBox']}
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec,
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
font_matrix=tuple(list_value(spec.get('FontMatrix'))))
|
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
||||||
|
(_,self.descent,_,self.ascent) = self.bbox
|
||||||
|
(self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFType3Font>'
|
return '<PDFType3Font>'
|
||||||
|
|
||||||
|
def get_ascent(self):
|
||||||
|
return self.ascent * self.vscale
|
||||||
|
def get_descent(self):
|
||||||
|
return self.descent * self.vscale
|
||||||
|
|
||||||
|
def char_width(self, cid):
|
||||||
|
return self.widths.get(cid, self.default_width) * self.hscale
|
||||||
|
|
||||||
|
|
||||||
# PDFCIDFont
|
# PDFCIDFont
|
||||||
|
|
||||||
|
@ -229,7 +245,7 @@ class TrueTypeFont(object):
|
||||||
|
|
||||||
class PDFCIDFont(PDFFont):
|
class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, spec):
|
def __init__(self, rsrc, spec):
|
||||||
try:
|
try:
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -246,7 +262,7 @@ class PDFCIDFont(PDFFont):
|
||||||
raise PDFFontError('Encoding is unspecified')
|
raise PDFFontError('Encoding is unspecified')
|
||||||
name = 'unknown'
|
name = 'unknown'
|
||||||
try:
|
try:
|
||||||
self.cmap = CMapDB.get_cmap(name, strict=STRICT)
|
self.cmap = rsrc.get_cmap(name, strict=STRICT)
|
||||||
except CMapDB.CMapNotFound, e:
|
except CMapDB.CMapNotFound, e:
|
||||||
raise PDFFontError(e)
|
raise PDFFontError(e)
|
||||||
try:
|
try:
|
||||||
|
@ -273,8 +289,8 @@ class PDFCIDFont(PDFFont):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
|
self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding,
|
||||||
strict=STRICT)
|
strict=STRICT)
|
||||||
except CMapDB.CMapNotFound, e:
|
except CMapDB.CMapNotFound, e:
|
||||||
raise PDFFontError(e)
|
raise PDFFontError(e)
|
||||||
|
|
||||||
|
@ -336,5 +352,3 @@ class PDFCIDFont(PDFFont):
|
||||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
chars = unpack('>%dH' % (len(code)/2), code)
|
chars = unpack('>%dH' % (len(code)/2), code)
|
||||||
return ''.join( unichr(c) for c in chars )
|
return ''.join( unichr(c) for c in chars )
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -12,10 +12,11 @@ from pdflib.psparser import PSException, PSTypeError, PSEOF, \
|
||||||
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||||
resolve1, int_value, float_value, num_value, \
|
resolve1, int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
|
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
|
||||||
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||||
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
||||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||||
|
from pdflib.cmap import CMapDB
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
|
@ -58,6 +59,9 @@ class PDFResourceManager(object):
|
||||||
#raise PDFResourceError('ProcSet %r is not supported.' % proc)
|
#raise PDFResourceError('ProcSet %r is not supported.' % proc)
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def get_cmap(self, cmapname, strict=False):
|
||||||
|
return CMapDB.get_cmap(cmapname, strict=strict)
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid, spec):
|
||||||
if objid and objid in self.fonts:
|
if objid and objid in self.fonts:
|
||||||
|
@ -75,16 +79,16 @@ class PDFResourceManager(object):
|
||||||
subtype = 'Type1'
|
subtype = 'Type1'
|
||||||
if subtype in ('Type1', 'MMType1'):
|
if subtype in ('Type1', 'MMType1'):
|
||||||
# Type1 Font
|
# Type1 Font
|
||||||
font = PDFType1Font(spec)
|
font = PDFType1Font(self, spec)
|
||||||
elif subtype == 'TrueType':
|
elif subtype == 'TrueType':
|
||||||
# TrueType Font
|
# TrueType Font
|
||||||
font = PDFTrueTypeFont(spec)
|
font = PDFTrueTypeFont(self, spec)
|
||||||
elif subtype == 'Type3':
|
elif subtype == 'Type3':
|
||||||
# Type3 Font
|
# Type3 Font
|
||||||
font = PDFType3Font(spec)
|
font = PDFType3Font(self, spec)
|
||||||
elif subtype in ('CIDFontType0', 'CIDFontType2'):
|
elif subtype in ('CIDFontType0', 'CIDFontType2'):
|
||||||
# CID Font
|
# CID Font
|
||||||
font = PDFCIDFont(spec)
|
font = PDFCIDFont(self, spec)
|
||||||
elif subtype == 'Type0':
|
elif subtype == 'Type0':
|
||||||
# Type0 Font
|
# Type0 Font
|
||||||
dfonts = list_value(spec['DescendantFonts'])
|
dfonts = list_value(spec['DescendantFonts'])
|
||||||
|
@ -535,16 +539,17 @@ class PDFPageInterpreter(object):
|
||||||
self.device.render_string(textstate, textmatrix, seq)
|
self.device.render_string(textstate, textmatrix, seq)
|
||||||
font = textstate.font
|
font = textstate.font
|
||||||
s = ''.join( x for x in seq if isinstance(x, str) )
|
s = ''.join( x for x in seq if isinstance(x, str) )
|
||||||
n = sum( x for x in seq if not isinstance(x, str) )
|
w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize +
|
||||||
w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace
|
len(s) * textstate.charspace)
|
||||||
if not font.is_multibyte():
|
|
||||||
w += s.count(' ')*textstate.wordspace
|
|
||||||
w *= (textstate.scaling * .01)
|
|
||||||
(lx,ly) = textstate.linematrix
|
(lx,ly) = textstate.linematrix
|
||||||
if font.is_vertical():
|
if font.is_vertical():
|
||||||
ly += w
|
# advance vertically
|
||||||
|
ly += w * (textstate.scaling * .01)
|
||||||
else:
|
else:
|
||||||
lx += w
|
# advance horizontally
|
||||||
|
if not font.is_multibyte():
|
||||||
|
w += s.count(' ')*textstate.wordspace
|
||||||
|
lx += w * (textstate.scaling * .01)
|
||||||
textstate.linematrix = (lx,ly)
|
textstate.linematrix = (lx,ly)
|
||||||
return
|
return
|
||||||
# show
|
# show
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
CDBCMAPDIR=../CDBCMap
|
CDBCMAPDIR=../CDBCMap
|
||||||
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt
|
PDF2TXT=PYTHONPATH=.. $(PYTHON) -m pdflib.pdf2txt
|
||||||
|
|
||||||
HTMLS= \
|
HTMLS= \
|
||||||
simple1.html \
|
simple1.html \
|
||||||
|
@ -22,4 +22,4 @@ clean:
|
||||||
|
|
||||||
.SUFFIXES: .pdf .html
|
.SUFFIXES: .pdf .html
|
||||||
.pdf.html:
|
.pdf.html:
|
||||||
$(PDF2TXT) -D$(CDBCMAPDIR) -H -o $@ $<
|
$(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $<
|
||||||
|
|
Loading…
Reference in New Issue