From ba277fb5a00ad5e719d478c01d9df7ae54e93ee0 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 10 Jan 2009 10:45:49 +0000 Subject: [PATCH] handling type3 font size correctly. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@60 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/pdfdevice.py | 24 ++++++++++++++---------- pdflib/pdffont.py | 44 +++++++++++++++++++++++++++++--------------- pdflib/pdfinterp.py | 29 +++++++++++++++++------------ samples/Makefile | 4 ++-- 4 files changed, 62 insertions(+), 39 deletions(-) diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index df5c304..18954cc 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -90,11 +90,11 @@ class TextItem(object): self.direction = 0 self.text = '' scaling *= .01 + size = (font.get_ascent() - font.get_descent()) * fontsize if not self.font.is_vertical(): - spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width + # horizontal text + spwidth = font.char_width(32) * self.SPACE_WIDTH # space width self.direction = 1 - (_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001)) - ty += descent w = 0 dx = 0 prev = ' ' @@ -106,14 +106,18 @@ class TextItem(object): self.text += char prev = char dx = 0 - w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling + w += (font.char_width(ord(char)) * fontsize + charspace) * scaling else: + t *= .001 dx -= t - w += t * fontsize * .001 * scaling - (w,h) = apply_matrix_norm(self.matrix, (w,fontsize)) + w += t * fontsize * scaling + (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) + ty += descent + (w,h) = apply_matrix_norm(self.matrix, (w,size)) self.adv = (w, 0) self.bbox = (tx, ty, tx+w, ty+h) else: + # vertical text self.direction = 2 disp = 0 h = 0 @@ -122,19 +126,19 @@ class TextItem(object): (disp,char) = t (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) self.text += char - h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling + h += (font.char_width(ord(char)) * fontsize + charspace) * scaling break for t in text: if isinstance(t, tuple): (_,char) = t self.text += char - h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling - (w,h) = apply_matrix_norm(self.matrix, (fontsize,h)) + h += (font.char_width(ord(char)) * fontsize + charspace) * scaling + (w,h) = apply_matrix_norm(self.matrix, (size,h)) tx -= w/2 ty += disp self.adv = (0, h) self.bbox = (tx, ty+h, tx+w, ty) - self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize))) + self.fontsize = max(apply_matrix_norm(self.matrix, (size,size))) return def __repr__(self): diff --git a/pdflib/pdffont.py b/pdflib/pdffont.py index 30f6c1d..d929bf7 100644 --- a/pdflib/pdffont.py +++ b/pdflib/pdffont.py @@ -12,6 +12,7 @@ from pdflib.pdftypes import PDFException, \ resolve1, int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB +from utils import apply_matrix_norm ## Fonts @@ -26,7 +27,7 @@ LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') # PDFFont class PDFFont(object): - def __init__(self, descriptor, widths, default_width=None, font_matrix=None): + def __init__(self, descriptor, widths, default_width=None): self.descriptor = descriptor self.widths = widths self.fontname = descriptor.get('FontName', 'unknown') @@ -37,7 +38,6 @@ class PDFFont(object): self.default_width = default_width or descriptor.get('MissingWidth', 0) self.leading = num_value(descriptor.get('Leading', 0)) self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) - self.font_matrix = font_matrix or (.001,0,0,.001,0,0) return def __repr__(self): @@ -52,8 +52,13 @@ class PDFFont(object): def decode(self, bytes): return map(ord, bytes) + def get_ascent(self): + return self.ascent * .001 + def get_descent(self): + return self.descent * .001 + def char_width(self, cid): - return self.widths.get(cid, self.default_width) + return self.widths.get(cid, self.default_width) * .001 def char_disp(self, cid): return 0 @@ -61,10 +66,11 @@ class PDFFont(object): def string_width(self, s): return sum( self.char_width(cid) for cid in self.decode(s) ) + # PDFSimpleFont class PDFSimpleFont(PDFFont): - def __init__(self, descriptor, widths, spec, font_matrix=None): + def __init__(self, descriptor, widths, spec): # Font encoding is specified either by a name of # built-in encoding or a dictionary that describes # the differences. @@ -83,7 +89,7 @@ class PDFSimpleFont(PDFFont): strm = stream_value(spec['ToUnicode']) self.ucs2_cmap = CMap() CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() - PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix) + PDFFont.__init__(self, descriptor, widths) return def to_unicode(self, cid): @@ -102,7 +108,7 @@ class PDFSimpleFont(PDFFont): # PDFType1Font class PDFType1Font(PDFSimpleFont): - def __init__(self, spec): + def __init__(self, rsrc, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: @@ -132,7 +138,7 @@ class PDFTrueTypeFont(PDFType1Font): # PDFType3Font class PDFType3Font(PDFSimpleFont): - def __init__(self, spec): + def __init__(self, rsrc, spec): firstchar = int_value(spec.get('FirstChar', 0)) lastchar = int_value(spec.get('LastChar', 0)) widths = list_value(spec.get('Widths', [0]*256)) @@ -143,13 +149,23 @@ class PDFType3Font(PDFSimpleFont): descriptor = {'FontName':spec.get('Name'), 'Ascent':0, 'Descent':0, 'FontBBox':spec['FontBBox']} - PDFSimpleFont.__init__(self, descriptor, widths, spec, - font_matrix=tuple(list_value(spec.get('FontMatrix')))) + PDFSimpleFont.__init__(self, descriptor, widths, spec) + self.matrix = tuple(list_value(spec.get('FontMatrix'))) + (_,self.descent,_,self.ascent) = self.bbox + (self.hscale,self.vscale) = apply_matrix_norm(self.matrix, (1,1)) return def __repr__(self): return '' + def get_ascent(self): + return self.ascent * self.vscale + def get_descent(self): + return self.descent * self.vscale + + def char_width(self, cid): + return self.widths.get(cid, self.default_width) * self.hscale + # PDFCIDFont @@ -229,7 +245,7 @@ class TrueTypeFont(object): class PDFCIDFont(PDFFont): - def __init__(self, spec): + def __init__(self, rsrc, spec): try: self.basefont = literal_name(spec['BaseFont']) except KeyError: @@ -246,7 +262,7 @@ class PDFCIDFont(PDFFont): raise PDFFontError('Encoding is unspecified') name = 'unknown' try: - self.cmap = CMapDB.get_cmap(name, strict=STRICT) + self.cmap = rsrc.get_cmap(name, strict=STRICT) except CMapDB.CMapNotFound, e: raise PDFFontError(e) try: @@ -273,8 +289,8 @@ class PDFCIDFont(PDFFont): pass else: try: - self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding, - strict=STRICT) + self.ucs2_cmap = rsrc.get_cmap('%s-UCS2' % self.cidcoding, + strict=STRICT) except CMapDB.CMapNotFound, e: raise PDFFontError(e) @@ -336,5 +352,3 @@ class PDFCIDFont(PDFFont): raise PDFUnicodeNotDefined(self.cidcoding, cid) chars = unpack('>%dH' % (len(code)/2), code) return ''.join( unichr(c) for c in chars ) - - diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 280c317..90ca1f8 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -12,10 +12,11 @@ from pdflib.psparser import PSException, PSTypeError, PSEOF, \ from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \ resolve1, int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value -from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY +from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK +from pdflib.cmap import CMapDB ## Exceptions @@ -58,6 +59,9 @@ class PDFResourceManager(object): #raise PDFResourceError('ProcSet %r is not supported.' % proc) pass return + + def get_cmap(self, cmapname, strict=False): + return CMapDB.get_cmap(cmapname, strict=strict) def get_font(self, objid, spec): if objid and objid in self.fonts: @@ -75,16 +79,16 @@ class PDFResourceManager(object): subtype = 'Type1' if subtype in ('Type1', 'MMType1'): # Type1 Font - font = PDFType1Font(spec) + font = PDFType1Font(self, spec) elif subtype == 'TrueType': # TrueType Font - font = PDFTrueTypeFont(spec) + font = PDFTrueTypeFont(self, spec) elif subtype == 'Type3': # Type3 Font - font = PDFType3Font(spec) + font = PDFType3Font(self, spec) elif subtype in ('CIDFontType0', 'CIDFontType2'): # CID Font - font = PDFCIDFont(spec) + font = PDFCIDFont(self, spec) elif subtype == 'Type0': # Type0 Font dfonts = list_value(spec['DescendantFonts']) @@ -535,16 +539,17 @@ class PDFPageInterpreter(object): self.device.render_string(textstate, textmatrix, seq) font = textstate.font s = ''.join( x for x in seq if isinstance(x, str) ) - n = sum( x for x in seq if not isinstance(x, str) ) - w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace - if not font.is_multibyte(): - w += s.count(' ')*textstate.wordspace - w *= (textstate.scaling * .01) + w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize + + len(s) * textstate.charspace) (lx,ly) = textstate.linematrix if font.is_vertical(): - ly += w + # advance vertically + ly += w * (textstate.scaling * .01) else: - lx += w + # advance horizontally + if not font.is_multibyte(): + w += s.count(' ')*textstate.wordspace + lx += w * (textstate.scaling * .01) textstate.linematrix = (lx,ly) return # show diff --git a/samples/Makefile b/samples/Makefile index 757137c..67e3534 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -2,7 +2,7 @@ PYTHON=python CDBCMAPDIR=../CDBCMap -PDF2TXT=PYTHONPATH=.. $(PYTHON) -m tools.pdf2txt +PDF2TXT=PYTHONPATH=.. $(PYTHON) -m pdflib.pdf2txt HTMLS= \ simple1.html \ @@ -22,4 +22,4 @@ clean: .SUFFIXES: .pdf .html .pdf.html: - $(PDF2TXT) -D$(CDBCMAPDIR) -H -o $@ $< + $(PDF2TXT) -D$(CDBCMAPDIR) -o $@ $<