diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index bd5a9f1..5099136 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -5,9 +5,11 @@ try: except ImportError: from StringIO import StringIO from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap -from encodingdb import EncodingDB +from encodingdb import EncodingDB, name2unicode from struct import pack, unpack -from psparser import LIT, STRICT +from psparser import PSStackParser +from psparser import PSSyntaxError, PSEOF +from psparser import LIT, KWD, STRICT from psparser import PSLiteral, literal_name from pdftypes import PDFException, resolve1 from pdftypes import int_value, float_value, num_value @@ -70,6 +72,46 @@ class FontMetricsDB(object): return FONT_METRICS[fontname] +## Type1FontHeaderParser +## +class Type1FontHeaderParser(PSStackParser): + + KEYWORD_BEGIN = KWD('begin') + KEYWORD_END = KWD('end') + KEYWORD_DEF = KWD('def') + KEYWORD_PUT = KWD('put') + KEYWORD_DICT = KWD('dict') + KEYWORD_ARRAY = KWD('array') + KEYWORD_READONLY = KWD('readonly') + KEYWORD_FOR = KWD('for') + KEYWORD_FOR = KWD('for') + + def __init__(self, data): + PSStackParser.__init__(self, data) + self._cid2unicode = {} + return + + def get_encoding(self): + while 1: + try: + (cid,name) = self.nextobject() + except PSEOF: + break + try: + self._cid2unicode[cid] = name2unicode(name) + except KeyError: + pass + return self._cid2unicode + + def do_keyword(self, pos, token): + if token is self.KEYWORD_PUT: + ((_,key),(_,value)) = self.pop(2) + if (isinstance(key, int) and + isinstance(value, PSLiteral)): + self.add_results((key, literal_name(value))) + return + + ## CFFFont ## (Format specified in Adobe Technical Note: #5176 ## "The Compact Font Format Specification") @@ -445,9 +487,9 @@ class PDFSimpleFont(PDFFont): if isinstance(encoding, dict): name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) diff = list_value(encoding.get('Differences', None)) - self.encoding = EncodingDB.get_encoding(name, diff) + self.cid2unicode = EncodingDB.get_encoding(name, diff) else: - self.encoding = EncodingDB.get_encoding(literal_name(encoding)) + self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding)) self.unicode_map = None if 'ToUnicode' in spec: strm = stream_value(spec['ToUnicode']) @@ -463,7 +505,7 @@ class PDFSimpleFont(PDFFont): except KeyError: pass try: - return self.encoding[cid] + return self.cid2unicode[cid] except KeyError: raise PDFUnicodeNotDefined(None, cid) @@ -486,6 +528,13 @@ class PDFType1Font(PDFSimpleFont): widths = list_value(spec.get('Widths', [0]*256)) widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) PDFSimpleFont.__init__(self, descriptor, widths, spec) + if 'Encoding' not in spec and 'FontFile' in descriptor: + # try to recover the missing encoding info from the font file. + self.fontfile = stream_value(descriptor.get('FontFile')) + length1 = int_value(self.fontfile['Length1']) + data = self.fontfile.get_data()[:length1] + parser = Type1FontHeaderParser(StringIO(data)) + self.cid2unicode = parser.get_encoding() return def __repr__(self): diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index 489f561..8041d94 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -99,8 +99,8 @@ PSLiteralTable = PSSymbolTable(PSLiteral) PSKeywordTable = PSSymbolTable(PSKeyword) LIT = PSLiteralTable.intern KWD = PSKeywordTable.intern -KEYWORD_BRACE_BEGIN = KWD('{') -KEYWORD_BRACE_END = KWD('}') +KEYWORD_PROC_BEGIN = KWD('{') +KEYWORD_PROC_END = KWD('}') KEYWORD_ARRAY_BEGIN = KWD('[') KEYWORD_ARRAY_END = KWD(']') KEYWORD_DICT_BEGIN = KWD('<<') @@ -542,6 +542,15 @@ class PSStackParser(PSBaseParser): self.push((pos, d)) except PSTypeError: if STRICT: raise + elif token == KEYWORD_PROC_BEGIN: + # begin proc + self.start_type(pos, 'p') + elif token == KEYWORD_PROC_END: + # end proc + try: + self.push(self.end_type('p')) + except PSTypeError: + if STRICT: raise else: if 2 <= self.debug: print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \