attempt to recover encoding info from texfont
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@252 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
e0db043260
commit
0ecd0b8f9d
|
@ -5,9 +5,11 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
|
from cmapdb import CMapDB, CMapParser, FileUnicodeMap, CMap
|
||||||
from encodingdb import EncodingDB
|
from encodingdb import EncodingDB, name2unicode
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
from psparser import LIT, STRICT
|
from psparser import PSStackParser
|
||||||
|
from psparser import PSSyntaxError, PSEOF
|
||||||
|
from psparser import LIT, KWD, STRICT
|
||||||
from psparser import PSLiteral, literal_name
|
from psparser import PSLiteral, literal_name
|
||||||
from pdftypes import PDFException, resolve1
|
from pdftypes import PDFException, resolve1
|
||||||
from pdftypes import int_value, float_value, num_value
|
from pdftypes import int_value, float_value, num_value
|
||||||
|
@ -70,6 +72,46 @@ class FontMetricsDB(object):
|
||||||
return FONT_METRICS[fontname]
|
return FONT_METRICS[fontname]
|
||||||
|
|
||||||
|
|
||||||
|
## Type1FontHeaderParser
|
||||||
|
##
|
||||||
|
class Type1FontHeaderParser(PSStackParser):
    """Collect (character code, glyph name) pairs from a font header.

    Every ``put`` keyword whose two operands are an integer and a
    literal name is recorded as a result; get_encoding() then converts
    the recorded glyph names to unicode with name2unicode().
    """

    # Interned keyword objects.  Only KEYWORD_PUT is used by this class;
    # the others are declared for reference.
    KEYWORD_BEGIN = KWD('begin')
    KEYWORD_END = KWD('end')
    KEYWORD_DEF = KWD('def')
    KEYWORD_PUT = KWD('put')
    KEYWORD_DICT = KWD('dict')
    KEYWORD_ARRAY = KWD('array')
    KEYWORD_READONLY = KWD('readonly')
    KEYWORD_FOR = KWD('for')

    def __init__(self, data):
        """Create a parser over `data`, which is handed to PSStackParser."""
        PSStackParser.__init__(self, data)
        # Maps character code -> value returned by name2unicode().
        self._cid2unicode = {}
        return

    def get_encoding(self):
        """Parse the whole input and return the cid -> unicode mapping.

        Objects are read with nextobject() until PSEOF is raised.  Each
        object is a (cid, name) pair as produced by do_keyword().
        """
        while True:
            try:
                (cid, name) = self.nextobject()
            except PSEOF:
                break
            try:
                self._cid2unicode[cid] = name2unicode(name)
            except KeyError:
                # Best effort: a name that name2unicode() cannot resolve
                # (presumably an unknown glyph name) is simply skipped.
                pass
        return self._cid2unicode

    def do_keyword(self, pos, token):
        """Handle a keyword token; only ``put`` is acted upon.

        For ``put`` the top two stack entries are popped.  Each entry is
        a (position, object) pair; the positions are ignored.
        """
        if token is self.KEYWORD_PUT:
            ((_, key), (_, value)) = self.pop(2)
            # Only "<int> /<name> put" contributes to the encoding.
            if isinstance(key, int) and isinstance(value, PSLiteral):
                self.add_results((key, literal_name(value)))
        return
|
||||||
|
|
||||||
|
|
||||||
## CFFFont
|
## CFFFont
|
||||||
## (Format specified in Adobe Technical Note: #5176
|
## (Format specified in Adobe Technical Note: #5176
|
||||||
## "The Compact Font Format Specification")
|
## "The Compact Font Format Specification")
|
||||||
|
@ -445,9 +487,9 @@ class PDFSimpleFont(PDFFont):
|
||||||
if isinstance(encoding, dict):
|
if isinstance(encoding, dict):
|
||||||
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
|
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
|
||||||
diff = list_value(encoding.get('Differences', None))
|
diff = list_value(encoding.get('Differences', None))
|
||||||
self.encoding = EncodingDB.get_encoding(name, diff)
|
self.cid2unicode = EncodingDB.get_encoding(name, diff)
|
||||||
else:
|
else:
|
||||||
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
|
self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
|
||||||
self.unicode_map = None
|
self.unicode_map = None
|
||||||
if 'ToUnicode' in spec:
|
if 'ToUnicode' in spec:
|
||||||
strm = stream_value(spec['ToUnicode'])
|
strm = stream_value(spec['ToUnicode'])
|
||||||
|
@ -463,7 +505,7 @@ class PDFSimpleFont(PDFFont):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
return self.encoding[cid]
|
return self.cid2unicode[cid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise PDFUnicodeNotDefined(None, cid)
|
raise PDFUnicodeNotDefined(None, cid)
|
||||||
|
|
||||||
|
@ -486,6 +528,13 @@ class PDFType1Font(PDFSimpleFont):
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
|
if 'Encoding' not in spec and 'FontFile' in descriptor:
|
||||||
|
# try to recover the missing encoding info from the font file.
|
||||||
|
self.fontfile = stream_value(descriptor.get('FontFile'))
|
||||||
|
length1 = int_value(self.fontfile['Length1'])
|
||||||
|
data = self.fontfile.get_data()[:length1]
|
||||||
|
parser = Type1FontHeaderParser(StringIO(data))
|
||||||
|
self.cid2unicode = parser.get_encoding()
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
|
|
@ -99,8 +99,8 @@ PSLiteralTable = PSSymbolTable(PSLiteral)
|
||||||
PSKeywordTable = PSSymbolTable(PSKeyword)
|
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||||
LIT = PSLiteralTable.intern
|
LIT = PSLiteralTable.intern
|
||||||
KWD = PSKeywordTable.intern
|
KWD = PSKeywordTable.intern
|
||||||
KEYWORD_BRACE_BEGIN = KWD('{')
|
KEYWORD_PROC_BEGIN = KWD('{')
|
||||||
KEYWORD_BRACE_END = KWD('}')
|
KEYWORD_PROC_END = KWD('}')
|
||||||
KEYWORD_ARRAY_BEGIN = KWD('[')
|
KEYWORD_ARRAY_BEGIN = KWD('[')
|
||||||
KEYWORD_ARRAY_END = KWD(']')
|
KEYWORD_ARRAY_END = KWD(']')
|
||||||
KEYWORD_DICT_BEGIN = KWD('<<')
|
KEYWORD_DICT_BEGIN = KWD('<<')
|
||||||
|
@ -542,6 +542,15 @@ class PSStackParser(PSBaseParser):
|
||||||
self.push((pos, d))
|
self.push((pos, d))
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT: raise
|
if STRICT: raise
|
||||||
|
elif token == KEYWORD_PROC_BEGIN:
|
||||||
|
# begin proc
|
||||||
|
self.start_type(pos, 'p')
|
||||||
|
elif token == KEYWORD_PROC_END:
|
||||||
|
# end proc
|
||||||
|
try:
|
||||||
|
self.push(self.end_type('p'))
|
||||||
|
except PSTypeError:
|
||||||
|
if STRICT: raise
|
||||||
else:
|
else:
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
print >>stderr, 'do_keyword: pos=%r, token=%r, stack=%r' % \
|
||||||
|
|
Loading…
Reference in New Issue