2007-12-30 09:13:51 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
# pdfparser.py, Yusuke Shinyama
|
|
|
|
# ver 0.1, Dec 24 2004-
|
|
|
|
# ver 0.2, Dec 24 2007
|
|
|
|
|
|
|
|
# TODO:
|
2007-12-31 02:40:32 +00:00
|
|
|
# - Code Documentation.
|
|
|
|
# - Error handling for invalid type.
|
2007-12-30 09:13:51 +00:00
|
|
|
|
|
|
|
# - Outlines.
|
|
|
|
# - Named Objects. (pages)
|
|
|
|
# - Writers.
|
|
|
|
# - Linearized PDF.
|
|
|
|
# - Encryption?
|
|
|
|
|
|
|
|
import sys, re
|
|
|
|
from struct import pack, unpack
|
|
|
|
try:
|
|
|
|
from cStringIO import StringIO
|
|
|
|
except ImportError:
|
|
|
|
from StringIO import StringIO
|
|
|
|
try:
|
|
|
|
import cdb
|
|
|
|
except ImportError:
|
|
|
|
import pycdb as cdb
|
|
|
|
stderr = sys.stderr
|
|
|
|
|
|
|
|
|
|
|
|
## Utilities
|
|
|
|
##
|
|
|
|
def choplist(n, seq):
    '''Yields successive n-element tuples taken from seq.

    Items are consumed lazily, in order.  A trailing group that
    holds fewer than n items is dropped without being yielded.
    '''
    chunk = []
    for item in seq:
        chunk.append(item)
        if len(chunk) != n:
            continue
        # the group is complete: hand it out and start a fresh one.
        yield tuple(chunk)
        chunk = []
|
|
|
|
|
|
|
|
def nunpack(s, default=0):
    '''Unpacks up to 4 bytes as a big-endian unsigned integer.

    s       -- string of 0 to 4 bytes.
    default -- value returned when s is empty.

    Raises TypeError when s is longer than 4 bytes.  (This used to
    *return* the exception object, so callers silently received a
    TypeError instance where they expected a number.)
    '''
    l = len(s)
    if not l:
        return default
    if l == 1:
        return ord(s)
    if l == 2:
        return unpack('>H', s)[0]
    if l == 3:
        # pad to 4 bytes so that it can be read as an unsigned long.
        return unpack('>L', '\x00'+s)[0]
    if l == 4:
        return unpack('>L', s)[0]
    raise TypeError('invalid length: %d' % l)
|
|
|
|
|
|
|
|
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Each matrix is a 6-tuple (a,b,c,d,e,f).  With the convention used
    by apply_matrix (x' = a*x+c*y+e, y' = b*x+d*y+f), the result is
    the matrix that applies m1 first and m0 second.

    The tuples are unpacked in the body rather than in the signature:
    tuple parameters were removed from the language (PEP 3113).
    Positional callers are unaffected.
    '''
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1,    b0*a1+d0*b1,
            a0*c1+c0*d1,    b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
|
|
|
|
|
|
|
def apply_matrix(m, p):
    '''Applies a matrix to a coordinate.

    m -- 6-tuple (a,b,c,d,e,f).
    p -- 2-tuple (x,y).

    Returns (a*x+c*y+e, b*x+d*y+f).

    The tuples are unpacked in the body rather than in the signature:
    tuple parameters were removed from the language (PEP 3113).
    Positional callers are unaffected.
    '''
    (a, b, c, d, e, f) = m
    (x, y) = p
    return (a*x+c*y+e, b*x+d*y+f)
|
|
|
|
|
|
|
|
|
|
|
|
## Exceptions
|
|
|
|
##
|
|
|
|
# PostScript-level errors.
class PSException(Exception):
    '''Base class of every exception defined here.'''

class PSSyntaxError(PSException):
    '''Malformed PostScript input.'''

class PSTypeError(PSException):
    '''PostScript object of an unexpected type.'''

class PSValueError(PSException):
    '''PostScript object with an unacceptable value.'''


# PDF-level errors (all of them are PSExceptions too).
class PDFException(PSException):
    '''Base class of the PDF-specific exceptions.'''

class PDFSyntaxError(PDFException):
    '''Malformed PDF input.'''

class PDFEncrypted(PDFException):
    '''Encrypted document.'''

class PDFTypeError(PDFException):
    '''PDF object of an unexpected type.'''

class PDFValueError(PDFException):
    '''PDF object with an unacceptable value.'''

class PDFResourceError(PDFException):
    '''Resource-related error.'''

class PDFInterpreterError(PDFException):
    '''Interpreter-related error.'''

class PDFFontError(PDFException):
    '''Font-related error.'''

class PDFUnicodeNotDefined(PDFFontError):
    '''Font-related error: no Unicode mapping defined.'''
|
|
|
|
|
|
|
|
|
|
|
|
## PostScript Types
|
|
|
|
##
|
|
|
|
class PSLiteral:
    '''
    A PostScript literal name (written "/Name" in the source).

    Caution: do not instantiate this class yourself;
    obtain instances through PSLiteralTable.intern().
    '''

    def __init__(self, name):
        # name is stored without the leading slash.
        self.name = name

    def __repr__(self):
        # shown the way it is written in PostScript.
        return '/%s' % self.name
|
|
|
|
|
|
|
|
class PSKeyword:
    '''
    A PostScript keyword (e.g. "showpage").

    Caution: do not instantiate this class yourself;
    obtain instances through PSKeywordTable.intern().
    '''

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # a keyword is displayed as its bare name.
        return self.name
|
|
|
|
|
|
|
|
class PSSymbolTable:
    '''
    Interning table for PSLiteral or PSKeyword objects.

    Each distinct name maps to exactly one object, which is created
    on first request by calling the class given to the constructor.
    '''

    def __init__(self, classe):
        self.dic = {}          # name -> interned object
        self.classe = classe   # factory called as classe(name)

    def intern(self, name):
        '''Returns the unique object for name, creating it if needed.'''
        try:
            return self.dic[name]
        except KeyError:
            pass
        symbol = self.classe(name)
        self.dic[name] = symbol
        return symbol
|
|
|
|
|
|
|
|
# The two module-wide symbol tables: one for literals, one for keywords.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)

# some predefined literals and keywords.

# -- object/stream type names
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')

# -- miscellaneous names
LITERAL_PDF = PSLiteralTable.intern('PDF')
LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')

# -- keywords
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_EI = PSKeywordTable.intern('EI')
|
|
|
|
|
|
|
|
|
|
|
|
## CMap
|
|
|
|
##
|
|
|
|
class CMap:
    '''
    In-memory character map.

    code2cid -- maps a character code (string) to a CID (integer).
    cid2code -- maps a CID (integer) to a code (string).
    attrs    -- attributes of the CMap (e.g. 'CMapName', 'WMode').
    '''

    def __init__(self, debug=0):
        # honour the requested debug level.  It used to be forced to 0,
        # so the trace in decode() could never be enabled even though
        # subclasses pass debug=... through to this constructor.
        self.debug = debug
        self.code2cid = {}
        self.cid2code = {}
        self.attrs = {}
        return

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def update(self, code2cid=None, cid2code=None):
        '''Merges the given mappings into this CMap. Returns self.'''
        if code2cid:
            self.code2cid.update(code2cid)
        if cid2code:
            self.cid2code.update(cid2code)
        return self

    def copycmap(self, cmap):
        '''Copies every mapping of another CMap into this one. Returns self.'''
        self.code2cid.update(cmap.getall_code2cid())
        self.cid2code.update(cmap.getall_cid2code())
        return self

    def register_code2cid(self, code, cid):
        '''Registers a code (string) -> CID (integer) mapping. Returns self.'''
        assert isinstance(code, str)
        assert isinstance(cid, int)
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        '''
        Registers a CID (integer) -> code mapping. Returns self.
        When code is a PSLiteral it is taken as a glyph name and
        replaced by the 2-byte big-endian value that glyphlist's
        charname2unicode table gives for that name.
        '''
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            # the glyph list is needed only for named glyphs,
            # so it is imported only on this path.
            from glyphlist import charname2unicode
            code = pack('>H', charname2unicode[code.name])
        self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        '''
        Yields the CIDs for a string of character codes.
        A character that is not a known code by itself is kept as a
        pending prefix and tried again together with the next character;
        if that pair is unknown too, both characters are skipped.
        '''
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                # second half of a two-character code.
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            else:
                x = c
        return

    def is_vertical(self):
        '''Returns true if the WMode attribute is 1.'''
        # CMapParser stores the parsed token, i.e. the integer 1, while
        # the old test recognized only the string '1'.  Accept both.
        return self.attrs.get('WMode', '0') in (1, '1')

    def tocid(self, code):
        '''Returns the CID for a code, or None if it is unknown.'''
        return self.code2cid.get(code)

    def tocode(self, cid):
        '''Returns the code for a CID, or None if it is unknown.'''
        return self.cid2code.get(cid)

    def getall_attrs(self):
        return self.attrs.iteritems()

    def getall_code2cid(self):
        return self.code2cid.iteritems()

    def getall_cid2code(self):
        return self.cid2code.iteritems()
|
|
|
|
|
|
|
|
|
|
|
|
## CDBCMap
|
|
|
|
##
|
|
|
|
class CDBCMap(CMap):
    '''
    CMap backed by a cdb file.

    Record layout, as used by the accessors below:
      'c'+code            -> CID packed as '>L'
      'i'+pack('>L', cid) -> code
      '/'+name            -> attribute value
    Mappings found by decode() are cached in self.code2cid.
    '''

    def __init__(self, cdbname, debug=0):
        CMap.__init__(self, debug=debug)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        '''Returns the CID (integer) for a code, or None if it is unknown.'''
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() returns a 1-tuple; hand out the integer itself, as
        # CMap.tocid() and decode() do.  (The tuple used to leak out.)
        (cid,) = unpack('>L', self.db[k])
        return cid

    def tocode(self, cid):
        '''Returns the code for a CID, or None if it is unknown.'''
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def _records(self, prefix):
        '''
        Yields (key without its first character, raw value) for each
        record that self.db.each() returns whose key starts with prefix.
        '''
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith(prefix):
                yield (k[1:], v)
        return

    def getall(self, c):
        '''
        Yields (key without prefix, value unpacked as '>L') for
        every record whose key starts with c.
        '''
        for (k,v) in self._records(c):
            yield (k, unpack('>L', v)[0])
        return

    def getall_attrs(self):
        for (k,v) in self._records('/'):
            # NOTE(review): eval() runs whatever text is stored in the
            # cdb file; only open files from a trusted source.
            yield (k, eval(v)[0])
        return

    def getall_cid2code(self):
        '''Yields (cid, code) pairs.'''
        # in 'i' records the *key* holds the packed CID and the value is
        # the code itself (see tocode), so getall() -- which unpacks the
        # value -- cannot be used here.
        for (k,v) in self._records('i'):
            yield (unpack('>L', k)[0], v)
        return

    def getall_code2cid(self):
        '''Yields (code, cid) pairs.'''
        return self.getall('c')

    def decode(self, bytes):
        '''
        Yields the CIDs for a string of character codes.
        Works like CMap.decode(), but falls back on the cdb file for
        codes that are not cached in self.code2cid yet.
        '''
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                # second half of a two-character code.
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return
|
|
|
|
|
|
|
|
|
|
|
|
## CMapDB
|
|
|
|
##
|
|
|
|
class CMapDB:
    '''
    Class-level registry of CMaps.

    CMAP_ALIAS -- alternative name -> real CMap name.
    dirname    -- directory searched for plain CMap files.
    cdbdirname -- directory searched for '<name>.cmap.cdb' files.
    cmapdb     -- cache of the CMaps loaded so far, keyed by name.
    '''

    CMAP_ALIAS = {
    }

    debug = 0
    dirname = None
    cdbdirname = None
    cmapdb = {}

    @classmethod
    def initialize(klass, dirname, cdbdirname=None, debug=0):
        '''Sets the search directories (cdbdirname defaults to dirname).'''
        klass.dirname = dirname
        klass.cdbdirname = cdbdirname or dirname
        klass.debug = debug
        return

    @classmethod
    def get_cmap(klass, cmapname):
        '''
        Returns the CMap for cmapname, loading and caching it on first use.
        A cdb file is preferred over a plain CMap file.

        Raises KeyError if neither file exists.  (This case used to
        fail with an UnboundLocalError on the local variable.)
        '''
        import os.path
        cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in klass.cmapdb:
            return klass.cmapdb[cmapname]
        fname = os.path.join(klass.dirname, cmapname)
        cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
        if os.path.exists(cdbname):
            if 1 <= klass.debug:
                print >>stderr, 'Opening: CDBCMap %r...' % cdbname
            cmap = CDBCMap(cdbname)
        elif os.path.exists(fname):
            if 1 <= klass.debug:
                print >>stderr, 'Reading: CMap %r...' % fname
            cmap = CMap()
            fp = open(fname)
            try:
                CMapParser(cmap, fp).parse()
            finally:
                # do not leak the file handle when parsing fails.
                fp.close()
        else:
            raise KeyError(cmapname)
        klass.cmapdb[cmapname] = cmap
        return cmap
|
|
|
|
|
|
|
|
|
|
|
|
## FontMetricsDB
|
|
|
|
##
|
|
|
|
class FontMetricsDB:
    '''Access point for the FONT_METRICS table of the fontmetrics module.'''

    # imported in the class body, so the table becomes a class attribute.
    from fontmetrics import FONT_METRICS

    @classmethod
    def get_metrics(klass, fontname):
        '''Returns the FONT_METRICS entry stored under fontname.'''
        table = klass.FONT_METRICS
        return table[fontname]
|
|
|
|
|
|
|
|
|
|
|
|
## EncodingDB
|
|
|
|
##
|
|
|
|
class EncodingDB:
    '''
    Character code -> unicode character tables for the four encodings
    listed in latin_enc.ENCODING, built once when the class is defined.
    '''

    from glyphlist import charname2unicode
    from latin_enc import ENCODING

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # every ENCODING row is: glyph name, then its code in each of the
    # four encodings; a false code is not entered into that table.
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(charname2unicode[name])
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c
    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
        }

    @classmethod
    def get_encoding(klass, name, diff=None):
        '''
        Returns the code -> unicode table for the named encoding
        (std2unicode when the name is unknown).

        diff, if given, is a sequence of integers and PSLiterals:
        an integer sets the current code, and each PSLiteral assigns
        its glyph to the current code and then advances the code.
        Glyph names missing from charname2unicode are skipped (the
        code still advances).  The shared table is never modified:
        a copy is returned whenever diff is non-empty.
        '''
        cid2unicode = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return cid2unicode
        cid2unicode = cid2unicode.copy()
        cid = 0
        for x in diff:
            if isinstance(x, int):
                cid = x
                continue
            if not isinstance(x, PSLiteral):
                continue
            try:
                cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name])
            except KeyError:
                pass
            cid += 1
        return cid2unicode
|
|
|
|
|
2007-12-31 02:40:32 +00:00
|
|
|
|
|
|
|
## Color Spaces
|
|
|
|
##
|
|
|
|
# Names of the colour spaces that the code refers to directly.
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')

# Colour space name -> value returned by cs_params() for that space.
# (Literals are interned, so the LITERAL_DEVICE_* constants are the very
# objects that interning the same names again would return.)
CS_COMPONENTS = {
    LITERAL_DEVICE_GRAY: 1,
    LITERAL_DEVICE_RGB: 3,
    LITERAL_DEVICE_CMYK: 4,
    PSLiteralTable.intern('CalGray'): 1,
    PSLiteralTable.intern('CalRGB'): 3,
    PSLiteralTable.intern('Lab'): 3,
    PSLiteralTable.intern('Separation'): 1,
    PSLiteralTable.intern('Indexed'): 1,
    PSLiteralTable.intern('Pattern'): 1,
    }
|
|
|
|
|
|
|
|
def cs_params(cs):
    '''
    Returns the parameter count for a colour space array cs,
    whose first element is the colour space name:
      ICCBased -- the 'N' entry of the stream in cs[1];
      DeviceN  -- the length of the list in cs[1];
      others   -- the CS_COMPONENTS entry (KeyError if unknown).
    '''
    kind = cs[0]
    if kind == LITERAL_ICC_BASED:
        profile = stream_value(cs[1])
        return profile.dic['N']
    if kind == LITERAL_DEVICE_N:
        names = list_value(cs[1])
        return len(names)
    return CS_COMPONENTS[kind]
|
|
|
|
|
|
|
|
|
2007-12-30 09:13:51 +00:00
|
|
|
## PSBaseParser
|
|
|
|
##
|
|
|
|
class PSBaseParser:

    '''
    PostScript parser that performs only basic tokenization.

    fp must support seek(), tell() and read().  The parser keeps
    its own line buffer, so fp must not be read behind its back.
    '''

    def __init__(self, fp, debug=0):
        self.fp = fp
        self.debug = debug
        self.bufsize = 4096
        self.seek(0)
        return

    def __repr__(self):
        return '<PSBaseParser: %r>' % (self.fp,)

    def seek(self, pos):
        '''
        seeks to the given pos and discards the buffered data.
        '''
        if 2 <= self.debug:
            print >>stderr, 'seek:', pos
        self.fp.seek(pos)
        self.linepos = pos     # file offset of the next line.
        self.linebuf = None    # chunk that was read last.
        self.curpos = 0        # read position within linebuf.
        self.line = ''
        return

    EOLCHAR = re.compile(r'[\r\n]')
    def nextline(self):
        '''
        fetches the next line that ends either with \\r or \\n.
        A \\r\\n pair counts as one line end.  The end-of-line
        characters are kept.  Returns an empty string at EOF.
        '''
        line = ''
        eol = None
        while 1:
            if not self.linebuf or len(self.linebuf) <= self.curpos:
                # fetch next chunk.
                self.linebuf = self.fp.read(self.bufsize)
                if not self.linebuf:
                    # at EOF.
                    break
                self.curpos = 0
            if eol:
                # a line end was found already; all that is left to do
                # is to check whether a '\n' completes a '\r\n' pair.
                c = self.linebuf[self.curpos]
                if (eol == '\r' and c == '\n'):
                    line += c
                    self.curpos += 1
                break
            m = self.EOLCHAR.search(self.linebuf, self.curpos)
            if m:
                i = m.end(0)
                line += self.linebuf[self.curpos:i]
                eol = self.linebuf[i-1]
                self.curpos = i
            else:
                # fetch further
                line += self.linebuf[self.curpos:]
                self.linebuf = None
        self.linepos += len(line)
        return line

    def revreadlines(self):
        '''
        fetches lines backward. used to locate trailers.
        Each line is yielded together with the end-of-line character
        that precedes it.  The text before the first end-of-line
        character of the file is not yielded.
        '''
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        buf = ''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.bufsize)
            self.fp.seek(pos)
            # read only the part that has not been seen yet.  A full
            # bufsize read would fetch the same bytes twice when fewer
            # than bufsize bytes remain before prevpos.
            s = self.fp.read(prevpos-pos)
            if not s: break
            while 1:
                n = max(s.rfind('\r'), s.rfind('\n'))
                if n == -1:
                    buf = s + buf
                    break
                # buf is the remainder of this line taken from the chunk
                # read before, i.e. text located *after* s in the file.
                yield s[n:]+buf
                s = s[:n]
                buf = ''
        return

    SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
    TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
    LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
    NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
    STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
    STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
    STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
    STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')

    # single-letter escapes that stand for a control character.
    _STRING_ESCAPES = {'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'f': '\f'}

    def parse(self):
        '''
        Yields (pos, token) pairs of basic tokens: keywords, literals,
        strings, numbers and parentheses. Comments are skipped.
        pos is the file offset of the first character of the token.
        Nested objects (i.e. arrays and dictionaries) are not handled.

        Stops at EOF or at a line that starts with "%%EOF".
        Raises PSSyntaxError for an unterminated string.
        '''
        while 1:
            # do not strip line! we need to distinguish last '\n' or '\r'
            linepos0 = self.linepos
            self.line = self.nextline()
            if not self.line: break
            if 2 <= self.debug:
                print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
            # do this before removing comment
            if self.line.startswith('%%EOF'): break
            charpos = 0

            # tokenize
            while 1:
                m = self.TOKEN.search(self.line, charpos)
                if not m: break
                t = m.group(0)
                pos = linepos0 + m.start(0)
                charpos = m.end(0)

                if t == '%':
                    # skip comment
                    if 2 <= self.debug:
                        print >>stderr, 'comment: %r' % self.line[charpos:]
                    break

                elif t == '/':
                    # literal object
                    mn = self.LITERAL.match(self.line, m.start(0)+1)
                    if mn:
                        name = mn.group(0)
                        charpos = mn.end(0)
                    else:
                        # a slash alone is the empty name; there is
                        # no match object to take the text from.
                        name = ''
                    lit = PSLiteralTable.intern(name)
                    yield (pos, lit)
                    if 2 <= self.debug:
                        print >>stderr, 'name: %r' % lit

                elif t == '(':
                    # normal string object
                    s = ''
                    while 1:
                        ms = self.STRING_NORM.match(self.line, charpos)
                        if not ms: break
                        s1 = ms.group(0)
                        charpos = ms.end(0)
                        if len(s1) == 1 and s1[-1] == '\\':
                            s += s1[-1:]
                            # tokens that follow the string sit on the
                            # new line: keep their offsets right.
                            linepos0 = self.linepos
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        elif charpos == len(self.line):
                            s += s1
                            linepos0 = self.linepos
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        else:
                            s += s1
                            break
                    # slice rather than index: charpos may be at the very
                    # end of the data, which must give a syntax error.
                    if self.line[charpos:charpos+1] != ')':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    def convesc(m):
                        x = m.group(0)[1:]
                        if x[0] in '01234567':
                            # octal code; overflow above 8 bits is dropped.
                            return chr(int(x, 8) & 255)
                        # \n, \r, \t, \b, \f are control characters;
                        # any other escaped character stands for itself.
                        return self._STRING_ESCAPES.get(x, x)
                    s = self.STRING_NORM_SUB.sub(convesc, s)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)

                elif t == '<':
                    # hex string object
                    ms = self.STRING_HEX.match(self.line, charpos)
                    if ms:
                        hexstr = ms.group(0)
                        charpos = ms.end(0)
                    else:
                        # '<>' is an empty string.
                        hexstr = ''
                    if self.line[charpos:charpos+1] != '>':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    def convhex(m1):
                        # a missing last digit is taken as 0.
                        return chr(int(m1.group(0).ljust(2, '0'), 16))
                    # whitespace between the digits carries no meaning.
                    s = self.STRING_HEX_SUB.sub(convhex, ''.join(hexstr.split()))
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)

                elif self.NUMBER.match(t):
                    # number
                    if '.' in t:
                        n = float(t)
                    else:
                        n = int(t)
                    if 2 <= self.debug:
                        print >>stderr, 'number: %r' % n
                    yield (pos, n)

                elif t in ('true','false'):
                    # boolean
                    if 2 <= self.debug:
                        print >>stderr, 'boolean: %r' % t
                    yield (pos, (t == 'true'))

                else:
                    # other token
                    if 2 <= self.debug:
                        print >>stderr, 'keyword: %r' % t
                    yield (pos, PSKeywordTable.intern(t))

        return
|
|
|
|
|
|
|
|
|
|
|
|
## PSStackParser
|
|
|
|
##
|
|
|
|
class PSStackParser(PSBaseParser):

    '''
    PostScript parser that recognizes compound objects
    such as arrays and dictionaries.

    context -- stack of (type, partobj) pairs for the enclosing,
               still unfinished objects.
    partobj -- list of the objects collected so far at the current
               nesting level (None until parse() is called).
    '''

    def __init__(self, fp, debug=0):
        PSBaseParser.__init__(self, fp, debug=debug)
        self.context = []
        self.partobj = None
        return

    def do_token(self, pos, token):
        '''
        Handles special tokens.
        Returns true if the token denotes the end of an object.
        This default implementation handles nothing.
        '''
        return False

    def push(self, obj):
        '''
        Push an object to the stack.
        '''
        self.partobj.append(obj)
        return

    def pop(self, n):
        '''
        Pop N objects from the stack and return them as a list,
        in the order in which they were pushed.
        Raises PSSyntaxError if fewer than N objects are available.
        '''
        if len(self.partobj) < n:
            raise PSSyntaxError('stack too short < %d' % n)
        if n == 0:
            # a slice with -0 is the slice from 0: without this guard
            # pop(0) would return and discard the entire stack.
            return []
        r = self.partobj[-n:]
        self.partobj = self.partobj[:-n]
        return r

    def popall(self):
        '''
        Discards all the objects on the stack.
        '''
        self.partobj = []
        return

    def parse(self):
        '''
        Returns a list of objects: keywords, literals, strings,
        numbers, arrays and dictionaries. Arrays and dictionaries
        are represented as Python lists and dictionaries.

        Parsing ends at the end of the input or as soon as do_token()
        returns true.  Raises PSTypeError for mismatched brackets and
        for a dictionary with an odd number of elements.
        '''

        def startobj(kind):
            # set the unfinished object aside and start a new one.
            self.context.append((kind, self.partobj))
            self.partobj = []
            return

        def endobj(kind1):
            # close the current object and resume the enclosing one.
            assert self.context
            obj = self.partobj
            (kind0, self.partobj) = self.context.pop()
            if kind0 != kind1:
                raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
                                  (kind0, self.partobj, kind1, obj))
            return obj

        startobj('o')

        for (pos,t) in PSBaseParser.parse(self):
            if isinstance(t, (int, float, str, PSLiteral)):
                # plain values go on the stack as they are.
                self.push(t)
                continue
            c = keyword_name(t)
            if c == '{' or c == '}':
                self.push(t)
            elif c == '[':
                # begin array
                if 2 <= self.debug:
                    print >>stderr, 'start array'
                startobj('a')
            elif c == ']':
                # end array
                a = endobj('a')
                if 2 <= self.debug:
                    print >>stderr, 'end array: %r' % a
                self.push(a)
            elif c == '<<':
                # begin dictionary
                if 2 <= self.debug:
                    print >>stderr, 'start dict'
                startobj('d')
            elif c == '>>':
                # end dictionary
                objs = endobj('d')
                if len(objs) % 2 != 0:
                    raise PSTypeError('invalid dictionary construct: %r' % objs)
                d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
                if 2 <= self.debug:
                    print >>stderr, 'end dict: %r' % d
                self.push(d)
            elif self.do_token(pos, t):
                break

        return endobj('o')
|
|
|
|
|
|
|
|
|
|
|
|
## CMapParser
|
|
|
|
##
|
|
|
|
class CMapParser(PSStackParser):

    '''
    Parser that reads a CMap file and stores its
    mappings and attributes into the given cmap object.
    '''

    # operators that merely open a section: the operands collected
    # before them (the entry count) are thrown away.
    BEGIN_SECTIONS = (
        'begincodespacerange', 'begincidrange', 'begincidchar',
        'beginbfrange', 'beginbfchar', 'beginnotdefrange',
        )

    def __init__(self, cmap, fp, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.cmap = cmap
        self.in_cmap = False
        return

    def do_token(self, pos, token):
        '''
        Handles the CMap operators.  Everything outside of
        begincmap ... endcmap is ignored.  Always returns None,
        so this never ends the parsing.
        '''
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
            return
        elif name == 'endcmap':
            self.in_cmap = False
            return
        if not self.in_cmap: return

        if name in self.BEGIN_SECTIONS:
            self.popall()
            return

        if name == 'def':
            # /key value def
            try:
                (k,v) = self.pop(2)
                self.cmap.attrs[literal_name(k)] = v
            except PSSyntaxError:
                pass
            return

        if name == 'usecmap':
            # /name usecmap
            try:
                (cmapname,) = self.pop(1)
                self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
            return

        if name == 'endcodespacerange':
            if 1 <= self.debug:
                print >>stderr, 'codespace: %r' % self.partobj
            self.popall()
            return

        if name == 'endcidrange':
            # <start> <end> cid: the codes start..end get consecutive CIDs.
            for (s,e,cid) in choplist(3, self.partobj):
                assert isinstance(s, str)
                assert isinstance(e, str)
                assert isinstance(cid, int)
                assert len(s) == len(e)
                # only the last 4 bytes (at most) may vary.
                sprefix = s[:-4]
                eprefix = e[:-4]
                assert sprefix == eprefix
                svar = s[-4:]
                evar = e[-4:]
                s1 = nunpack(svar)
                e1 = nunpack(evar)
                vlen = len(svar)
                assert s1 <= e1
                for i in xrange(e1-s1+1):
                    x = sprefix+pack('>L',s1+i)[-vlen:]
                    self.cmap.register_code2cid(x, cid+i)
            self.popall()
            return

        if name == 'endcidchar':
            for (first,second) in choplist(2, self.partobj):
                assert isinstance(first, str)
                if isinstance(second, int):
                    # "<code> cid" with an integer CID.  This form used
                    # to be rejected by the string assertion below.
                    # NOTE(review): believed to be the form the CMap
                    # specification defines -- confirm against it.
                    self.cmap.register_code2cid(first, second)
                else:
                    # previously supported form: two strings, taken
                    # as a packed CID followed by the code.
                    assert isinstance(second, str)
                    self.cmap.register_code2cid(second, nunpack(first))
            self.popall()
            return

        if name == 'endbfrange':
            # <start> <end> code   or   <start> <end> [code ...]
            for (s,e,code) in choplist(3, self.partobj):
                assert isinstance(s, str)
                assert isinstance(e, str)
                assert len(s) == len(e)
                s1 = nunpack(s)
                e1 = nunpack(e)
                assert s1 <= e1
                if isinstance(code, list):
                    # one explicit destination per source value.
                    for i in xrange(e1-s1+1):
                        self.cmap.register_cid2code(s1+i, code[i])
                else:
                    # consecutive destinations that start at code.
                    var = code[-4:]
                    base = nunpack(var)
                    prefix = code[:-4]
                    vlen = len(var)
                    for i in xrange(e1-s1+1):
                        x = prefix+pack('>L',base+i)[-vlen:]
                        self.cmap.register_cid2code(s1+i, x)
            self.popall()
            return

        if name == 'endbfchar':
            for (cid,code) in choplist(2, self.partobj):
                assert isinstance(cid, str)
                assert isinstance(code, str)
                self.cmap.register_cid2code(nunpack(cid), code)
            self.popall()
            return

        if name == 'endnotdefrange':
            if 1 <= self.debug:
                print >>stderr, 'notdefrange: %r' % self.partobj
            self.popall()
            return

        return
|
|
|
|
|
|
|
|
|
|
|
|
## PDFStream type
|
|
|
|
##
|
|
|
|
class PDFStream:
    '''
    A stream object: a dictionary plus a chunk of data.

    doc     -- owning document; its crypt attribute is consulted.
    dic     -- the stream dictionary.
    rawdata -- the data as stored; set to None once decoded.
    data    -- the decoded data; None until decode() has run.
    '''

    def __init__(self, doc, dic, rawdata):
        self.doc = doc
        self.dic = dic
        self.rawdata = rawdata
        self.data = None
        return

    def __repr__(self):
        return '<PDFStream: %r>' % (self.dic,)

    def decode(self):
        '''
        Decodes rawdata into data according to the Filter entry.
        Only FlateDecode is supported, optionally followed by
        predictor 12 (Predictor 1 means "no prediction").

        Raises NotImplementedError for an encrypted document and
        PDFValueError for an unsupported filter or predictor.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.doc.crypt:
            # decryption is not implemented yet...
            raise NotImplementedError
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f == LITERAL_FLATE_DECODE:
                import zlib
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
                # apply predictors
                params = self.dic.get('DecodeParms', {})
                if 'Predictor' in params:
                    pred = int_value(params['Predictor'])
                    # 1 stands for "no prediction", so there is nothing
                    # to undo.  (It used to be refused as unsupported.)
                    if pred and pred != 1:
                        if pred != 12:
                            raise PDFValueError('Unsupported predictor: %r' % pred)
                        if 'Columns' not in params:
                            raise PDFValueError('Columns undefined for predictor=12')
                        columns = int_value(params['Columns'])
                        data = self._undo_row_predictor(data, columns)
            else:
                raise PDFValueError('Invalid filter spec: %r' % f)
        self.data = data
        self.rawdata = None
        return

    def _undo_row_predictor(self, data, columns):
        '''
        Strips the tag byte in front of each row of `columns` bytes.
        A row tagged '\\x02' is added bytewise (modulo 256) to the
        decoded row before it; rows with any other tag are copied.
        '''
        rows = []
        ent0 = '\x00' * columns
        for i in xrange(0, len(data), columns+1):
            tag = data[i]
            ent1 = data[i+1:i+1+columns]
            if tag == '\x02':
                ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
            rows.append(ent1)
            ent0 = ent1
        # joined once, instead of growing a string row by row.
        return ''.join(rows)

    def get_data(self):
        '''Returns the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def parse_data(self, inline=False, debug=0):
        '''Parses the decoded data with a PDFParser; returns its result.'''
        return PDFParser(self.doc, StringIO(self.get_data()),
                         inline=inline, debug=debug).parse()
|
|
|
|
|
|
|
|
|
|
|
|
## PDFObjRef
|
|
|
|
##
|
|
|
|
class PDFObjRef:
    '''
    Indirect reference to the object objid of a document.
    The generation number is accepted but not kept.
    '''

    def __init__(self, doc, objid, genno):
        if objid == 0:
            raise PDFValueError('objid cannot be 0.')
        self.doc = doc
        self.objid = objid
        #self.genno = genno # Never used.

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self):
        '''Fetches the referenced object from the document.'''
        target = self.doc.getobj(self.objid)
        return target
|
|
|
|
|
|
|
|
|
|
|
|
# resolve
|
|
|
|
def resolve1(x):
    '''
    Follows indirect references until something that is not a
    PDFObjRef is reached, and returns it.  Only the top level is
    resolved: a returned array or dictionary may still hold
    indirect references inside.
    '''
    while 1:
        if not isinstance(x, PDFObjRef):
            return x
        x = x.resolve()
|
|
|
|
|
|
|
|
def resolveall(x):
    '''
    Resolves X and, recursively, everything nested inside it, so
    that no indirect reference is left within the result.
    A list is rebuilt as a new list; a dictionary is updated in place.
    This procedure might be slow. Do not use it unless
    you really need it.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        return [ resolveall(item) for item in x ]
    if isinstance(x, dict):
        # only existing keys are reassigned, so the size never changes.
        for key in x:
            x[key] = resolveall(x[key])
    return x
|
|
|
|
|
|
|
|
# Type checking
|
|
|
|
def literal_name(x):
    '''Resolves x and returns its name; PDFTypeError unless it is a PSLiteral.'''
    obj = resolve1(x)
    if isinstance(obj, PSLiteral):
        return obj.name
    raise PDFTypeError('literal required: %r' % obj)
|
|
|
|
|
|
|
|
def keyword_name(x):
    '''Resolves x and returns its name; PDFTypeError unless it is a PSKeyword.'''
    obj = resolve1(x)
    if isinstance(obj, PSKeyword):
        return obj.name
    raise PDFTypeError('keyword required: %r' % obj)
|
|
|
|
|
|
|
|
def str_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is a str.'''
    obj = resolve1(x)
    if isinstance(obj, str):
        return obj
    raise PDFTypeError('string required: %r' % obj)
|
|
|
|
|
|
|
|
def int_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is an int.'''
    obj = resolve1(x)
    if isinstance(obj, int):
        return obj
    raise PDFTypeError('integer required: %r' % obj)
|
|
|
|
|
|
|
|
def float_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is a float.'''
    obj = resolve1(x)
    if isinstance(obj, float):
        return obj
    raise PDFTypeError('float required: %r' % obj)
|
|
|
|
|
|
|
|
def num_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is an int or a float.'''
    obj = resolve1(x)
    if isinstance(obj, (int, float)):
        return obj
    raise PDFTypeError('int or float required: %r' % obj)
|
|
|
|
|
|
|
|
def list_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is a list.'''
    obj = resolve1(x)
    if isinstance(obj, list):
        return obj
    raise PDFTypeError('list required: %r' % obj)
|
|
|
|
|
|
|
|
def dict_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is a dict.'''
    obj = resolve1(x)
    if isinstance(obj, dict):
        return obj
    raise PDFTypeError('dict required: %r' % obj)
|
|
|
|
|
|
|
|
def stream_value(x):
    '''Resolves x and returns it; PDFTypeError unless it is a PDFStream.'''
    obj = resolve1(x)
    if isinstance(obj, PDFStream):
        return obj
    raise PDFTypeError('stream required: %r' % obj)
|
|
|
|
|
|
|
|
|
|
|
|
## PDFPage
|
|
|
|
##
|
|
|
|
class PDFPage:
    '''
    A page of a document.

    doc          -- owning document.
    pageid       -- the pageidx value given to the constructor.
    attrs        -- the page dictionary.
    parent_attrs -- attributes used as a fallback by get_attr().
    resources    -- the 'Resources' attribute (see get_attr).
    mediabox     -- the 'MediaBox' attribute (see get_attr).
    contents     -- list of the content objects of the page.
    '''

    def __init__(self, doc, pageidx, attrs, parent_attrs):
        self.doc = doc
        self.pageid = pageidx
        self.attrs = dict_value(attrs)
        self.parent_attrs = parent_attrs
        self.resources = self.get_attr('Resources')
        self.mediabox = self.get_attr('MediaBox')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
            if not isinstance(contents, list):
                contents = [ contents ]
        else:
            # a page without a Contents entry has nothing to show;
            # this used to fail with a KeyError.
            contents = []
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    def get_attr(self, k):
        '''
        Returns the (resolved) attribute k of the page, or the value
        stored in parent_attrs when the page itself does not have it.
        '''
        if k in self.attrs:
            return resolve1(self.attrs[k])
        return self.parent_attrs.get(k)
|
|
|
|
|
|
|
|
|
|
|
|
## XRefs
|
|
|
|
|
|
|
|
## PDFXRef
|
|
|
|
##
|
|
|
|
class PDFXRef:

    '''A table-style cross reference section followed by its trailer.

    Attributes:
      offsets: dict mapping objid to (genno, pos, use), collected
        from every subsection of the table.
      objid0, objid1: lowest objid covered and one past the highest
        (both 0 when the table has no subsection).
      trailer: the trailer dictionary that follows the table.
    '''

    def __init__(self, parser):
        # A table may consist of several "start count" subsections;
        # all of them are merged into one mapping so that earlier
        # subsections are not lost.
        self.offsets = {}
        self.objid0 = self.objid1 = 0
        first = True
        while 1:
            line = parser.nextline()
            if not line:
                raise PDFSyntaxError('premature eof: %r' % parser)
            line = line.strip()
            f = line.split(' ')
            if len(f) != 2:
                # anything other than a subsection header must be 'trailer'.
                if line != 'trailer':
                    raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
                break
            (start, nobjs) = map(long, f)
            if first:
                (self.objid0, self.objid1) = (start, start+nobjs)
                first = False
            else:
                self.objid0 = min(self.objid0, start)
                self.objid1 = max(self.objid1, start+nobjs)
            for objid in xrange(start, start+nobjs):
                line = parser.nextline()
                f = line.strip().split(' ')
                if len(f) != 3:
                    raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                self.offsets[objid] = (int(genno), long(pos), use)
        # read trailer
        self.trailer = dict_value(parser.parse()[0])
        return

    def getpos(self, objid):
        '''Returns (None, pos) for an in-use object.

        Raises IndexError when objid is not listed in this table and
        PDFValueError when its entry is not marked 'n' (in use).
        '''
        try:
            (genno, pos, use) = self.offsets[objid]
        except KeyError:
            raise IndexError(objid)
        if use != 'n':
            raise PDFValueError('unused objid=%r' % objid)
        return (None, pos)
|
|
|
|
|
|
|
|
|
|
|
|
## PDFXRefStream
|
|
|
|
##
|
|
|
|
class PDFXRefStream:

    '''Cross reference information stored as a stream object.

    Attributes:
      ranges: list of (start, nobjs) subsections taken from 'Index'
        (one (0, Size) range when 'Index' is absent).
      objid0, objid1: lowest objid covered and one past the highest.
      fl1, fl2, fl3: byte widths of the three entry fields ('W').
      entlen: number of bytes per entry.
      data: the stream data holding the entries.
      trailer: the stream dictionary, used as the trailer.
    '''

    def __init__(self, parser):
        (objid, genno, _, stream) = list_value(parser.parse())
        assert stream.dic['Type'] == LITERAL_XREF
        size = stream.dic['Size']
        # 'Index' holds one (start, count) pair per subsection.
        self.ranges = list(choplist(2, stream.dic.get('Index', (0,size))))
        if self.ranges:
            self.objid0 = min( start for (start,nobjs) in self.ranges )
            self.objid1 = max( start+nobjs for (start,nobjs) in self.ranges )
        else:
            self.objid0 = self.objid1 = 0
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.dic
        return

    def getpos(self, objid):
        '''Locates objid.

        Returns (None, pos) for an entry of type 1 and
        (stream objid, index) for an entry of type 2.
        Raises IndexError when objid is not covered by this stream and
        PDFValueError for any other entry type (e.g. a free entry).
        '''
        # entries of all subsections are stored back to back.
        skipped = 0
        for (start, nobjs) in self.ranges:
            if start <= objid and objid < start+nobjs:
                i = self.entlen * (skipped + objid-start)
                break
            skipped += nobjs
        else:
            raise IndexError(objid)
        ent = self.data[i:i+self.entlen]
        # a zero-width type field defaults to type 1.
        f1 = nunpack(ent[:self.fl1], 1)
        f2 = nunpack(ent[self.fl1:self.fl1+self.fl2])
        f3 = nunpack(ent[self.fl1+self.fl2:])
        if f1 == 1:
            # f2 is the position; f3 (generation number) is not needed.
            return (None, f2)
        elif f1 == 2:
            # f2 is the objid of the containing stream, f3 the index in it.
            return (f2, f3)
        # never fall off the end: callers unpack the result as a pair.
        raise PDFValueError('unused objid=%r' % objid)
|
|
|
|
|
|
|
|
|
|
|
|
## PDFDocument
|
|
|
|
##
|
|
|
|
class PDFDocument:

    '''Holds the objects of one document and loads them on demand.

    Attributes:
      xrefs: list of cross reference objects supplied by the parser.
      objs: cache mapping objid to the loaded object.
      parsed_objs: cache mapping the objid of an object stream to
        the list returned by its parse_data().
      crypt: the 'Encrypt' dictionary of the trailer, or None.
      root, catalog: the 'Root' dictionary of the trailer.
      outline: the 'Outlines' entry of the catalog, or None.
      parser: the parser registered with set_parser().
    '''

    def __init__(self, debug=0):
        self.debug = debug
        self.xrefs = []
        self.objs = {}
        self.parsed_objs = {}
        self.crypt = None
        self.root = None
        self.catalog = None
        self.outline = None
        self.parser = None
        return

    def set_parser(self, parser):
        '''Registers the parser and reads the xrefs and the root object.

        Does nothing when a parser is already registered.
        Raises PDFValueError when no trailer has a 'Root' entry.
        '''
        if self.parser: return
        self.parser = parser
        self.xrefs = list(parser.read_xref())
        for xref in self.xrefs:
            trailer = xref.trailer
            if 'Encrypt' in trailer:
                self.crypt = dict_value(trailer['Encrypt'])
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
        else:
            raise PDFValueError('no /Root object!')
        return

    def getobj(self, objid):
        '''Returns the object with the given objid, loading it if needed.

        Raises PDFValueError when no xref knows the objid and
        PDFSyntaxError when the stored data is malformed.
        '''
        assert self.xrefs
        if objid in self.objs:
            obj = self.objs[objid]
        else:
            # the first xref that knows the objid wins.
            for xref in self.xrefs:
                try:
                    (strmid, index) = xref.getpos(objid)
                    break
                except IndexError:
                    pass
            else:
                raise PDFValueError('Cannot locate objid=%r' % objid)
            if strmid:
                # the object lives inside an object stream.
                stream = stream_value(self.getobj(strmid))
                if stream.dic.get('Type') != LITERAL_OBJSTM:
                    raise PDFSyntaxError('Not a stream object: %r' % stream)
                if 'N' not in stream.dic:
                    raise PDFSyntaxError('N is not defined: %r' % stream)
                # the cache is keyed by strmid both for lookup and for
                # storage, so that a stream is parsed only once.
                if strmid in self.parsed_objs:
                    objs = self.parsed_objs[strmid]
                else:
                    objs = stream.parse_data(self.debug)
                    self.parsed_objs[strmid] = objs
                # skip the N pairs of numbers that precede the objects.
                obj = objs[stream.dic['N']*2+index]
            else:
                pos0 = self.parser.linepos
                try:
                    self.parser.seek(index)
                    seq = list_value(self.parser.parse())
                    if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
                        raise PDFSyntaxError('invalid stream spec: %r' % seq)
                    obj = seq[3]
                finally:
                    # always put the parser back where it was.
                    self.parser.seek(pos0)
        if 2 <= self.debug:
            print >>stderr, 'register: objid=%r: %r' % (objid, obj)
        self.objs[objid] = obj
        return obj

    def get_pages(self, debug=0):
        '''Generates a PDFPage for every page found under the catalog.

        Pages are numbered from 0 in the order they are visited.
        '''
        assert self.xrefs
        def search(obj, parent):
            # walks the page tree depth-first, yielding (page, parent).
            tree = dict_value(obj)
            if tree['Type'] == LITERAL_PAGES:
                if 1 <= debug:
                    print >>stderr, 'Pages: Kids=%r' % tree['Kids']
                # 'Kids' itself may be an indirect reference.
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x
            elif tree['Type'] == LITERAL_PAGE:
                if 1 <= debug:
                    print >>stderr, 'Page: %r' % tree
                yield (tree, parent)
        for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
            yield PDFPage(self, i, tree, parent)
        return

    def set_root(self, root):
        '''Sets the root (catalog) dictionary.

        Raises PDFValueError when its 'Type' is not the catalog literal.
        '''
        self.root = root
        self.catalog = dict_value(self.root)
        if self.catalog['Type'] != LITERAL_CATALOG:
            raise PDFValueError('Catalog not found!')
        # the catalog key for the outline hierarchy is 'Outlines'.
        self.outline = self.catalog.get('Outlines')
        return
|
|
|
|
|
|
|
|
|
|
|
|
## PDFParser
|
|
|
|
##
|
|
|
|
class PDFParser(PSStackParser):

    '''Stack parser extended with the PDF-specific keywords.

    Handles indirect references ('R'), stream objects and, when
    inline is true, inline images ('BI' ... 'ID' ... 'EI').
    Creating a parser registers it with doc via doc.set_parser().
    '''

    def __init__(self, doc, fp, inline=False, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.inline = inline
        self.doc = doc
        self.doc.set_parser(self)
        return

    def __repr__(self):
        return '<PDFParser: linepos=%d>' % self.linepos

    # marks the end of inline image data.
    EOIPAT = re.compile(r'\nEI\W')

    def do_token(self, pos, token):
        '''Processes one keyword token found at pos.

        Returns True for the keywords that end the current parse
        ('xref', 'trailer', 'startxref', 'endobj'), False otherwise.
        '''
        name = keyword_name(token)
        if name in ('xref', 'trailer', 'startxref', 'endobj'):
            return True

        if name == 'R':
            # reference to indirect object
            try:
                (objid, genno) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push(obj)
                if 2 <= self.debug:
                    print >>stderr, 'refer obj: %r' % obj
            except PSSyntaxError:
                # deliberate best effort: the reference is dropped.
                # NOTE(review): presumably raised by pop() - confirm.
                pass

        elif name == 'stream':
            # stream object
            (dic,) = self.pop(1)
            dic = dict_value(dic)
            if 'Length' not in dic:
                raise PDFValueError('/Length is undefined: %r' % dic)
            objlen = int_value(dic['Length'])
            self.seek(pos)
            line = self.nextline()  # 'stream'
            self.fp.seek(pos+len(line))
            data = self.fp.read(objlen)
            self.seek(pos+len(line)+objlen)
            # skip blank lines; the next non-blank one must be 'endstream'.
            while 1:
                line = self.nextline()
                if not line:
                    raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
                                         (self.linepos, line))
                if line.strip():
                    if not line.startswith('endstream'):
                        raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
                                             (self.linepos, line))
                    break
            if 1 <= self.debug:
                print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(self.doc, dic, data)
            self.push(obj)

        elif self.inline and name == 'BI':
            # inline image within a content stream
            self.context.append(('BI', self.partobj))
            self.partobj = []

        elif self.inline and name == 'ID':
            objs = self.partobj
            (type0, self.partobj) = self.context.pop()
            if len(objs) % 2 != 0:
                raise PSTypeError('invalid dictionary construct: %r' % objs)
            dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
            pos += len('ID ')
            self.fp.seek(pos)
            # XXX how do we know the real length other than scanning?
            # Keep reading until the end marker shows up, so that
            # images larger than one chunk are handled as well.
            data = ''
            m = None
            while 1:
                chunk = self.fp.read(8192)
                if not chunk: break
                # rescan the last few bytes in case the marker was
                # split across two chunks.
                start = max(0, len(data)-3)
                data += chunk
                m = self.EOIPAT.search(data, start)
                if m: break
            if not m:
                raise PDFSyntaxError('premature eof, need EI: pos=%d' % pos)
            objlen = m.start(0)
            obj = PDFStream(self.doc, dic, data[:objlen])
            self.push(obj)
            self.seek(pos+objlen+len('\nEI'))
            self.push(KEYWORD_EI)

        else:
            self.push(token)

        return False

    def find_xref(self):
        '''Seeks to the position that follows the last 'startxref'.

        Raises PDFSyntaxError when 'startxref' or a valid position
        cannot be found.
        '''
        # find the first xref table
        # lines are read backwards, so prev is the non-blank line
        # that follows 'startxref' in the file.
        prev = None
        for line in self.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if line == 'startxref': break
            if line:
                prev = line
        else:
            raise PDFSyntaxError('startxref not found!')
        if 1 <= self.debug:
            print >>stderr, 'xref found: pos=%r' % prev
        try:
            pos = long(prev)
        except (TypeError, ValueError):
            # nothing (or garbage) follows 'startxref'.
            raise PDFSyntaxError('invalid startxref position: %r' % prev)
        self.seek(pos)
        return

    # read xref tables and trailers
    def read_xref(self):
        '''Generates the xref objects, following the 'Prev' chain.

        Raises PDFSyntaxError when an xref section cannot be found.
        '''
        self.find_xref()
        # positions already visited, to survive a cyclic 'Prev' chain.
        seen = set()
        while 1:
            # read xref table
            pos0 = self.linepos
            line = self.nextline()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if not line:
                raise PDFSyntaxError('premature eof, need xref: linepos=%d' % self.linepos)
            if line[0].isdigit():
                # XRefStream: PDF-1.5
                self.seek(pos0)
                xref = PDFXRefStream(self)
            elif line.strip() != 'xref':
                raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
                                     (self.linepos, line))
            else:
                xref = PDFXRef(self)
            yield xref
            trailer = xref.trailer
            if 1 <= self.debug:
                print >>stderr, 'trailer: %r' % trailer
            if 'XRefStm' in trailer:
                # NOTE(review): this position is only used when there
                # is no seek to 'Prev' below, and then the loop ends,
                # so the stream at XRefStm is never read - confirm.
                self.seek(int_value(trailer['XRefStm']))
            if 'Prev' in trailer:
                # find previous xref
                pos0 = int_value(trailer['Prev'])
                if pos0 in seen:
                    break
                seen.add(pos0)
                self.seek(pos0)
                if 1 <= self.debug:
                    print >>stderr, 'prev trailer: pos=%d' % pos0
            else:
                break
        return
|
|
|
|
|
|
|
|
|
|
|
|
## Fonts
|
|
|
|
##
|
|
|
|
|
|
|
|
# PDFFont
|
|
|
|
class PDFFont:

    '''Base class keeping the metrics taken from a font descriptor.'''

    def __init__(self, fontid, descriptor, widths, default_width=None):
        self.fontid = fontid
        self.descriptor = descriptor
        self.widths = widths
        # the font name is stored as a plain name when it is a literal.
        fontname = descriptor['FontName']
        self.fontname = fontname
        if isinstance(fontname, PSLiteral):
            self.fontname = literal_name(fontname)
        self.ascent = descriptor['Ascent']
        self.descent = descriptor['Descent']
        # a false default_width (None or 0) falls back to MissingWidth.
        if default_width:
            self.default_width = default_width
        else:
            self.default_width = descriptor.get('MissingWidth', 0)
        self.leading = descriptor.get('Leading', 0)
        self.bbox = descriptor['FontBBox']
        return

    def __repr__(self):
        return '<PDFFont: fontid=%r>' % (self.fontid,)

    def is_vertical(self):
        '''Always False for the base class.'''
        return False

    def decode(self, bytes):
        '''Returns the list of character codes, one per byte.'''
        return [ ord(c) for c in bytes ]

    def char_width(self, cid):
        '''Returns the width of cid, or default_width when unlisted.'''
        return self.widths.get(cid, self.default_width)

    def char_disp(self, cid):
        '''Always 0 for the base class.'''
        return 0

    def string_width(self, s):
        '''Returns the sum of the widths of the decoded characters.'''
        total = 0
        for cid in self.decode(s):
            total = total + self.char_width(cid)
        return total
|
|
|
|
|
|
|
|
|
|
|
|
# PDFSimpleFont
|
|
|
|
class PDFSimpleFont(PDFFont):

    '''Font whose character codes are mapped through an encoding
    and, when available, a ToUnicode CMap.'''

    def __init__(self, fontid, descriptor, widths, spec):
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        if not isinstance(encoding, dict):
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
        else:
            basename = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            differences = encoding.get('Differences', None)
            self.encoding = EncodingDB.get_encoding(basename, differences)
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            tounicode = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(tounicode.get_data())).parse()
        PDFFont.__init__(self, fontid, descriptor, widths)
        return

    def to_unicode(self, cid):
        '''Returns the unicode string for cid.

        Uses the ToUnicode CMap when there is one, the encoding
        otherwise. Raises PDFUnicodeNotDefined when cid is unmapped.
        '''
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
            if not code:
                raise PDFUnicodeNotDefined(None, cid)
            # code is a sequence of big-endian 16-bit values.
            nchars = len(code)//2
            return ''.join( unichr(c) for c in unpack('>%dH' % nchars, code) )
        try:
            return self.encoding[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
|
|
|
|
|
|
|
|
|
|
|
|
# PDFType1Font
|
|
|
|
class PDFType1Font(PDFSimpleFont):
|
|
|
|
|
|
|
|
def __init__(self, fontid, spec):
|
|
|
|
if 'BaseFont' not in spec:
|
|
|
|
raise PDFFontError('BaseFont is missing')
|
|
|
|
self.basefont = literal_name(spec['BaseFont'])
|
|
|
|
try:
|
|
|
|
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
|
|
|
except KeyError:
|
|
|
|
try:
|
|
|
|
descriptor = dict_value(spec['FontDescriptor'])
|
|
|
|
firstchar = int_value(spec['FirstChar'])
|
|
|
|
lastchar = int_value(spec['LastChar'])
|
|
|
|
widths = dict( (i+firstchar,w) for (i,w)
|
|
|
|
in enumerate(list_value(spec['Widths'])) )
|
|
|
|
except KeyError, k:
|
|
|
|
raise PDFFontError('%s is missing' % k)
|
|
|
|
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
|
|
|
|
return
|
|
|
|
|
|
|
|
# PDFTrueTypeFont
|
|
|
|
class PDFTrueTypeFont(PDFType1Font):
    '''Handled exactly like PDFType1Font; adds nothing of its own.'''
|
|
|
|
|
|
|
|
# PDFType3Font
|
|
|
|
class PDFType3Font(PDFSimpleFont):
|
|
|
|
def __init__(self, fontid, spec):
|
|
|
|
try:
|
|
|
|
firstchar = int_value(spec['FirstChar'])
|
|
|
|
lastchar = int_value(spec['LastChar'])
|
|
|
|
widths = dict( (i+firstchar,w) for (i,w)
|
|
|
|
in enumerate(list_value(spec['Widths'])) )
|
|
|
|
except KeyError, k:
|
|
|
|
raise PDFFontError('%s is missing' % k)
|
|
|
|
if 'FontDescriptor' in spec:
|
|
|
|
descriptor = dict_value(spec['FontDescriptor'])
|
|
|
|
else:
|
|
|
|
descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
|
|
|
|
'FontBBox':spec['FontBBox']}
|
|
|
|
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
|
|
|
|
return
|
|
|
|
|
|
|
|
# PDFCIDFont
|
|
|
|
|
|
|
|
## TrueTypeFont
|
|
|
|
##
|
|
|
|
class TrueTypeFont:

    '''Minimal TrueType reader that builds a CMap from the 'cmap' table.

    Attributes:
      name: the name given to the constructor.
      fp: the file object the font is read from.
      tables: dict mapping a 4-byte table tag to (offset, length).
    '''

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        self.name = name
        self.fp = fp
        self.tables = {}
        # the first four bytes are read but not interpreted.
        fonttype = fp.read(4)
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for i in xrange(ntables):
            (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[name] = (offset, length)
        return

    def create_cmap(self):
        '''Builds a CMap from the subtables of the 'cmap' table.

        Returns whatever CMap(...).update(char2gid, gid2char) returns.
        Raises TrueTypeFont.CMapNotFound when there is no 'cmap' table.
        '''
        if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for i in xrange(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                # firstbytes[n] is the (last) first byte that selects
                # subheader n; each key is the subheader index times 8.
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in xrange(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # offset counts from the position of the offset field itself.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in xrange(entcount):
                        # unpack() returns a tuple; take its only item.
                        gid = unpack('>H', fp.read(2))[0]
                        if gid:
                            # the delta is applied modulo 65536, as in
                            # the type 4 branch below.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)  # two bytes skipped between the two arrays.
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
                    if idr:
                        fp.seek(pos+idr)
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.iteritems() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        return CMap(cmapname).update(char2gid, gid2char)
|
|
|
|
|
|
|
|
class PDFCIDFont(PDFFont):

    '''Font addressed by CIDs obtained through a CMap.

    Attributes set here (besides those of PDFFont):
      cidsysteminfo, cidcoding: the 'CIDSystemInfo' dictionary and
        the 'Registry-Ordering' string made from it.
      basefont: name taken from 'BaseFont'.
      cmap: CMap named by 'Encoding'.
      fontfile: the 'FontFile2' stream (only set when present).
      ucs2_cmap: CMap used by to_unicode, or None.
      vertical: result of cmap.is_vertical().
      disps, default_disp: per-CID and default values for char_disp.
    '''

    def __init__(self, fontid, spec):
        if 'BaseFont' not in spec:
            raise PDFFontError('BaseFont is missing')
        try:
            self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
            registry = self.cidsysteminfo['Registry']
            ordering = self.cidsysteminfo['Ordering']
            self.cidcoding = '%s-%s' % (registry, ordering)
        except KeyError:
            raise PDFFontError('CIDSystemInfo not properly defined.')
        self.basefont = literal_name(spec['BaseFont'])
        self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
        descriptor = dict_value(spec['FontDescriptor'])
        # an embedded TrueType file may supply a unicode mapping later.
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            fontdata = StringIO(self.fontfile.get_data())
            ttf = TrueTypeFont(self.basefont, fontdata)
        # choose the source of the unicode mapping.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            tounicode = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(tounicode.get_data())).parse()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)

        def get_width(seq):
            # seq mixes two forms: "c [w1 w2 ...]" gives consecutive
            # values starting at c, and "c1 c2 w" gives the value w
            # to every cid from c1 to c2.
            dic = {}
            char1 = char2 = None
            for v in seq:
                if char1 is None:
                    char1 = v
                    continue
                if char2 is None:
                    if isinstance(v, int):
                        char2 = v
                        continue
                    for (i,w) in enumerate(v):
                        dic[char1+i] = w
                else:
                    for cid in xrange(char1, char2+1):
                        dic[cid] = v
                char1 = char2 = None
            return dic

        self.vertical = self.cmap.is_vertical()
        if not self.vertical:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        else:
            # writing mode: vertical
            # every value is a (displacement, width) pair here.
            dic = get_width(list_value(spec.get('W2', [])))
            widths = {}
            for (cid,(d,w)) in dic.iteritems():
                widths[cid] = w
            self.disps = {}
            for (cid,(d,w)) in dic.iteritems():
                self.disps[cid] = d
            (d,w) = spec.get('DW2', [880, -1000])
            default_width = w
            self.default_disp = d
        PDFFont.__init__(self, fontid, descriptor, widths, default_width)
        return

    def is_vertical(self):
        '''Returns the writing mode reported by the CMap.'''
        return self.vertical

    def decode(self, bytes):
        '''Decodes the bytes into CIDs using the CMap.'''
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        '''Returns the displacement of cid, or default_disp when unlisted.'''
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        '''Returns the unicode string for cid.

        Raises PDFUnicodeNotDefined when there is no unicode CMap
        or when it does not map cid.
        '''
        if not self.ucs2_cmap:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        # code is a sequence of big-endian 16-bit values.
        nchars = len(code)//2
        return ''.join( unichr(c) for c in unpack('>%dH' % nchars, code) )
|
|
|
|
|
|
|
|
|
|
|
|
## Resource Manager
|
|
|
|
##
|
|
|
|
class PDFResourceManager:

    '''
    ResourceManager facilitates reuse of shared resources
    such as fonts, images and cmaps so that large objects are not
    allocated multiple times.
    '''

    def __init__(self, debug=0):
        self.debug = debug
        # font objects already created, keyed by fontid.
        self.fonts = {}
        return

    def get_procset(self, procs):
        '''Accepts a list of ProcSet names; nothing is done with them.'''
        for proc in procs:
            if proc == LITERAL_PDF or proc == LITERAL_TEXT:
                continue
            #raise PDFResourceError('ProcSet %r is not supported.' % proc)
        return

    def get_cmap(self, name):
        '''Returns the CMap with the given name from CMapDB.'''
        return CMapDB.get_cmap(name)

    def get_font(self, fontid, spec):
        '''Returns the font object for fontid, creating it from spec
        when it has not been created yet.

        Raises PDFFontError when the subtype is missing or unknown.
        '''
        if fontid in self.fonts:
            return self.fonts[fontid]
        spec = dict_value(spec)
        assert spec['Type'] == LITERAL_FONT
        # Create a Font object.
        if 'Subtype' not in spec:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = literal_name(spec['Subtype'])
        if subtype == 'Type0':
            # Type0 Font: delegate to its only descendant font, which
            # takes over the Encoding and ToUnicode entries.
            dfonts = list_value(spec['DescendantFonts'])
            assert len(dfonts) == 1
            subspec = dict_value(dfonts[0]).copy()
            for k in ('Encoding', 'ToUnicode'):
                if k in spec:
                    subspec[k] = resolve1(spec[k])
            font = self.get_font(fontid, subspec)
        elif subtype in ('Type1', 'MMType1'):
            # Type1 Font
            font = PDFType1Font(fontid, spec)
        elif subtype == 'TrueType':
            # TrueType Font
            font = PDFTrueTypeFont(fontid, spec)
        elif subtype == 'Type3':
            # Type3 Font
            font = PDFType3Font(fontid, spec)
        elif subtype in ('CIDFontType0', 'CIDFontType2'):
            # CID Font
            font = PDFCIDFont(fontid, spec)
        else:
            raise PDFFontError('Invalid Font: %r' % spec)
        self.fonts[fontid] = font
        return font
|
|
|
|
|
|
|
|
|
|
|
|
## Interpreter
|
|
|
|
##
|
|
|
|
class PDFPageInterpreter:
|
|
|
|
|
|
|
|
class TextState:
    '''Mutable record of the text parameters of the interpreter.'''
    def __init__(self):
        (self.font, self.fontsize) = (None, 0)
        (self.charspace, self.wordspace) = (0, 0)
        self.scaling = 100
        (self.leading, self.render, self.rise) = (0, 0, 0)
        # matrix and linematrix are set up by reset().
        self.reset()
        return
    def __repr__(self):
        values = (self.font, self.fontsize, self.matrix,
                  self.charspace, self.wordspace, self.scaling, self.leading,
                  self.render, self.rise)
        return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
                ' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
                ' render=%r, rise=%r>' % values)
    def reset(self):
        '''Puts matrix and linematrix back to their initial values.'''
        self.linematrix = (0, 0)
        self.matrix = (1, 0, 0, 1, 0, 0)
        return
|
|
|
|
|
|
|
|
def __init__(self, rsrc, device, debug=0):
|
|
|
|
self.rsrc = rsrc
|
|
|
|
self.device = device
|
|
|
|
self.debug = debug
|
|
|
|
return
|
|
|
|
|
|
|
|
def initpage(self, ctm):
    '''Resets all per-page state and installs ctm as the current
    matrix, reporting it to the device.'''
    # resource maps filled in by render_contents().
    (self.fontmap, self.xobjmap, self.csmap) = ({}, {}, {})
    # gstack: stack for graphical states.
    self.gstack = []
    self.ctm = ctm
    self.device.set_ctm(self.ctm)
    self.textstate = PDFPageInterpreter.TextState()
    # argstack: stack for command arguments.
    self.argstack = []
    # set some global states.
    self.scs = self.ncs = None
    return
|
|
|
|
|
|
|
|
def push(self, obj):
    '''Puts obj on top of the argument stack.'''
    self.argstack += [obj]
    return
|
|
|
|
|
|
|
|
def pop(self, n):
    '''Removes the top n objects from the argument stack and returns
    them as a list, oldest first.

    When fewer than n objects are available, all of them are
    returned. n == 0 returns an empty list and leaves the stack
    alone.
    '''
    if n <= 0:
        # [-0:] and [:-0] would return and discard the WHOLE stack.
        return []
    x = self.argstack[-n:]
    self.argstack = self.argstack[:-n]
    return x
|
|
|
|
|
|
|
|
def get_current_state(self):
    '''Returns a snapshot (ctm, textstate) of the current state.

    The text state is copied, so that later changes to
    self.textstate do not alter a snapshot that was saved earlier.
    '''
    import copy
    return (self.ctm, copy.copy(self.textstate))
|
|
|
|
|
|
|
|
def set_current_state(self, state):
    '''Installs a (ctm, textstate) pair as the current state and
    reports the matrix to the device.'''
    self.ctm = state[0]
    self.textstate = state[1]
    self.device.set_ctm(self.ctm)
    return
|
|
|
|
|
|
|
|
# gsave
|
|
|
|
def do_q(self):
    '''Saves the current state on the graphics state stack.'''
    state = self.get_current_state()
    self.gstack.append(state)
    return
|
|
|
|
# grestore
|
|
|
|
def do_Q(self):
    '''Restores the most recently saved state, if there is one.'''
    if not self.gstack:
        # nothing was saved: silently ignored.
        return
    self.set_current_state(self.gstack.pop())
    return
|
|
|
|
|
|
|
|
# concat-matrix
|
|
|
|
def do_cm(self, a1, b1, c1, d1, e1, f1):
    '''Combines the given matrix with the current one using
    mult_matrix and reports the result to the device.'''
    matrix = (a1,b1,c1,d1,e1,f1)
    self.ctm = mult_matrix(matrix, self.ctm)
    self.device.set_ctm(self.ctm)
    return
|
|
|
|
|
|
|
|
# The graphics state operators below are accepted and ignored.

# setlinewidth
def do_w(self, width):
    return
# setlinecap
def do_J(self, cap):
    return
# setlinejoin
def do_j(self, join):
    return
# setmiterlimit
def do_M(self, limit):
    return
# setdash
def do_d(self, dash, phase):
    return
# setintent
def do_ri(self, intent):
    return
# setflatness
def do_i(self, flatness):
    return
# savedict
def do_gs(self, name):
    return
|
|
|
|
|
|
|
|
# The path construction operators below are accepted and ignored.

# moveto
def do_m(self, x, y):
    return
# lineto
def do_l(self, x, y):
    return
# curveto
def do_c(self, x1, y1, x2, y2, x3, y3):
    return
# curveto (variant taking the control point 2 and the end point)
def do_v(self, x2, y2, x3, y3):
    return
# curveto (variant taking the control point 1 and the end point)
def do_y(self, x1, y1, x3, y3):
    return
# closepath
def do_h(self):
    return
# rectangle
def do_re(self, x, y, w, h):
    return
|
|
|
|
|
|
|
|
# The path painting and clipping operators below are accepted and
# ignored.

# stroke
def do_S(self):
    return
# close-and-stroke
def do_s(self):
    return
# fill
def do_f(self):
    return
# fill (obsolete)
do_F = do_f
# fill-even-odd
def do_f_a(self):
    return
# fill-and-stroke
def do_B(self):
    return
# fill-and-stroke-even-odd
def do_B_a(self):
    return
# close-fill-and-stroke
def do_b(self):
    return
# close-fill-and-stroke-even-odd
def do_b_a(self):
    return
# close-only
def do_n(self):
    return
# clip
def do_W(self):
    return
# clip-even-odd
def do_W_a(self):
    return
|
|
|
|
|
|
|
|
# setcolorspace-stroking
def do_CS(self, name):
    '''Selects the stroking color space by name; it becomes None
    when the name is not in csmap.'''
    key = literal_name(name)
    self.scs = self.csmap.get(key, None)
    return
# setcolorspace-non-stroking
def do_cs(self, name):
    '''Selects the non-stroking color space by name; it becomes
    None when the name is not in csmap.'''
    key = literal_name(name)
    self.ncs = self.csmap.get(key, None)
    return
|
|
|
|
# The operators below only switch the color space; the color
# components themselves are not kept.

# setgray-stroking
def do_G(self, gray):
    self.do_CS(LITERAL_DEVICE_GRAY)
# setgray-non-stroking
def do_g(self, gray):
    self.do_cs(LITERAL_DEVICE_GRAY)
# setrgb-stroking
def do_RG(self, r, g, b):
    self.do_CS(LITERAL_DEVICE_RGB)
# setrgb-non-stroking
def do_rg(self, r, g, b):
    self.do_cs(LITERAL_DEVICE_RGB)
# setcmyk-stroking
def do_K(self, c, m, y, k):
    self.do_CS(LITERAL_DEVICE_CMYK)
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
    self.do_cs(LITERAL_DEVICE_CMYK)
|
|
|
|
|
|
|
|
# setcolor
def do_SCN(self):
    '''Discards the arguments of the operator; their number is
    given by cs_params() for the stroking color space.'''
    self.pop(cs_params(self.scs))
    return
def do_scn(self):
    '''Discards the arguments of the operator; their number is
    given by cs_params() for the non-stroking color space.'''
    self.pop(cs_params(self.ncs))
    return
def do_SC(self):
    '''Same as do_SCN.'''
    self.do_SCN()
    return
def do_sc(self):
    '''Same as do_scn.'''
    self.do_scn()
    return
|
|
|
|
|
|
|
|
# sharing-name (accepted and ignored)
def do_sh(self, name):
    return

# begin-text
def do_BT(self):
    '''Resets the text matrices at the start of a text object.'''
    textstate = self.textstate
    textstate.reset()
    return
# end-text
def do_ET(self):
    '''Nothing to do at the end of a text object.'''
    return
|
|
|
|
|
|
|
|
# The compatibility and marked content operators below are
# accepted and ignored.

# begin-compat
def do_BX(self):
    return
# end-compat
def do_EX(self):
    return

# marked content operators
def do_MP(self, tag):
    return
def do_DP(self, tag, props):
    return
def do_BMC(self, tag):
    return
def do_BDC(self, tag, props):
    return
def do_EMC(self):
    return
|
|
|
|
|
|
|
|
# Each operator below stores its argument, unchanged, in the
# text state.

# setcharspace
def do_Tc(self, space):
    setattr(self.textstate, 'charspace', space)
# setwordspace
def do_Tw(self, space):
    setattr(self.textstate, 'wordspace', space)
# textscale
def do_Tz(self, scale):
    setattr(self.textstate, 'scaling', scale)
# setleading
def do_TL(self, leading):
    setattr(self.textstate, 'leading', leading)
|
|
|
|
# selectfont
|
|
|
|
def do_Tf(self, fontid, fontsize):
    '''Selects the font registered under fontid and sets its size.

    Raises PDFInterpreterError when fontid is not in fontmap; the
    font size is left untouched in that case.
    '''
    textstate = self.textstate
    try:
        textstate.font = self.fontmap[literal_name(fontid)]
    except KeyError:
        raise PDFInterpreterError('Undefined font id: %r' % fontid)
    textstate.fontsize = fontsize
    return
|
|
|
|
# setrendering
def do_Tr(self, render):
    '''Stores the rendering mode in the text state.'''
    setattr(self.textstate, 'render', render)
# settextrise
def do_Ts(self, rise):
    '''Stores the text rise in the text state.'''
    setattr(self.textstate, 'rise', rise)
|
|
|
|
|
|
|
|
# text-move
|
|
|
|
def do_Td(self, tx, ty):
    '''Moves to the start of the next line, offset by (tx, ty).

    The offset is given in text space units, so it is transformed
    by the text matrix (scale and rotation) before being added to
    the translation part. The position within the line is reset.
    '''
    (a,b,c,d,e,f) = self.textstate.matrix
    self.textstate.matrix = (a,b,c,d, tx*a+ty*c+e, tx*b+ty*d+f)
    self.textstate.linematrix = (0, 0)
    return
|
|
|
|
# text-move
|
|
|
|
def do_TD(self, tx, ty):
    '''Moves to the start of the next line, offset by (tx, ty),
    and sets the leading to -ty.

    The offset is given in text space units, so it is transformed
    by the text matrix (scale and rotation) before being added to
    the translation part. The position within the line is reset.
    '''
    (a,b,c,d,e,f) = self.textstate.matrix
    self.textstate.matrix = (a,b,c,d, tx*a+ty*c+e, tx*b+ty*d+f)
    self.textstate.leading = -ty
    self.textstate.linematrix = (0, 0)
    return
|
|
|
|
# textmatrix
|
|
|
|
def do_Tm(self, a,b,c,d,e,f):
    '''Replaces the text matrix and resets the position within
    the line.'''
    textstate = self.textstate
    textstate.linematrix = (0, 0)
    textstate.matrix = (a,b,c,d,e,f)
    return
|
|
|
|
# nextline
|
|
|
|
def do_T_a(self):
    '''Moves to the start of the next line.

    This is a move by (0, -leading) in text space units: the
    leading is SUBTRACTED, after being transformed by the text
    matrix. The position within the line is reset.
    '''
    (a,b,c,d,e,f) = self.textstate.matrix
    leading = self.textstate.leading
    self.textstate.matrix = (a,b,c,d, e-leading*c, f-leading*d)
    self.textstate.linematrix = (0, 0)
    return
|
|
|
|
|
|
|
|
# show-pos
|
|
|
|
def do_TJ(self, seq):
    '''Shows the strings in seq and advances the line position.

    seq mixes strings with numbers: the strings are joined, the
    numbers are summed up and subtracted from the string width.
    The device gets the text state, the matrix shifted by the
    current line position, the computed width and seq itself.
    '''
    ts = self.textstate
    font = ts.font
    (a,b,c,d,e,f) = ts.matrix
    (lx,ly) = ts.linematrix
    # split seq into its text and its numeric adjustments.
    chunks = []
    n = 0
    for x in seq:
        if isinstance(x, str):
            chunks.append(x)
        else:
            n = n + x
    s = ''.join(chunks)
    glyphs = (font.string_width(s)-n)/1000.0 * ts.fontsize
    spacing = len(s) * ts.charspace
    words = s.count(' ')*ts.wordspace
    w = (glyphs + spacing + words) * ts.scaling / 100.0
    self.device.render_string(ts, (a,b,c,d,e+lx,f+ly), w, seq)
    # a vertical font advances along y, any other along x.
    if font.is_vertical():
        ts.linematrix = (lx, ly+w)
    else:
        ts.linematrix = (lx+w, ly)
    return
|
|
|
|
# show
|
|
|
|
def do_Tj(self, s):
    '''Shows a single string; same as do_TJ with a one-item list.'''
    seq = [s]
    self.do_TJ(seq)
    return
|
|
|
|
# quote
|
|
|
|
def do__q(self, s):
    '''Moves to the next line (do_T_a), then shows the string.'''
    self.do_T_a()
    seq = [s]
    self.do_TJ(seq)
    return
|
|
|
|
# doublequote
|
|
|
|
def do__w(self, aw, ac, s):
    '''Sets the word spacing to aw and the character spacing to ac,
    moves to the next line, then shows the string.

    The move to the next line is part of the operator: it is
    defined as setting both spacings and then doing what do__q
    does.
    '''
    self.do_Tw(aw)
    self.do_Tc(ac)
    self.do_T_a()
    self.do_TJ([s])
    return
|
|
|
|
|
|
|
|
# inline image
|
|
|
|
def do_BI(self):
    # never called
    return
def do_ID(self):
    # never called
    return
def do_EI(self, obj):
    '''Receives the inline image object; it is ignored.'''
    return
|
|
|
|
|
|
|
|
# invoke an XObject
|
|
|
|
def do_Do(self, xobjid):
    '''Handles the Do operator: renders the named XObject if it is a form.

    xobjid: name of an XObject previously registered by render_contents.

    Raises PDFInterpreterError when no XObject is registered under that
    name.  XObjects whose Subtype is not LITERAL_FORM are ignored.
    '''
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
    if xobj.dic['Subtype'] == LITERAL_FORM:
        if 1 <= self.debug:
            print >>stderr, 'Processing xobj: %r' % xobj
        # Both entries are optional in a form dictionary; a missing Matrix
        # means the identity matrix and a missing Resources means the form
        # declares no resources of its own.
        try:
            matrix = xobj.dic['Matrix']
        except KeyError:
            matrix = (1, 0, 0, 1, 0, 0)
        try:
            resources = xobj.dic['Resources']
        except KeyError:
            resources = {}
        # The form is rendered by a fresh interpreter that shares this one's
        # resource manager, device and debug level.
        interpreter = PDFPageInterpreter(self.rsrc, self.device, debug=self.debug)
        interpreter.render_contents(xobjid, resources, [xobj], matrix)
    return
|
|
|
|
|
|
|
|
def process_page(self, page):
    '''Renders one page through render_contents.

    page: object providing pageid, resources and contents attributes.
      The block is named 'page-<pageid>'.
    '''
    if self.debug >= 1:
        print >>stderr, 'Processing page: %r' % page
    block_name = 'page-%d' % page.pageid
    self.render_contents(block_name, page.resources, page.contents)
    return
|
|
|
|
|
|
|
|
def render_contents(self, contid, resources, contents, ctm=(1, 0, 0, 1, 0, 0)):
    '''Registers the given resources and executes the content streams.

    contid: name passed to the device's begin_block.
    resources: mapping of resource category to its declaration; only the
      'Font', 'ColorSpace', 'ProcSet' and 'XObject' categories are used.
    contents: iterable of content streams, executed in order.
    ctm: matrix passed to initpage before anything else happens.

    The whole run is bracketed by begin_block/end_block on the device.
    '''
    self.initpage(ctm)
    device = self.device
    device.begin_block(contid)
    # Handle resource declarations.
    for (category, value) in resources.iteritems():
        if self.debug >= 1:
            print >>stderr, 'Resource: %r: %r' % (category, value)
        if category == 'Font':
            fonts = dict_value(value)
            for (fontid, fontspec) in fonts.iteritems():
                self.fontmap[fontid] = self.rsrc.get_font(fontid, fontspec)
        elif category == 'ColorSpace':
            colorspaces = dict_value(value)
            for (csid, csspec) in colorspaces.iteritems():
                self.csmap[csid] = list_value(csspec)
        elif category == 'ProcSet':
            procs = list_value(value)
            self.rsrc.get_procset(procs)
        elif category == 'XObject':
            # XObjects are stored unresolved; do_Do resolves them on use.
            xobjects = dict_value(value)
            for (xobjid, xobjstrm) in xobjects.iteritems():
                self.xobjmap[xobjid] = xobjstrm
    for item in contents:
        self.execute(stream_value(item))
    self.device.end_block()
    return
|
|
|
|
|
|
|
|
def execute(self, stream):
    '''Runs every operator found in a content stream.

    Objects that are not PSKeyword instances are pushed as operands.  A
    keyword is dispatched to the method named 'do_<name>', where '*', the
    double quote and the single quote in the operator name are spelled
    '_a', '_w' and '_q'.  The number of operands popped for an operator is
    the handler's argument count, not counting self; when fewer operands
    come back from pop() the operator is silently skipped.

    Raises PDFInterpreterError for an operator without a handler.
    '''
    for token in stream.parse_data(inline=True, debug=self.debug):
        if not isinstance(token, PSKeyword):
            self.push(token)
            continue
        opname = token.name
        method = 'do_' + opname.replace('*','_a').replace('"','_w').replace("'",'_q')
        if not hasattr(self, method):
            raise PDFInterpreterError('unknown operator: %r' % opname)
        handler = getattr(self, method)
        argc = handler.func_code.co_argcount-1
        if not argc:
            if self.debug >= 1:
                print >>stderr, 'exec: %s' % (opname)
            handler()
            continue
        operands = self.pop(argc)
        if self.debug >= 1:
            print >>stderr, 'exec: %s %r' % (opname, operands)
        if len(operands) == argc:
            handler(*operands)
    return
|
|
|
|
|
|
|
|
|
|
|
|
## PDFDevice
|
|
|
|
##
|
|
|
|
class PDFDevice:
    '''Base class for the output devices driven by the page interpreter.

    It keeps the resource manager and the current transformation matrix;
    the block hooks do nothing and render_string must be overridden.
    '''

    def __init__(self, rsrc):
        '''Stores rsrc and starts without a transformation matrix.'''
        self.ctm = None
        self.rsrc = rsrc
        return

    def __repr__(self):
        return '<PDFDevice>'

    def set_ctm(self, ctm):
        '''Records ctm as the current transformation matrix.'''
        self.ctm = ctm
        return

    def begin_block(self, name):
        '''Hook invoked before a block is rendered; a no-op here.'''
        return

    def end_block(self):
        '''Hook invoked after a block is rendered; a no-op here.'''
        return

    def render_string(self, textstate, textmatrix, size, seq):
        '''Renders one shown text sequence; subclasses must implement it.'''
        raise NotImplementedError()
|
|
|
|
|
|
|
|
|
|
|
|
## TextConverter
|
|
|
|
##
|
|
|
|
class TextConverter(PDFDevice):
|
|
|
|
|
|
|
|
def __init__(self, rsrc, codec, outfp=sys.stdout):
|
|
|
|
PDFDevice.__init__(self, rsrc)
|
|
|
|
self.outfp = outfp
|
|
|
|
self.codec = codec
|
|
|
|
return
|
|
|
|
|
2007-12-31 02:40:32 +00:00
|
|
|
def begin_block(self, name):
|
|
|
|
self.outfp.write('<block name="%s">\n' % name)
|
2007-12-30 09:13:51 +00:00
|
|
|
return
|
2007-12-31 02:40:32 +00:00
|
|
|
def end_block(self):
|
|
|
|
self.outfp.write('</block>\n')
|
2007-12-30 09:13:51 +00:00
|
|
|
return
|
|
|
|
|
2007-12-31 02:40:32 +00:00
|
|
|
def render_string(self, textstate, textmatrix, size, seq):
|
2007-12-30 09:13:51 +00:00
|
|
|
font = textstate.font
|
|
|
|
spwidth = int(-font.char_width(32) * 0.6) # space width
|
2007-12-31 02:40:32 +00:00
|
|
|
buf = ''
|
2007-12-30 09:13:51 +00:00
|
|
|
for x in seq:
|
|
|
|
if isinstance(x, int) or isinstance(x, float):
|
|
|
|
if not font.is_vertical() and x <= spwidth:
|
|
|
|
buf += ' '
|
|
|
|
else:
|
|
|
|
chars = font.decode(x)
|
|
|
|
for cid in chars:
|
|
|
|
try:
|
|
|
|
char = font.to_unicode(cid)
|
|
|
|
except PDFUnicodeNotDefined, e:
|
|
|
|
(cidcoding, cid) = e.args
|
|
|
|
char = u'[%s:%d]' % (cidcoding, cid)
|
|
|
|
buf += char
|
2007-12-31 02:40:32 +00:00
|
|
|
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
|
|
|
|
skewed = (b != 0 or c != 0)
|
2007-12-30 09:13:51 +00:00
|
|
|
if font.is_vertical():
|
2007-12-31 02:40:32 +00:00
|
|
|
size = -size
|
|
|
|
tag = 'vtext'
|
2007-12-30 09:13:51 +00:00
|
|
|
else:
|
2007-12-31 02:40:32 +00:00
|
|
|
tag = 'htext'
|
|
|
|
if skewed:
|
|
|
|
tag += ' skewed'
|
|
|
|
s = buf.encode(self.codec, 'xmlcharrefreplace')
|
|
|
|
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
|
|
|
|
def f(x): return '%.03f' % x
|
|
|
|
self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
|
|
|
|
(tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
|
2007-12-30 09:13:51 +00:00
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
# main
|
|
|
|
def main(argv):
    '''Command-line entry point: converts the given PDF files to text.

    argv: full argument vector, program name first.  Options:
      -d        raise the debug level (may be repeated)
      -v        raise the verbose level (parsed but currently unused)
      -c codec  output codec (default 'ascii')
      -p page   only process this zero-based page index (may be repeated)

    Returns 100 after printing the usage line when the arguments cannot
    be used, and None once every file has been processed.
    '''
    import getopt
    def usage():
        sys.stdout.write('usage: %s [-d] [-v] [-c codec] [-p pages] file ...\n' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    (debug, verbose) = (0, 0)
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-v': verbose += 1
        elif k == '-p':
            # A non-numeric page is a usage error, not a traceback.
            try:
                pages.add(int(v))
            except ValueError:
                return usage()
        elif k == '-c': codec = v
    #
    CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
    rsrc = PDFResourceManager(debug=debug)
    device = TextConverter(rsrc, codec)
    for fname in args:
        doc = PDFDocument(debug=debug)
        # PDF is a binary format: text mode would translate line endings
        # and stop at Ctrl-Z on platforms that distinguish the two modes.
        fp = open(fname, 'rb')
        try:
            parser = PDFParser(doc, fp, debug=debug)
            interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
            for (i,page) in enumerate(doc.get_pages(debug=debug)):
                if pages and (i not in pages): continue
                interpreter.process_page(page)
        finally:
            # Release the file even when parsing or rendering fails.
            fp.close()
    return
|
|
|
|
|
|
|
|
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
|