pdfminer.six/pdfparser.py

2217 lines
59 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# pdfparser.py, Yusuke Shinyama
# ver 0.1, Dec 24 2004-
# ver 0.2, Dec 24 2007
# TODO:
# - Code Documentation.
# - Error handling for invalid type.
# - Outlines.
# - Named Objects. (pages)
# - Writers.
# - Linearized PDF.
# - Encryption?
import sys, re
from struct import pack, unpack
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
try:
import cdb
except ImportError:
import pycdb as cdb
stderr = sys.stderr
## Utilities
##
def choplist(n, seq):
    '''Yields consecutive n-tuples taken from seq.

    Items left over at the end that do not fill a complete group
    are dropped.
    '''
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []
def nunpack(s, default=0):
    '''Decodes a big-endian unsigned integer stored in up to 4 bytes.

    s: byte string of length 0 to 4.
    default: value returned when s is empty.

    Raises TypeError when s is longer than 4 bytes.
    '''
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return unpack('>H', s)[0]
    elif l == 3:
        # pad to 4 bytes so that the 32-bit format can be used.
        return unpack('>L', '\x00'+s)[0]
    elif l == 4:
        return unpack('>L', s)[0]
    else:
        # the exception used to be returned instead of raised, which
        # handed an exception object to callers expecting an integer.
        raise TypeError('invalid length: %d' % l)
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Each matrix is a 6-tuple (a, b, c, d, e, f). The result is the
    6-tuple computed from m1 and m0 by the formulas below.
    '''
    # unpack inside the body: tuple parameters in the signature are
    # not portable (removed from the language by PEP 3113).
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def apply_matrix(m, v):
    '''Applies a matrix to a coordinate.

    m: 6-tuple (a, b, c, d, e, f).
    v: 2-tuple (x, y).

    Returns the transformed point (a*x+c*y+e, b*x+d*y+f).
    '''
    # unpack inside the body: tuple parameters in the signature are
    # not portable (removed from the language by PEP 3113).
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y+e, b*x+d*y+f)
## Exceptions
##
# Base class of every exception defined in this module.
class PSException(Exception): pass
# PostScript-level errors: malformed input, wrong object type, bad value.
class PSSyntaxError(PSException): pass
class PSTypeError(PSException): pass
class PSValueError(PSException): pass
# Base class of the PDF-level errors (itself a PSException).
class PDFException(PSException): pass
class PDFSyntaxError(PDFException): pass
class PDFEncrypted(PDFException): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFResourceError(PDFException): pass
class PDFInterpreterError(PDFException): pass
# Font-related errors.
class PDFFontError(PDFException): pass
# Raised by the font classes when a character id has no Unicode mapping.
class PDFUnicodeNotDefined(PDFFontError): pass
## PostScript Types
##
class PSLiteral:
    '''
    A PostScript literal name (written "/Name" in the source).

    Caution: do not instantiate this class directly; obtain
    instances through PSLiteralTable.intern() so that every name
    is represented by a single shared object.
    '''

    def __repr__(self):
        return '/%s' % self.name

    def __init__(self, name):
        # the name is stored without the leading slash.
        self.name = name
class PSKeyword:
    '''
    A PostScript keyword (for example "showpage").

    Caution: do not instantiate this class directly; obtain
    instances through PSKeywordTable.intern() so that every keyword
    is represented by a single shared object.
    '''

    def __repr__(self):
        return self.name

    def __init__(self, name):
        self.name = name
class PSSymbolTable:
    '''
    Interning table for PSLiteral or PSKeyword objects.

    classe is called with a name the first time that name is
    interned; later requests for the same name return the same object.
    '''

    def __init__(self, classe):
        self.classe = classe
        self.dic = {}

    def intern(self, name):
        '''Returns the unique symbol object registered for name.'''
        try:
            return self.dic[name]
        except KeyError:
            pass
        symbol = self.classe(name)
        self.dic[name] = symbol
        return symbol
# Module-wide interning tables: one for literal names, one for keywords.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
# some predefined literals and keywords.
# Interned once here so that the rest of the module can compare
# parsed names against these shared objects.
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
LITERAL_PDF = PSLiteralTable.intern('PDF')
LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_EI = PSKeywordTable.intern('EI')
## CMap
##
class CMap:
    '''
    In-memory character map.

    code2cid maps character code strings to integer character ids,
    cid2code maps integers to code strings (filled from bfchar/bfrange
    sections by CMapParser), and attrs holds the values defined in the
    CMap with "def" (for example CMapName and WMode).
    '''

    def __init__(self, debug=0):
        # the debug argument used to be ignored (always stored as 0).
        self.debug = debug
        self.code2cid = {}
        self.cid2code = {}
        self.attrs = {}
        return

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def update(self, code2cid=None, cid2code=None):
        '''Merges the given mappings into this CMap and returns self.'''
        if code2cid:
            self.code2cid.update(code2cid)
        if cid2code:
            self.cid2code.update(cid2code)
        return self

    def copycmap(self, cmap):
        '''Copies every mapping of another CMap into this one.'''
        self.code2cid.update(cmap.getall_code2cid())
        self.cid2code.update(cmap.getall_cid2code())
        return self

    def register_code2cid(self, code, cid):
        '''Registers a single code -> cid mapping and returns self.'''
        assert isinstance(code, str)
        assert isinstance(cid, int)
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        '''Registers a single cid -> code mapping and returns self.

        When code is a PSLiteral it is taken as a glyph name and
        converted to a 2-byte big-endian code through the glyph list.
        '''
        from glyphlist import charname2unicode
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            code = pack('>H', charname2unicode[code.name])
        self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        '''Yields the cids for a byte string.

        A byte that has no mapping by itself is kept as a prefix and
        looked up together with the following byte; if that pair is
        unknown too, both bytes are skipped.
        '''
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            else:
                x = c
        return

    def is_vertical(self):
        '''Returns True when the CMap declares vertical writing mode.'''
        # CMapParser stores the operand of "/WMode 1 def" as the
        # integer 1, so comparing with the string '1' alone never
        # matched; both spellings are accepted.
        return self.attrs.get('WMode', 0) in (1, '1')

    def tocid(self, code):
        '''Returns the cid for a code, or None if it is not mapped.'''
        return self.code2cid.get(code)

    def tocode(self, cid):
        '''Returns the code for a cid, or None if it is not mapped.'''
        return self.cid2code.get(cid)

    def getall_attrs(self):
        return self.attrs.iteritems()

    def getall_code2cid(self):
        return self.code2cid.iteritems()

    def getall_cid2code(self):
        return self.cid2code.iteritems()
## CDBCMap
##
class CDBCMap(CMap):
    '''
    CMap backed by a cdb database file.

    As used by this class, the database holds three kinds of keys:
    'c'+code -> cid packed as '>L', 'i'+cid packed as '>L' -> code,
    and '/'+name -> attribute value. Code lookups made by decode()
    are cached in self.code2cid.
    '''

    def __init__(self, cdbname, debug=0):
        CMap.__init__(self, debug=debug)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        '''Returns the integer cid for a code, or None if unmapped.'''
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() returns a 1-tuple; return the integer itself, as
        # CMap.tocid() and decode() do.
        return unpack('>L', self.db[k])[0]

    def tocode(self, cid):
        '''Returns the code string for a cid, or None if unmapped.'''
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def getall(self, c):
        '''Yields (key without prefix, value unpacked as '>L') for
        every record whose key starts with c.'''
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith(c):
                yield (k[1:], unpack('>L', v)[0])
        return

    def getall_attrs(self):
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('/'):
                # NOTE(review): eval() executes whatever the database
                # contains; only open .cdb files from a trusted source.
                yield (k[1:], eval(v)[0])
        return

    def getall_cid2code(self):
        '''Yields (cid, code) pairs.'''
        # For 'i' records the packed integer is in the key and the
        # code string is the value (see tocode), so the generic
        # getall(), which unpacks the value, cannot be used here.
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith('i'):
                yield (unpack('>L', k[1:])[0], v)
        return

    def getall_code2cid(self):
        '''Yields (code, cid) pairs.'''
        return self.getall('c')

    def decode(self, bytes):
        '''Yields the cids for a byte string, consulting the cache
        first and the database second (see CMap.decode for the
        one-or-two byte matching rule).'''
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return
## CMapDB
##
class CMapDB:
    '''
    Process-wide registry of CMaps, loaded on demand from disk.

    initialize() must be called before get_cmap() so that the search
    directories are known. Loaded CMaps are cached in cmapdb.
    '''

    # optional mapping from requested name to the name actually loaded.
    CMAP_ALIAS = {
    }

    debug = 0
    dirname = None
    cdbdirname = None
    cmapdb = {}

    @classmethod
    def initialize(klass, dirname, cdbdirname=None, debug=0):
        '''Sets the directory of plain CMap files and, optionally, a
        separate directory for compiled .cmap.cdb files.'''
        klass.dirname = dirname
        klass.cdbdirname = cdbdirname or dirname
        klass.debug = debug
        return

    @classmethod
    def get_cmap(klass, cmapname):
        '''Returns the CMap registered under cmapname, loading it if
        necessary. A compiled <name>.cmap.cdb file is preferred over
        the plain CMap file.

        Raises KeyError when neither file exists.
        '''
        import os.path
        cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in klass.cmapdb:
            return klass.cmapdb[cmapname]
        fname = os.path.join(klass.dirname, cmapname)
        cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
        if os.path.exists(cdbname):
            if 1 <= klass.debug:
                print >>stderr, 'Opening: CDBCMap %r...' % cdbname
            cmap = CDBCMap(cdbname)
        elif os.path.exists(fname):
            if 1 <= klass.debug:
                print >>stderr, 'Reading: CMap %r...' % fname
            cmap = CMap()
            fp = open(fname)
            try:
                CMapParser(cmap, fp).parse()
            finally:
                # close the file even when parsing fails.
                fp.close()
        else:
            # this case used to fall through and fail with an
            # UnboundLocalError on the cache assignment below.
            raise KeyError(cmapname)
        klass.cmapdb[cmapname] = cmap
        return cmap
## FontMetricsDB
##
class FontMetricsDB:
    '''Lookup of the built-in font metrics table by font name.'''

    # imported in the class body so that the table becomes a class
    # attribute.
    from fontmetrics import FONT_METRICS

    @classmethod
    def get_metrics(cls, fontname):
        '''Returns the FONT_METRICS entry for fontname; an unknown
        name raises KeyError.'''
        metrics = cls.FONT_METRICS
        return metrics[fontname]
## EncodingDB
##
class EncodingDB:
    '''
    Tables that map character codes of the built-in simple-font
    encodings to Unicode characters. They are built once, when the
    class body is executed, from the ENCODING table of latin_enc.
    '''

    from glyphlist import charname2unicode
    from latin_enc import ENCODING

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # each ENCODING row is (glyph name, std, mac, win, pdf); a false
    # code means the glyph is absent from that encoding.
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(charname2unicode[name])
        if std: std2unicode[std] = c
        if mac: mac2unicode[mac] = c
        if win: win2unicode[win] = c
        if pdf: pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        '''Returns the code -> Unicode table for the named encoding.

        Unknown names fall back to the standard encoding. diff is an
        optional list of integers and PSLiterals: an integer sets the
        current code, and each following literal is assigned to the
        current code, which is then incremented. Literals with an
        unknown glyph name are skipped but still advance the code.
        When diff is given a modified copy is returned; the shared
        table is left untouched.
        '''
        base = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return base
        cid2unicode = base.copy()
        cid = 0
        for x in diff:
            if isinstance(x, int):
                cid = x
                continue
            if not isinstance(x, PSLiteral):
                continue
            try:
                cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name])
            except KeyError:
                pass
            cid += 1
        return cid2unicode
## Color Spaces
##
# Interned names of the colour spaces that the code refers to directly.
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')
# Value returned by cs_params() for each colour space that is neither
# ICCBased nor DeviceN (those two are computed from their parameters).
CS_COMPONENTS = {
PSLiteralTable.intern('CalRGB'): 3,
PSLiteralTable.intern('CalGray'): 1,
PSLiteralTable.intern('Lab'): 3,
PSLiteralTable.intern('DeviceRGB'): 3,
PSLiteralTable.intern('DeviceCMYK'): 4,
PSLiteralTable.intern('DeviceGray'): 1,
PSLiteralTable.intern('Separation'): 1,
PSLiteralTable.intern('Indexed'): 1,
PSLiteralTable.intern('Pattern'): 1,
}
def cs_params(cs):
    '''Returns the component count for a colour space specification.

    cs is a sequence whose first element is the colour space name.
    For ICCBased the count is the N entry of the stream in cs[1];
    for DeviceN it is the length of the list in cs[1]; every other
    name is looked up in CS_COMPONENTS (KeyError if unknown).
    '''
    kind = cs[0]
    if kind == LITERAL_ICC_BASED:
        profile = stream_value(cs[1])
        return profile.dic['N']
    if kind == LITERAL_DEVICE_N:
        return len(list_value(cs[1]))
    return CS_COMPONENTS[kind]
## PSBaseParser
##
class PSBaseParser:
    '''
    PostScript parser that performs only basic tokenization.

    fp must be a seekable file-like object. linepos tracks the file
    offset of the line that will be read next; linebuf/curpos hold
    the chunk currently buffered and the read position inside it.
    '''

    def __init__(self, fp, debug=0):
        self.fp = fp
        self.debug = debug
        self.bufsize = 4096
        self.seek(0)
        return

    def __repr__(self):
        return '<PSBaseParser: %r>' % (self.fp,)

    def seek(self, pos):
        '''
        Seeks to the given pos and drops any buffered input.
        '''
        if 2 <= self.debug:
            print >>stderr, 'seek:', pos
        self.fp.seek(pos)
        self.linepos = pos
        self.linebuf = None
        self.curpos = 0
        self.line = ''
        return

    EOLCHAR = re.compile(r'[\r\n]')

    def nextline(self):
        '''
        Fetches the next line, which ends with \\r, \\n or \\r\\n.
        The terminator is kept. Returns '' at end of file.
        '''
        line = ''
        eol = None
        while 1:
            if not self.linebuf or len(self.linebuf) <= self.curpos:
                # fetch next chunk.
                self.linebuf = self.fp.read(self.bufsize)
                if not self.linebuf:
                    # at EOF.
                    break
                self.curpos = 0
            if eol:
                # a terminator was already consumed; peek one more
                # character so that '\r\n' is treated as one EOL even
                # when it straddles two chunks.
                c = self.linebuf[self.curpos]
                if (eol == '\r' and c == '\n'):
                    line += c
                    self.curpos += 1
                break
            m = self.EOLCHAR.search(self.linebuf, self.curpos)
            if m:
                i = m.end(0)
                line += self.linebuf[self.curpos:i]
                eol = self.linebuf[i-1]
                self.curpos = i
            else:
                # no terminator in this chunk: fetch further
                line += self.linebuf[self.curpos:]
                self.linebuf = None
        self.linepos += len(line)
        return line

    def revreadlines(self):
        '''
        Fetches lines backward from the end of the file; used to
        locate trailers. Each yielded line starts with the EOL
        character that precedes it. The very first line of the file
        (which has no preceding EOL) is not yielded.
        '''
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        buf = ''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.bufsize)
            self.fp.seek(pos)
            # read exactly up to the previous chunk; reading a full
            # bufsize here re-read bytes that were already processed
            # whenever fewer than bufsize bytes remained.
            s = self.fp.read(prevpos-pos)
            if not s: break
            while 1:
                n = max(s.rfind('\r'), s.rfind('\n'))
                if n == -1:
                    # partial line: keep it until its start shows up
                    # in an earlier chunk.
                    buf = s + buf
                    break
                # buf is the tail of this line carried over from the
                # later chunk, so it goes after s[n:], not before.
                yield s[n:]+buf
                s = s[:n]
                buf = ''
        return

    SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
    TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
    LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
    # accepts '12', '-3', '4.', '1.5' and '.5'; tokens such as '1.2.3'
    # are no longer taken for numbers (float() raised on them).
    NUMBER = re.compile(r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)$')
    STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
    STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
    STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
    STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
    # single-letter escapes of literal strings. Any other escaped
    # character stands for itself.
    STRING_ESCAPES = {'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'f': '\f'}

    def parse(self):
        '''
        Yields (pos, token) pairs for the basic tokens: keywords,
        literals, strings, numbers, booleans and parentheses.
        Comments are skipped. Nested objects (i.e. arrays and
        dictionaries) are not handled. Parsing stops at the end of
        the file or at a line starting with '%%EOF'.

        Raises PSSyntaxError for unterminated strings.

        Known limits (unchanged): balanced parentheses inside a
        literal string and hex strings spanning several lines are
        not supported.
        '''
        escapes = self.STRING_ESCAPES

        def convesc(m):
            x = m.group(0)[1:]
            if x[0] in '01234567':
                # octal escape; overflow above one byte is ignored.
                return chr(int(x, 8) & 255)
            return escapes.get(x, x)

        def convhex(m1):
            # a lone final digit is read as if followed by '0'.
            return chr(int(m1.group(0).ljust(2, '0'), 16))

        while 1:
            # do not strip line! we need to distinguish last '\n' or '\r'
            linepos0 = self.linepos
            self.line = self.nextline()
            if not self.line: break
            if 2 <= self.debug:
                print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
            # do this before removing comment
            if self.line.startswith('%%EOF'): break
            charpos = 0
            # tokenize
            while 1:
                m = self.TOKEN.search(self.line, charpos)
                if not m: break
                t = m.group(0)
                pos = linepos0 + m.start(0)
                charpos = m.end(0)
                if t == '%':
                    # skip comment
                    if 2 <= self.debug:
                        print >>stderr, 'comment: %r' % self.line[charpos:]
                    break
                elif t == '/':
                    # literal object
                    mn = self.LITERAL.match(self.line, m.start(0)+1)
                    if mn:
                        name = mn.group(0)
                        charpos = mn.end(0)
                    else:
                        # a slash followed by a delimiter: empty name.
                        name = ''
                    lit = PSLiteralTable.intern(name)
                    yield (pos, lit)
                    if 2 <= self.debug:
                        print >>stderr, 'name: %r' % lit
                elif t == '(':
                    # normal string object
                    s = ''
                    while 1:
                        ms = self.STRING_NORM.match(self.line, charpos)
                        if not ms: break
                        s1 = ms.group(0)
                        charpos = ms.end(0)
                        if len(s1) == 1 and s1[-1] == '\\':
                            s += s1[-1:]
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        elif charpos == len(self.line):
                            # the string continues on the next line.
                            s += s1
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        else:
                            s += s1
                            break
                    if len(self.line) <= charpos or self.line[charpos] != ')':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    s = self.STRING_NORM_SUB.sub(convesc, s)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)
                elif t == '<':
                    # hex string object
                    ms = self.STRING_HEX.match(self.line, charpos)
                    if ms:
                        hexdata = ms.group(0)
                        charpos = ms.end(0)
                    else:
                        # '<>' is a valid, empty hex string.
                        hexdata = ''
                    if len(self.line) <= charpos or self.line[charpos] != '>':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    # white space between hex digits is insignificant
                    # and must not end up in the decoded string.
                    hexdata = ''.join(hexdata.split())
                    s = self.STRING_HEX_SUB.sub(convhex, hexdata)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)
                elif self.NUMBER.match(t):
                    # number
                    if '.' in t:
                        n = float(t)
                    else:
                        n = int(t)
                    if 2 <= self.debug:
                        print >>stderr, 'number: %r' % n
                    yield (pos, n)
                elif t in ('true','false'):
                    # boolean
                    if 2 <= self.debug:
                        print >>stderr, 'boolean: %r' % t
                    yield (pos, (t == 'true'))
                else:
                    # other token
                    if 2 <= self.debug:
                        print >>stderr, 'keyword: %r' % t
                    yield (pos, PSKeywordTable.intern(t))
        return
## PSStackParser
##
class PSStackParser(PSBaseParser):
    '''
    PostScript parser that also assembles compound objects
    (arrays and dictionaries) on top of the basic tokenizer.

    partobj is the list of objects collected for the innermost open
    compound object; context is the stack of enclosing
    (type, partobj) pairs.
    '''

    def __init__(self, fp, debug=0):
        PSBaseParser.__init__(self, fp, debug=debug)
        self.partobj = None
        self.context = []

    def do_token(self, pos, token):
        '''
        Hook for keywords that this class does not handle itself.
        Subclasses return true to signal the end of an object.
        '''
        return False

    def push(self, obj):
        '''
        Appends an object to the current stack.
        '''
        self.partobj.append(obj)

    def pop(self, n):
        '''
        Removes the last N objects from the stack and returns them
        as a list. Raises PSSyntaxError if fewer are available.
        '''
        stack = self.partobj
        if len(stack) < n:
            raise PSSyntaxError('stack too short < %d' % n)
        popped = stack[-n:]
        self.partobj = stack[:-n]
        return popped

    def popall(self):
        '''
        Throws away every object on the stack.
        '''
        self.partobj = []

    def parse(self):
        '''
        Returns the list of objects parsed from the input: keywords,
        literals, strings, numbers, arrays and dictionaries. Arrays
        and dictionaries are represented as Python lists and dicts.
        Parsing stops early when do_token() returns true.
        '''
        def begin(kind):
            # remember the enclosing object and start a fresh one.
            self.context.append((kind, self.partobj))
            self.partobj = []

        def finish(expected):
            # close the current object and restore the enclosing one.
            assert self.context
            finished = self.partobj
            (opened, self.partobj) = self.context.pop()
            if opened != expected:
                raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
                                  (opened, self.partobj, expected, finished))
            return finished

        begin('o')
        for (pos,t) in PSBaseParser.parse(self):
            if isinstance(t, (int, float, str, PSLiteral)):
                # plain values go straight onto the stack.
                self.push(t)
                continue
            c = keyword_name(t)
            if c in ('{', '}'):
                self.push(t)
            elif c == '[':
                # begin array
                if 2 <= self.debug:
                    print >>stderr, 'start array'
                begin('a')
            elif c == ']':
                # end array
                a = finish('a')
                if 2 <= self.debug:
                    print >>stderr, 'end array: %r' % a
                self.push(a)
            elif c == '<<':
                # begin dictionary
                if 2 <= self.debug:
                    print >>stderr, 'start dict'
                begin('d')
            elif c == '>>':
                # end dictionary
                objs = finish('d')
                if len(objs) % 2 != 0:
                    raise PSTypeError('invalid dictionary construct: %r' % objs)
                d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
                if 2 <= self.debug:
                    print >>stderr, 'end dict: %r' % d
                self.push(d)
            elif self.do_token(pos, t):
                break
        return finish('o')
## CMapParser
##
class CMapParser(PSStackParser):
    '''
    Parser that reads a CMap program and fills a CMap object.

    Operands accumulate on the parser stack; each recognised operator
    consumes them. Operators outside begincmap ... endcmap are ignored.
    '''

    # operators that only open a section: the stack is cleared so
    # that the matching end operator sees just the section's operands.
    SECTION_OPENERS = ('begincodespacerange', 'begincidrange',
                       'begincidchar', 'beginbfrange',
                       'beginbfchar', 'beginnotdefrange')

    def __init__(self, cmap, fp, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.in_cmap = False
        self.cmap = cmap

    def do_token(self, pos, token):
        '''Handles one CMap operator; never signals end of object.'''
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
        elif name == 'endcmap':
            self.in_cmap = False
        elif not self.in_cmap:
            pass
        elif name == 'def':
            try:
                (k,v) = self.pop(2)
                self.cmap.attrs[literal_name(k)] = v
            except PSSyntaxError:
                pass
        elif name == 'usecmap':
            try:
                (cmapname,) = self.pop(1)
                self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass
        elif name in self.SECTION_OPENERS:
            self.popall()
        elif name == 'endcodespacerange':
            if 1 <= self.debug:
                print >>stderr, 'codespace: %r' % self.partobj
            self.popall()
        elif name == 'endcidrange':
            self._register_cidrange()
            self.popall()
        elif name == 'endcidchar':
            self._register_cidchar()
            self.popall()
        elif name == 'endbfrange':
            self._register_bfrange()
            self.popall()
        elif name == 'endbfchar':
            self._register_bfchar()
            self.popall()
        elif name == 'endnotdefrange':
            if 1 <= self.debug:
                print >>stderr, 'notdefrange: %r' % self.partobj
            self.popall()
        return

    def _register_cidrange(self):
        # each triple is (first code, last code, first cid); only the
        # last 4 bytes of the codes vary within a range.
        for (s,e,cid) in choplist(3, self.partobj):
            assert isinstance(s, str)
            assert isinstance(e, str)
            assert isinstance(cid, int)
            assert len(s) == len(e)
            sprefix = s[:-4]
            eprefix = e[:-4]
            assert sprefix == eprefix
            svar = s[-4:]
            evar = e[-4:]
            s1 = nunpack(svar)
            e1 = nunpack(evar)
            vlen = len(svar)
            assert s1 <= e1
            for i in xrange(e1-s1+1):
                x = sprefix+pack('>L',s1+i)[-vlen:]
                self.cmap.register_code2cid(x, cid+i)

    def _register_cidchar(self):
        for (cid,code) in choplist(2, self.partobj):
            assert isinstance(code, str)
            assert isinstance(cid, str)
            self.cmap.register_code2cid(code, nunpack(cid))

    def _register_bfrange(self):
        # each triple is (first, last, target); the target is either
        # a list with one entry per code or a start string whose last
        # 4 bytes are incremented.
        for (s,e,code) in choplist(3, self.partobj):
            assert isinstance(s, str)
            assert isinstance(e, str)
            assert len(s) == len(e)
            s1 = nunpack(s)
            e1 = nunpack(e)
            assert s1 <= e1
            if isinstance(code, list):
                for i in xrange(e1-s1+1):
                    self.cmap.register_cid2code(s1+i, code[i])
            else:
                var = code[-4:]
                base = nunpack(var)
                prefix = code[:-4]
                vlen = len(var)
                for i in xrange(e1-s1+1):
                    x = prefix+pack('>L',base+i)[-vlen:]
                    self.cmap.register_cid2code(s1+i, x)

    def _register_bfchar(self):
        for (cid,code) in choplist(2, self.partobj):
            assert isinstance(cid, str)
            assert isinstance(code, str)
            self.cmap.register_cid2code(nunpack(cid), code)
## PDFStream type
##
class PDFStream:
    '''
    A PDF stream object: a dictionary plus a block of data.

    rawdata holds the bytes as found in the file; data holds the
    decoded bytes once decode() has run (rawdata is then released).
    Only the FlateDecode filter is supported, optionally with the
    PNG predictor 12.
    '''

    def __init__(self, doc, dic, rawdata):
        self.doc = doc
        self.dic = dic
        self.rawdata = rawdata
        self.data = None
        return

    def __repr__(self):
        return '<PDFStream: %r>' % (self.dic,)

    def decode(self):
        '''Decodes rawdata into data according to the /Filter entry.

        Raises NotImplementedError for encrypted documents and
        PDFValueError for unsupported filters or predictors.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.doc.crypt:
            # decryption is not implemented yet...
            raise NotImplementedError
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        # /Filter and /DecodeParms may be indirect references, and
        # either may be a single value or an array (one per filter).
        filters = resolve1(self.dic['Filter'])
        if not isinstance(filters, list):
            filters = [ filters ]
        allparams = resolve1(self.dic.get('DecodeParms', {}))
        for (idx, f) in enumerate(filters):
            f = resolve1(f)
            if f != LITERAL_FLATE_DECODE:
                raise PDFValueError('Invalid filter spec: %r' % f)
            import zlib
            # will get errors if the document is encrypted.
            data = zlib.decompress(data)
            # apply predictors
            params = allparams
            if isinstance(params, list):
                if idx < len(params):
                    params = resolve1(params[idx])
                else:
                    params = None
            if not isinstance(params, dict):
                params = {}
            if 'Predictor' not in params:
                continue
            pred = int_value(params['Predictor'])
            # 1 means "no prediction" and needs no processing.
            if not pred or pred == 1:
                continue
            if pred != 12:
                raise PDFValueError('Unsupported predictor: %r' % pred)
            if 'Columns' not in params:
                raise PDFValueError('Columns undefined for predictor=12')
            columns = int_value(params['Columns'])
            rows = []
            ent0 = '\x00' * columns
            # every row is one filter-type byte followed by `columns`
            # data bytes.
            for i in xrange(0, len(data), columns+1):
                rowtype = data[i]
                ent1 = data[i+1:i+1+columns]
                if rowtype == '\x02':
                    # add the previous row byte by byte.
                    ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
                # NOTE(review): rows with any other filter type are
                # copied unchanged, as before.
                rows.append(ent1)
                ent0 = ent1
            data = ''.join(rows)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Returns the decoded data, decoding on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def parse_data(self, inline=False, debug=0):
        '''Parses the decoded data as PDF objects and returns them.'''
        return PDFParser(self.doc, StringIO(self.get_data()),
                         inline=inline, debug=debug).parse()
## PDFObjRef
##
class PDFObjRef:
    '''Indirect reference to an object of a PDF document.'''

    def __init__(self, doc, objid, genno):
        if objid == 0:
            raise PDFValueError('objid cannot be 0.')
        # genno is accepted but not stored: it is never used.
        self.objid = objid
        self.doc = doc

    def resolve(self):
        '''Fetches the referenced object from the document.'''
        return self.doc.getobj(self.objid)

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)
# resolve
def resolve1(x):
    '''
    Follows indirect references until a direct object is reached
    and returns it. An array or dictionary returned from here may
    still contain indirect objects inside.
    '''
    while 1:
        if not isinstance(x, PDFObjRef):
            return x
        x = x.resolve()
def resolveall(x):
    '''
    Recursively resolves X and everything nested inside it, so that
    no indirect reference remains within lists and dictionaries.
    Lists are rebuilt; dictionaries are updated in place.
    This procedure might be slow. Do not use it unless
    you really need it.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        return [ resolveall(item) for item in x ]
    if isinstance(x, dict):
        # only values of existing keys are replaced, so the set of
        # keys does not change while looping.
        for key in list(x):
            x[key] = resolveall(x[key])
    return x
# Type checking
def literal_name(x):
    '''Resolves x and returns its name; raises PDFTypeError unless
    the resolved object is a PSLiteral.'''
    x = resolve1(x)
    if isinstance(x, PSLiteral):
        return x.name
    raise PDFTypeError('literal required: %r' % x)
def keyword_name(x):
    '''Resolves x and returns its name; raises PDFTypeError unless
    the resolved object is a PSKeyword.'''
    x = resolve1(x)
    if isinstance(x, PSKeyword):
        return x.name
    raise PDFTypeError('keyword required: %r' % x)
def str_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is a str.'''
    x = resolve1(x)
    if isinstance(x, str):
        return x
    raise PDFTypeError('string required: %r' % x)
def int_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is an int.'''
    x = resolve1(x)
    if isinstance(x, int):
        return x
    raise PDFTypeError('integer required: %r' % x)
def float_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is a float.'''
    x = resolve1(x)
    if isinstance(x, float):
        return x
    raise PDFTypeError('float required: %r' % x)
def num_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is an int or a float.'''
    x = resolve1(x)
    if isinstance(x, (int, float)):
        return x
    raise PDFTypeError('int or float required: %r' % x)
def list_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is a list.'''
    x = resolve1(x)
    if isinstance(x, list):
        return x
    raise PDFTypeError('list required: %r' % x)
def dict_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is a dict.'''
    x = resolve1(x)
    if isinstance(x, dict):
        return x
    raise PDFTypeError('dict required: %r' % x)
def stream_value(x):
    '''Resolves x and returns it; raises PDFTypeError unless the
    resolved object is a PDFStream.'''
    x = resolve1(x)
    if isinstance(x, PDFStream):
        return x
    raise PDFTypeError('stream required: %r' % x)
## PDFPage
##
class PDFPage:
    '''
    A single page of a PDF document.

    attrs is the page dictionary and parent_attrs the dictionary of
    the node it was reached from; Resources and MediaBox are taken
    from the page itself or, failing that, from parent_attrs.
    contents is always a list (empty for a page without content).
    '''

    def __init__(self, doc, pageidx, attrs, parent_attrs):
        self.doc = doc
        self.pageid = pageidx
        self.attrs = dict_value(attrs)
        self.parent_attrs = parent_attrs
        self.resources = self.get_attr('Resources')
        self.mediabox = self.get_attr('MediaBox')
        # /Contents is optional: a page without it is simply blank
        # (this used to raise KeyError).
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)

    def get_attr(self, k):
        '''Returns the resolved value of attribute k, looked up on
        the page first and on parent_attrs second (None if absent).'''
        if k in self.attrs:
            return resolve1(self.attrs[k])
        # the inherited value may be an indirect reference as well.
        return resolve1(self.parent_attrs.get(k))
## XRefs
## PDFXRef
##
class PDFXRef:
    '''
    Classic cross-reference table ("xref" section) with its trailer.

    subsections is a list of (objid0, objid1, offsets) tuples, one
    per subsection, where offsets[i] is the (genno, pos, use) entry
    of object objid0+i. objid0, objid1 and offsets describe the last
    subsection read and are kept for compatibility.
    '''

    def __init__(self, parser):
        '''Reads the table from parser, which must be positioned
        right after the "xref" line, and then the trailer dictionary.

        Raises PDFSyntaxError on malformed input.
        '''
        self.subsections = []
        self.objid0 = 0
        self.objid1 = 0
        self.offsets = []
        while 1:
            line = parser.nextline()
            if not line:
                raise PDFSyntaxError('premature eof: %r' % parser)
            line = line.strip()
            if not line:
                # tolerate blank lines between subsections.
                continue
            f = line.split()
            if len(f) != 2:
                if line != 'trailer':
                    raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
                break
            (start, nobjs) = map(long, f)
            offsets = []
            for objid in xrange(start, start+nobjs):
                line = parser.nextline()
                f = line.strip().split()
                if len(f) != 3:
                    raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
                (pos, genno, use) = f
                offsets.append((int(genno), long(pos), use))
            # keep every subsection: the table used to be overwritten
            # by each new subsection so that only the last survived.
            self.subsections.append((start, start+nobjs, offsets))
            self.objid0 = start
            self.objid1 = start+nobjs
            self.offsets = offsets
        # read trailer
        self.trailer = dict_value(parser.parse()[0])
        return

    def getpos(self, objid):
        '''Returns (None, file offset) for an object in use.

        Raises IndexError when objid is not covered by this table and
        PDFValueError when its entry is marked as free.
        '''
        for (objid0, objid1, offsets) in self.subsections:
            if objid0 <= objid and objid < objid1:
                (genno, pos, use) = offsets[objid-objid0]
                if use != 'n':
                    raise PDFValueError('unused objid=%r' % objid)
                return (None, pos)
        raise IndexError(objid)
## PDFXRefStream
##
class PDFXRefStream:
    '''
    Cross-reference stream (PDF 1.5).

    index is the list of (first objid, count) pairs taken from the
    /Index entry (default: one pair covering 0..Size). data holds
    the decoded entries, entlen bytes each, in the order of index.
    objid0 and objid1 bound all the pairs and are kept for
    compatibility.
    '''

    def __init__(self, parser):
        (objid, genno, _, stream) = list_value(parser.parse())
        assert stream.dic['Type'] == LITERAL_XREF
        size = stream.dic['Size']
        # /Index may hold several (start, count) pairs; unpacking it
        # into exactly two values failed for such streams.
        index = list(stream.dic.get('Index', (0,size)))
        if len(index) % 2 != 0:
            raise PDFSyntaxError('invalid Index: %r' % index)
        self.index = [ (index[i], index[i+1]) for i in range(0, len(index), 2) ]
        if self.index:
            self.objid0 = min( start for (start,nobjs) in self.index )
            self.objid1 = max( start+nobjs for (start,nobjs) in self.index )
        else:
            self.objid0 = 0
            self.objid1 = 0
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.dic
        return

    def getpos(self, objid):
        '''Returns (None, file offset) for a plain object or
        (object stream id, index within it) for a compressed one.

        Raises IndexError when objid is not covered by this stream
        and PDFValueError when its entry is not in use.
        '''
        preceding = 0
        for (start, nobjs) in self.index:
            if start <= objid and objid < start+nobjs:
                i = self.entlen * (preceding + objid - start)
                break
            preceding += nobjs
        else:
            raise IndexError(objid)
        ent = self.data[i:i+self.entlen]
        # a zero-width type field defaults to type 1.
        f1 = nunpack(ent[:self.fl1], 1)
        if f1 == 1:
            pos = nunpack(ent[self.fl1:self.fl1+self.fl2])
            return (None, pos)
        elif f1 == 2:
            strmid = nunpack(ent[self.fl1:self.fl1+self.fl2])
            index = nunpack(ent[self.fl1+self.fl2:])
            return (strmid, index)
        # free (type 0) or unknown entry: returning None here made
        # the caller fail while unpacking the result.
        raise PDFValueError('unused objid=%r' % objid)
## PDFDocument
##
class PDFDocument:
    '''
    A PDF document: the xref tables, the catalog and an object cache.

    The document is populated by set_parser(), which PDFParser calls
    from its constructor. objs caches resolved objects by object id;
    parsed_objs caches the parsed contents of object streams by the
    object id of the stream.
    '''

    def __init__(self, debug=0):
        self.debug = debug
        self.xrefs = []
        self.objs = {}
        self.parsed_objs = {}
        self.crypt = None
        self.root = None
        self.catalog = None
        self.parser = None
        return

    def set_parser(self, parser):
        '''Attaches the main parser, reads all xref tables and
        locates the catalog. Later calls are ignored.

        Raises PDFValueError when no trailer has a /Root entry.
        '''
        if self.parser: return
        self.parser = parser
        self.xrefs = list(parser.read_xref())
        for xref in self.xrefs:
            trailer = xref.trailer
            if 'Encrypt' in trailer:
                self.crypt = dict_value(trailer['Encrypt'])
            if 'Root' in trailer:
                self.set_root(dict_value(trailer['Root']))
                break
        else:
            raise PDFValueError('no /Root object!')
        return

    def getobj(self, objid):
        '''Returns the object with the given id, reading it from the
        file or from its object stream on first access.

        Raises PDFValueError when no xref table knows objid and
        PDFSyntaxError when the stored object is malformed.
        '''
        assert self.xrefs
        if objid in self.objs:
            return self.objs[objid]
        for xref in self.xrefs:
            try:
                (strmid, index) = xref.getpos(objid)
                break
            except IndexError:
                pass
        else:
            raise PDFValueError('Cannot locate objid=%r' % objid)
        if strmid:
            # the object lives inside an object stream.
            stream = stream_value(self.getobj(strmid))
            if stream.dic['Type'] != LITERAL_OBJSTM:
                raise PDFSyntaxError('Not a stream object: %r' % stream)
            if 'N' not in stream.dic:
                raise PDFSyntaxError('N is not defined: %r' % stream)
            # cache by stream id: the lookup used strmid while the
            # store used the stream object, so the cache never hit.
            if strmid in self.parsed_objs:
                objs = self.parsed_objs[strmid]
            else:
                # debug must be passed by keyword; positionally it
                # was taken for the `inline` flag.
                objs = stream.parse_data(debug=self.debug)
                self.parsed_objs[strmid] = objs
            # the parsed list starts with N (objid, offset) pairs.
            obj = objs[int_value(stream.dic['N'])*2+index]
        else:
            # read "objid genno obj ..." at the given file offset and
            # restore the parser position afterwards.
            pos0 = self.parser.linepos
            self.parser.seek(index)
            seq = list_value(self.parser.parse())
            if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
                raise PDFSyntaxError('invalid stream spec: %r' % seq)
            obj = seq[3]
            self.parser.seek(pos0)
        if 2 <= self.debug:
            print >>stderr, 'register: objid=%r: %r' % (objid, obj)
        self.objs[objid] = obj
        return obj

    def get_pages(self, debug=0):
        '''Yields a PDFPage for every /Page node of the page tree,
        in document order.'''
        assert self.xrefs
        def search(obj, parent):
            tree = dict_value(obj)
            nodetype = tree.get('Type')
            if nodetype == LITERAL_PAGES:
                if 1 <= debug:
                    print >>stderr, 'Pages: Kids=%r' % tree['Kids']
                # /Kids itself may be an indirect reference.
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x
            elif nodetype == LITERAL_PAGE:
                if 1 <= debug:
                    print >>stderr, 'Page: %r' % tree
                yield (tree, parent)
        for (i,(tree,parent)) in enumerate(search(self.catalog['Pages'], self.catalog)):
            yield PDFPage(self, i, tree, parent)
        return

    def set_root(self, root):
        '''Registers the document catalog.

        Raises PDFValueError when root is not a /Catalog dictionary.
        '''
        self.root = root
        self.catalog = dict_value(self.root)
        if self.catalog['Type'] != LITERAL_CATALOG:
            raise PDFValueError('Catalog not found!')
        # the catalog entry is named /Outlines; the old /Outline
        # spelling is still consulted as a fallback.
        self.outline = self.catalog.get('Outlines', self.catalog.get('Outline'))
        return
## PDFParser
##
class PDFParser(PSStackParser):
    '''
    Parser for PDF files and content streams.

    Handles indirect references, stream objects and, when inline is
    true, inline images (BI ... ID ... EI). Constructing a parser
    registers it with doc through doc.set_parser().
    '''

    def __init__(self, doc, fp, inline=False, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.inline = inline
        self.doc = doc
        self.doc.set_parser(self)
        return

    def __repr__(self):
        return '<PDFParser: linepos=%d>' % self.linepos

    EOIPAT = re.compile(r'\nEI\W')

    def do_token(self, pos, token):
        '''Handles PDF keywords. Returns True for the keywords that
        end the current object (xref, trailer, startxref, endobj).'''
        name = keyword_name(token)
        if name in ('xref', 'trailer', 'startxref', 'endobj'):
            return True
        if name == 'R':
            # reference to indirect object
            try:
                (objid, genno) = self.pop(2)
                (objid, genno) = (int(objid), int(genno))
                obj = PDFObjRef(self.doc, objid, genno)
                self.push(obj)
                if 2 <= self.debug:
                    print >>stderr, 'refer obj: %r' % obj
            except PSSyntaxError:
                pass
        elif name == 'stream':
            # stream object
            (dic,) = self.pop(1)
            dic = dict_value(dic)
            if 'Length' not in dic:
                raise PDFValueError('/Length is undefined: %r' % dic)
            objlen = int_value(dic['Length'])
            # re-read the 'stream' line to learn the length of its
            # terminator, then read exactly objlen bytes after it.
            self.seek(pos)
            line = self.nextline()  # 'stream'
            self.fp.seek(pos+len(line))
            data = self.fp.read(objlen)
            self.seek(pos+len(line)+objlen)
            # skip blank lines up to the 'endstream' keyword.
            while 1:
                line = self.nextline()
                if not line:
                    raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
                                         (self.linepos, line))
                if line.strip():
                    if not line.startswith('endstream'):
                        raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
                                             (self.linepos, line))
                    break
            if 1 <= self.debug:
                print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
                      (pos, objlen, dic, data[:10])
            obj = PDFStream(self.doc, dic, data)
            self.push(obj)
        elif self.inline and name == 'BI':
            # inline image within a content stream
            self.context.append(('BI', self.partobj))
            self.partobj = []
        elif self.inline and name == 'ID':
            objs = self.partobj
            (type0, self.partobj) = self.context.pop()
            if len(objs) % 2 != 0:
                raise PSTypeError('invalid dictionary construct: %r' % objs)
            dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
            pos += len('ID ')
            self.fp.seek(pos)
            data = self.fp.read(8192)
            # XXX how do we know the real length other than scanning?
            m = self.EOIPAT.search(data)
            assert m
            objlen = m.start(0)
            obj = PDFStream(self.doc, dic, data[:objlen])
            self.push(obj)
            self.seek(pos+objlen+len('\nEI'))
            self.push(KEYWORD_EI)
        else:
            self.push(token)
        return False

    def find_xref(self):
        '''Positions the parser at the offset that follows the last
        "startxref" keyword of the file.

        Raises PDFSyntaxError when the keyword or a valid offset
        cannot be found.
        '''
        # lines are read backward, so `prev` is the line that follows
        # the current one in the file.
        prev = None
        for line in self.revreadlines():
            line = line.strip()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if line == 'startxref': break
            if line:
                prev = line
        else:
            raise PDFSyntaxError('startxref not found!')
        if 1 <= self.debug:
            print >>stderr, 'xref found: pos=%r' % prev
        try:
            pos = int(prev)
        except (TypeError, ValueError):
            # nothing, or something that is not a number, follows.
            raise PDFSyntaxError('invalid startxref offset: %r' % prev)
        self.seek(pos)
        return

    # read xref tables and trailers
    def read_xref(self):
        '''Yields the xref tables of the file, newest first,
        following the /Prev chain of the trailers.

        Raises PDFSyntaxError when an xref section cannot be found.
        '''
        self.find_xref()
        visited = set()
        while 1:
            # read xref table
            pos0 = self.linepos
            if pos0 in visited:
                # a /Prev chain that loops back would never end.
                break
            visited.add(pos0)
            line = self.nextline()
            if 2 <= self.debug:
                print >>stderr, 'line: %r' % line
            if not line:
                raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
                                     (self.linepos, line))
            if line[0].isdigit():
                # XRefStream: PDF-1.5
                self.seek(pos0)
                xref = PDFXRefStream(self)
            elif line.strip() != 'xref':
                raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
                                     (self.linepos, line))
            else:
                xref = PDFXRef(self)
            yield xref
            trailer = xref.trailer
            if 1 <= self.debug:
                print >>stderr, 'trailer: %r' % trailer
            if 'XRefStm' in trailer:
                # NOTE(review): this seek has no lasting effect; the
                # position is overridden by /Prev below or the loop
                # ends, so the stream at /XRefStm is never read.
                self.seek(int_value(trailer['XRefStm']))
            if 'Prev' in trailer:
                # find previous xref
                pos0 = int_value(trailer['Prev'])
                self.seek(pos0)
                if 1 <= self.debug:
                    print >>stderr, 'prev trailer: pos=%d' % pos0
            else:
                break
        return
## Fonts
##
# PDFFont
class PDFFont:
    '''
    Base class of the fonts.

    descriptor is the font descriptor dictionary and widths maps
    character ids to glyph widths; default_width is used for ids
    missing from widths.
    '''

    def __init__(self, fontid, descriptor, widths, default_width=None):
        self.fontid = fontid
        self.descriptor = descriptor
        self.widths = widths
        fontname = descriptor['FontName']
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname
        self.ascent = descriptor['Ascent']
        self.descent = descriptor['Descent']
        # an explicit default wins over the descriptor's MissingWidth.
        self.default_width = default_width or descriptor.get('MissingWidth', 0)
        self.leading = descriptor.get('Leading', 0)
        self.bbox = descriptor['FontBBox']

    def __repr__(self):
        return '<PDFFont: fontid=%r>' % (self.fontid,)

    def is_vertical(self):
        return False

    def decode(self, bytes):
        '''Converts a byte string to character ids, one per byte.'''
        return map(ord, bytes)

    def char_width(self, cid):
        '''Returns the width of cid, or the default width.'''
        widths = self.widths
        return widths.get(cid, self.default_width)

    def char_disp(self, cid):
        return 0

    def string_width(self, s):
        '''Returns the summed width of all characters encoded in s.'''
        total = 0
        for cid in self.decode(s):
            total += self.char_width(cid)
        return total
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
    '''
    Font with single-byte character codes.

    The mapping to Unicode comes from the /ToUnicode CMap when the
    font has one, and from the font's encoding otherwise.
    '''

    def __init__(self, fontid, descriptor, widths, spec):
        # The encoding is given either as the name of a built-in
        # encoding or as a dictionary that describes the differences
        # from a base encoding.
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        if isinstance(encoding, dict):
            basename = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            differences = encoding.get('Differences', None)
            self.encoding = EncodingDB.get_encoding(basename, differences)
        else:
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
        PDFFont.__init__(self, fontid, descriptor, widths)

    def to_unicode(self, cid):
        '''Returns the Unicode string for cid.

        Raises PDFUnicodeNotDefined when no mapping exists.
        '''
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
            if not code:
                raise PDFUnicodeNotDefined(None, cid)
            # the code is a sequence of 2-byte big-endian units.
            chars = unpack('>%dH' % (len(code)//2), code)
            return ''.join( unichr(c) for c in chars )
        try:
            return self.encoding[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
    '''
    Type 1 font. Metrics come from the built-in table when the base
    font is known there, and from the font dictionary otherwise.
    '''

    def __init__(self, fontid, spec):
        if 'BaseFont' not in spec:
            raise PDFFontError('BaseFont is missing')
        self.basefont = literal_name(spec['BaseFont'])
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # not a built-in font: the dictionary must carry its own
            # descriptor and width table.
            try:
                descriptor = dict_value(spec['FontDescriptor'])
                firstchar = int_value(spec['FirstChar'])
                # read only to check that the entry exists and is an
                # integer.
                lastchar = int_value(spec['LastChar'])
                widths = {}
                for (i,w) in enumerate(list_value(spec['Widths'])):
                    widths[i+firstchar] = w
            except KeyError as k:
                raise PDFFontError('%s is missing' % k)
        PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
    '''TrueType font; adds nothing to PDFType1Font.'''
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
def __init__(self, fontid, spec):
try:
firstchar = int_value(spec['FirstChar'])
lastchar = int_value(spec['LastChar'])
widths = dict( (i+firstchar,w) for (i,w)
in enumerate(list_value(spec['Widths'])) )
except KeyError, k:
raise PDFFontError('%s is missing' % k)
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
return
# PDFCIDFont
## TrueTypeFont
##
class TrueTypeFont:

    '''
    Minimal reader for a TrueType font program.
    Only the table directory and the 'cmap' table are interpreted.
    '''

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        '''
        Reads the table directory.

        name: font name; used to label the CMap made by create_cmap().
        fp:   seekable binary file object positioned at the start
              of the font program.
        '''
        self.name = name
        self.fp = fp
        self.tables = {}
        # The first four bytes are read but not interpreted.
        fp.read(4)
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for _ in xrange(ntables):
            # "tag" rather than "name", so the parameter is not shadowed.
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        '''
        Builds a CMap from the 'cmap' table of the font.

        Subtables of format 0, 2 and 4 are merged into one
        character-to-glyph mapping; other formats are skipped.
        Raises TrueTypeFont.CMapNotFound if the font has no
        'cmap' table.
        '''
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for i in xrange(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in xrange(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # The range offset counts from its own field, which
                    # is the last two bytes just read.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in xrange(entcount):
                        # unpack() returns a tuple; take the number itself.
                        gid = unpack('>H', fp.read(2))[0]
                        if gid:
                            gid += delta
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (i,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # A range offset is relative to the position of
                        # its own entry, i.e. pos + 2*i for segment i.
                        fp.seek(pos+2*i+idr)
                        for c in xrange(sc, ec+1):
                            gid = unpack('>H', fp.read(2))[0]
                            # Glyph 0 marks a missing glyph and gets
                            # no delta, as in the format 2 branch.
                            if gid:
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.iteritems() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        return CMap(cmapname).update(char2gid, gid2char)
class PDFCIDFont(PDFFont):

    '''
    Font whose byte strings are decoded to character ids by a CMap.

    The CMap named by 'Encoding' decodes the bytes; a second
    mapping (ucs2_cmap), when available, turns ids into text.
    '''

    def __init__(self, fontid, spec):
        '''
        Raises PDFFontError if 'BaseFont' is absent or if
        'CIDSystemInfo' lacks 'Registry' or 'Ordering'.
        '''
        if 'BaseFont' not in spec:
            raise PDFFontError('BaseFont is missing')
        try:
            self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
            self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
                                        self.cidsysteminfo['Ordering'])
        except KeyError:
            raise PDFFontError('CIDSystemInfo not properly defined.')
        self.basefont = literal_name(spec['BaseFont'])
        self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
        descriptor = dict_value(spec['FontDescriptor'])
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            fontdata = StringIO(self.fontfile.get_data())
            ttf = TrueTypeFont(self.basefont, fontdata)
        # Pick the id-to-text mapping: an explicit ToUnicode stream
        # first, then the embedded TrueType cmap for Adobe-Identity,
        # and otherwise the "<cidcoding>-UCS2" CMap.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            tounicode = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            parser = CMapParser(self.ucs2_cmap, StringIO(tounicode.get_data()))
            parser.parse()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)

        def get_width(seq):
            # Expands a flat width array into a {cid: value} table.
            table = {}
            first = last = None
            for item in seq:
                if first is None:
                    first = item
                    continue
                if last is None and isinstance(item, int):
                    last = item
                    continue
                if last is None:
                    # "first [v0 v1 ...]": values for consecutive ids.
                    for (offset, value) in enumerate(item):
                        table[first+offset] = value
                else:
                    # "first last v": one value for the whole range.
                    for cid in xrange(first, last+1):
                        table[cid] = item
                first = last = None
            return table

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            # NOTE(review): every W2 value is unpacked as a
            # (displacement, width) pair here -- confirm this against
            # the W2 layout described in the PDF reference.
            widths = {}
            self.disps = {}
            for (cid, (disp, width)) in get_width(list_value(spec.get('W2', []))).iteritems():
                widths[cid] = width
                self.disps[cid] = disp
            (self.default_disp, default_width) = spec.get('DW2', [880, -1000])
        else:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        PDFFont.__init__(self, fontid, descriptor, widths, default_width)
        return

    def is_vertical(self):
        '''Returns the writing mode reported by the encoding CMap.'''
        return self.vertical

    def decode(self, bytes):
        '''Decodes a byte string with the encoding CMap.'''
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        '''Returns the displacement of cid, or the default one.'''
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        '''
        Returns the text that cid stands for.
        Raises PDFUnicodeNotDefined(cidcoding, cid) when there is
        no mapping at all or none for this id.
        '''
        code = None
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        # The code is unpacked as a run of big-endian 16-bit units.
        count = len(code) // 2
        return ''.join([ unichr(c) for c in unpack('>%dH' % count, code) ])
## Resource Manager
##
class PDFResourceManager:

    '''
    ResourceManager facilitates reuse of shared resources
    such as fonts, images and cmaps so that large objects are not
    allocated multiple times.
    '''

    def __init__(self, debug=0):
        '''Starts with an empty font cache.'''
        self.fonts = {}
        self.debug = debug
        return

    def get_procset(self, procs):
        '''Walks the procedure set names; none of them has any effect.'''
        for proc in procs:
            if proc == LITERAL_PDF or proc == LITERAL_TEXT:
                continue
            # Any other procedure set is accepted silently as well
            # (no PDFResourceError is raised for it).
        return

    def get_cmap(self, name):
        '''Returns the CMap that CMapDB holds under name.'''
        return CMapDB.get_cmap(name)

    def get_font(self, fontid, spec):
        '''
        Returns the font object for fontid, creating and caching it
        from the font dictionary spec on first use.

        Raises PDFFontError if 'Subtype' is missing or not one of
        the handled subtypes.
        '''
        if fontid in self.fonts:
            return self.fonts[fontid]
        spec = dict_value(spec)
        assert spec['Type'] == LITERAL_FONT
        # Create a Font object.
        if 'Subtype' not in spec:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = literal_name(spec['Subtype'])
        if subtype == 'Type1' or subtype == 'MMType1':
            # Type1 Font
            font = PDFType1Font(fontid, spec)
        elif subtype == 'TrueType':
            # TrueType Font
            font = PDFTrueTypeFont(fontid, spec)
        elif subtype == 'Type3':
            # Type3 Font
            font = PDFType3Font(fontid, spec)
        elif subtype == 'CIDFontType0' or subtype == 'CIDFontType2':
            # CID Font
            font = PDFCIDFont(fontid, spec)
        elif subtype == 'Type0':
            # Type0 Font: build the single descendant font, letting
            # the parent's Encoding and ToUnicode entries override
            # the descendant's own.
            dfonts = list_value(spec['DescendantFonts'])
            assert len(dfonts) == 1
            subspec = dict_value(dfonts[0]).copy()
            for key in ('Encoding', 'ToUnicode'):
                if key in spec:
                    subspec[key] = resolve1(spec[key])
            font = self.get_font(fontid, subspec)
        else:
            raise PDFFontError('Invalid Font: %r' % spec)
        self.fonts[fontid] = font
        return font
## Interpreter
##
class PDFPageInterpreter:

    '''
    Runs the operators of a content stream.

    The interpreter tracks the current transformation matrix (CTM)
    and the text state, and passes every shown string to the
    device. Path, painting and colour operators are accepted but
    have no effect other than consuming their operands.

    Each operator "xx" is handled by the method do_xx; "*", '"'
    and "'" in an operator name are spelled _a, _w and _q.
    The number of operands popped for an operator is the number
    of parameters of its method.
    '''

    class TextState:

        '''Text parameters that the text operators modify.'''

        def __init__(self):
            self.font = None
            self.fontsize = 0
            self.charspace = 0
            self.wordspace = 0
            self.scaling = 100
            self.leading = 0
            self.render = 0
            self.rise = 0
            self.reset()
            return

        def __repr__(self):
            return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
                    ' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
                    ' render=%r, rise=%r>' %
                    (self.font, self.fontsize, self.matrix,
                     self.charspace, self.wordspace, self.scaling, self.leading,
                     self.render, self.rise))

        def copy(self):
            '''Returns an independent snapshot of this state.'''
            obj = self.__class__()
            obj.font = self.font
            obj.fontsize = self.fontsize
            obj.charspace = self.charspace
            obj.wordspace = self.wordspace
            obj.scaling = self.scaling
            obj.leading = self.leading
            obj.render = self.render
            obj.rise = self.rise
            obj.matrix = self.matrix
            obj.linematrix = self.linematrix
            return obj

        def reset(self):
            '''Puts the text matrix and the line offset back to their initial values.'''
            self.matrix = (1, 0, 0, 1, 0, 0)
            self.linematrix = (0, 0)
            return

    def __init__(self, rsrc, device, debug=0):
        '''
        rsrc:   resource manager that supplies the fonts.
        device: receiver of set_ctm / begin_block / end_block /
                render_string calls.
        debug:  messages go to stderr when this is 1 or more.
        '''
        self.rsrc = rsrc
        self.device = device
        self.debug = debug
        return

    def initpage(self, ctm):
        '''Clears all per-content state and installs ctm as the CTM.'''
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = {}
        # gstack: stack for graphical states.
        self.gstack = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFPageInterpreter.TextState()
        # argstack: stack for command arguments.
        self.argstack = []
        # set some global states.
        self.scs = None
        self.ncs = None
        return

    def push(self, obj):
        '''Pushes an operand.'''
        self.argstack.append(obj)
        return

    def pop(self, n):
        '''Removes and returns the last n operands, oldest first.'''
        # Without this guard n == 0 would slice with [-0:] / [:-0],
        # returning every operand and emptying the stack.
        if n <= 0:
            return []
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self):
        '''Returns a snapshot (ctm, textstate) for a later restore.'''
        # The text state is mutated in place by the text operators,
        # so a copy is saved; otherwise a restore would have no effect.
        return (self.ctm, self.textstate.copy())

    def set_current_state(self, state):
        '''Restores a snapshot made by get_current_state().'''
        (self.ctm, self.textstate) = state
        self.device.set_ctm(self.ctm)
        return

    # gsave
    def do_q(self):
        self.gstack.append(self.get_current_state())
        return
    # grestore
    def do_Q(self):
        # An unbalanced Q is ignored.
        if self.gstack:
            self.set_current_state(self.gstack.pop())
        return

    # concat-matrix
    def do_cm(self, a1, b1, c1, d1, e1, f1):
        self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
        self.device.set_ctm(self.ctm)
        return

    # setlinewidth
    def do_w(self, width): return
    # setlinecap
    def do_J(self, cap): return
    # setlinejoin
    def do_j(self, join): return
    # setmiterlimit
    def do_M(self, limit): return
    # setdash
    def do_d(self, dash, phase): return
    # setintent
    def do_ri(self, intent): return
    # setflatness
    def do_i(self, flatness): return
    # savedict
    def do_gs(self, name): return

    # moveto
    def do_m(self, x, y): return
    # lineto
    def do_l(self, x, y): return
    # curveto
    def do_c(self, x1, y1, x2, y2, x3, y3): return
    # curveto (first control point omitted)
    def do_v(self, x2, y2, x3, y3): return
    # curveto (second control point omitted)
    def do_y(self, x1, y1, x3, y3): return
    # closepath
    def do_h(self): return
    # rectangle
    def do_re(self, x, y, w, h): return

    # stroke
    def do_S(self): return
    # close-and-stroke
    def do_s(self): return
    # fill
    def do_f(self): return
    # fill (obsolete)
    do_F = do_f
    # fill-even-odd
    def do_f_a(self): return
    # fill-and-stroke
    def do_B(self): return
    # fill-and-stroke-even-odd
    def do_B_a(self): return
    # close-fill-and-stroke
    def do_b(self): return
    # close-fill-and-stroke-even-odd
    def do_b_a(self): return
    # close-only
    def do_n(self): return
    # clip
    def do_W(self): return
    # clip-even-odd
    def do_W_a(self): return

    # setcolorspace-stroking
    def do_CS(self, name):
        self.scs = self.csmap.get(literal_name(name), None)
        return
    # setcolorspace-non-stroking
    def do_cs(self, name):
        self.ncs = self.csmap.get(literal_name(name), None)
        return
    # setgray-stroking
    def do_G(self, gray):
        self.do_CS(LITERAL_DEVICE_GRAY)
        return
    # setgray-non-stroking
    def do_g(self, gray):
        self.do_cs(LITERAL_DEVICE_GRAY)
        return
    # setrgb-stroking
    def do_RG(self, r, g, b):
        self.do_CS(LITERAL_DEVICE_RGB)
        return
    # setrgb-non-stroking
    def do_rg(self, r, g, b):
        self.do_cs(LITERAL_DEVICE_RGB)
        return
    # setcmyk-stroking
    def do_K(self, c, m, y, k):
        self.do_CS(LITERAL_DEVICE_CMYK)
        return
    # setcmyk-non-stroking
    def do_k(self, c, m, y, k):
        self.do_cs(LITERAL_DEVICE_CMYK)
        return

    # setcolor
    # The operand count depends on the current colour space, so
    # these methods pop their operands themselves.
    def do_SCN(self):
        n = cs_params(self.scs)
        self.pop(n)
        return
    def do_scn(self):
        n = cs_params(self.ncs)
        self.pop(n)
        return
    def do_SC(self):
        self.do_SCN()
        return
    def do_sc(self):
        self.do_scn()
        return

    # shading
    def do_sh(self, name): return

    # begin-text
    def do_BT(self):
        self.textstate.reset()
        return
    # end-text
    def do_ET(self):
        return

    # begin-compat
    def do_BX(self): return
    # end-compat
    def do_EX(self): return

    # marked content operators
    def do_MP(self, tag): return
    def do_DP(self, tag, props): return
    def do_BMC(self, tag): return
    def do_BDC(self, tag, props): return
    def do_EMC(self): return

    # setcharspace
    def do_Tc(self, space):
        self.textstate.charspace = space
        return
    # setwordspace
    def do_Tw(self, space):
        self.textstate.wordspace = space
        return
    # textscale
    def do_Tz(self, scale):
        self.textstate.scaling = scale
        return
    # setleading
    def do_TL(self, leading):
        self.textstate.leading = leading
        return
    # selectfont
    def do_Tf(self, fontid, fontsize):
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            raise PDFInterpreterError('Undefined font id: %r' % fontid)
        self.textstate.fontsize = fontsize
        return
    # setrendering
    def do_Tr(self, render):
        self.textstate.render = render
        return
    # settextrise
    def do_Ts(self, rise):
        self.textstate.rise = rise
        return

    # text-move
    def do_Td(self, tx, ty):
        (a,b,c,d,e,f) = self.textstate.matrix
        self.textstate.matrix = (a,b,c,d,e+tx,f+ty)
        self.textstate.linematrix = (0, 0)
        return
    # text-move, also setting the leading to -ty
    def do_TD(self, tx, ty):
        (a,b,c,d,e,f) = self.textstate.matrix
        self.textstate.matrix = (a,b,c,d,e+tx,f+ty)
        self.textstate.leading = -ty
        self.textstate.linematrix = (0, 0)
        return
    # textmatrix
    def do_Tm(self, a,b,c,d,e,f):
        self.textstate.matrix = (a,b,c,d,e,f)
        self.textstate.linematrix = (0, 0)
        return
    # nextline
    def do_T_a(self):
        # The leading is stored as the negated y offset (see do_TD),
        # so the next line is reached by subtracting it.
        (a,b,c,d,e,f) = self.textstate.matrix
        self.textstate.matrix = (a,b,c,d,e,f-self.textstate.leading)
        self.textstate.linematrix = (0, 0)
        return

    # show-pos
    def do_TJ(self, seq):
        '''
        Shows a sequence of strings and position adjustments and
        advances the offset within the current line by its width.
        '''
        textstate = self.textstate
        font = textstate.font
        (a,b,c,d,e,f) = textstate.matrix
        (lx,ly) = textstate.linematrix
        s = ''.join( x for x in seq if isinstance(x, str) )
        n = sum( x for x in seq if not isinstance(x, str) )
        w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
             len(s) * textstate.charspace +
             s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
        self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
        if font.is_vertical():
            ly += w
        else:
            lx += w
        textstate.linematrix = (lx,ly)
        return
    # show
    def do_Tj(self, s):
        self.do_TJ([s])
        return
    # quote
    def do__q(self, s):
        self.do_T_a()
        self.do_TJ([s])
        return
    # doublequote
    def do__w(self, aw, ac, s):
        # Like the quote operator, this one moves to the next line
        # before showing, after setting both spacings.
        self.do_Tw(aw)
        self.do_Tc(ac)
        self.do__q(s)
        return

    # inline image
    def do_BI(self): # never called
        return
    def do_ID(self): # never called
        return
    def do_EI(self, obj):
        return

    # invoke an XObject
    def do_Do(self, xobjid):
        '''
        Runs the content of a form XObject with a nested interpreter.
        XObjects of any other subtype are ignored.
        Raises PDFInterpreterError for an unknown id.
        '''
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
        if xobj.dic.get('Subtype') == LITERAL_FORM:
            if 1 <= self.debug:
                print >>stderr, 'Processing xobj: %r' % xobj
            interpreter = PDFPageInterpreter(self.rsrc, self.device,
                                             debug=self.debug)
            # Matrix and Resources may be left out of a form dictionary.
            matrix = list_value(xobj.dic.get('Matrix', [1, 0, 0, 1, 0, 0]))
            resources = dict_value(xobj.dic.get('Resources', {}))
            try:
                # The form matrix is applied on top of the current CTM.
                interpreter.render_contents(xobjid, resources, [xobj],
                                            mult_matrix(matrix, self.ctm))
            finally:
                # The nested run changed the CTM of the shared device.
                self.device.set_ctm(self.ctm)
        return

    def process_page(self, page):
        '''Renders the contents of one page as the block "page-<pageid>".'''
        if 1 <= self.debug:
            print >>stderr, 'Processing page: %r' % page
        self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
        return

    def render_contents(self, contid, resources, contents, ctm=(1, 0, 0, 1, 0, 0)):
        '''
        Registers the declared resources, then executes every
        content stream between begin_block(contid) and end_block().
        '''
        self.initpage(ctm)
        self.device.begin_block(contid)
        # Handle resource declarations.
        for (k,v) in resources.iteritems():
            if 1 <= self.debug:
                print >>stderr, 'Resource: %r: %r' % (k,v)
            if k == 'Font':
                for (fontid,fontrsrc) in dict_value(v).iteritems():
                    self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
            elif k == 'ColorSpace':
                for (csid,csspec) in dict_value(v).iteritems():
                    self.csmap[csid] = list_value(csspec)
            elif k == 'ProcSet':
                self.rsrc.get_procset(list_value(v))
            elif k == 'XObject':
                for (xobjid,xobjstrm) in dict_value(v).iteritems():
                    self.xobjmap[xobjid] = xobjstrm
        for stream in contents:
            self.execute(stream_value(stream))
        self.device.end_block()
        return

    def execute(self, stream):
        '''
        Runs one content stream: operands are pushed, and each
        keyword is dispatched to its do_ method.
        Raises PDFInterpreterError for an unknown operator.
        '''
        for obj in stream.parse_data(inline=True, debug=self.debug):
            if isinstance(obj, PSKeyword):
                name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
                if hasattr(self, name):
                    func = getattr(self, name)
                    # Operand count = parameter count without self.
                    nargs = func.func_code.co_argcount-1
                    if nargs:
                        args = self.pop(nargs)
                        if 1 <= self.debug:
                            print >>stderr, 'exec: %s %r' % (obj.name, args)
                        # An operator with too few operands is skipped.
                        if len(args) == nargs:
                            func(*args)
                    else:
                        if 1 <= self.debug:
                            print >>stderr, 'exec: %s' % (obj.name)
                        func()
                else:
                    raise PDFInterpreterError('unknown operator: %r' % obj.name)
            else:
                self.push(obj)
        return
## PDFDevice
##
class PDFDevice:

    '''
    Base class of the output devices.
    It only remembers the transformation matrix; a subclass has
    to provide render_string().
    '''

    def __init__(self, rsrc):
        '''rsrc: resource manager, kept as self.rsrc.'''
        self.ctm = None
        self.rsrc = rsrc
        return

    def __repr__(self):
        label = '<PDFDevice>'
        return label

    def set_ctm(self, ctm):
        '''Remembers the current transformation matrix.'''
        self.ctm = ctm
        return

    def begin_block(self, name):
        '''Called when a named block starts; does nothing here.'''
        return

    def end_block(self):
        '''Called when a block ends; does nothing here.'''
        return

    def render_string(self, textstate, textmatrix, size, seq):
        '''Called for every shown string; must be overridden.'''
        raise NotImplementedError
## TextConverter
##
class TextConverter(PDFDevice):
def __init__(self, rsrc, codec, outfp=sys.stdout):
PDFDevice.__init__(self, rsrc)
self.outfp = outfp
self.codec = codec
return
def begin_block(self, name):
self.outfp.write('<block name="%s">\n' % name)
return
def end_block(self):
self.outfp.write('</block>\n')
return
def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font
spwidth = int(-font.char_width(32) * 0.6) # space width
buf = ''
for x in seq:
if isinstance(x, int) or isinstance(x, float):
if not font.is_vertical() and x <= spwidth:
buf += ' '
else:
chars = font.decode(x)
for cid in chars:
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = u'[%s:%d]' % (cidcoding, cid)
buf += char
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
skewed = (b != 0 or c != 0)
if font.is_vertical():
size = -size
tag = 'vtext'
else:
tag = 'htext'
if skewed:
tag += ' skewed'
s = buf.encode(self.codec, 'xmlcharrefreplace')
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
def f(x): return '%.03f' % x
self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
(tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
return
# main
def main(argv):
    '''
    Command line entry point.

    usage: prog [-d] [-v] [-c codec] [-p pages] file ...

    -d        raises the debug level (may be repeated).
    -v        raises the verbosity level (may be repeated).
    -c codec  output encoding; 'ascii' by default.
    -p page   zero-based index of a page to process (may be
              repeated); all pages are processed by default.

    Returns 100 after printing the usage line when the arguments
    are invalid, and None otherwise.
    '''
    import getopt
    def usage():
        print('usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    (debug, verbose) = (0, 0)
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-v': verbose += 1
        elif k == '-p':
            # A page index that is not a number is a usage error.
            try:
                pages.add(int(v))
            except ValueError:
                return usage()
        elif k == '-c': codec = v
    #
    CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
    rsrc = PDFResourceManager(debug=debug)
    device = TextConverter(rsrc, codec)
    for fname in args:
        doc = PDFDocument(debug=debug)
        # PDF is a binary format: open it without newline translation.
        fp = open(fname, 'rb')
        try:
            # Created for its effect on doc; not referenced afterwards.
            parser = PDFParser(doc, fp, debug=debug)
            interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
            for (i,page) in enumerate(doc.get_pages(debug=debug)):
                if pages and (i not in pages): continue
                interpreter.process_page(page)
        finally:
            # Close the file even if parsing or rendering fails.
            fp.close()
    return
# Run main() only when the file is executed as a script;
# its return value becomes the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))