split files.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@4 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2007-12-31 03:41:45 +00:00
parent 60d291d08b
commit 6d93b4a7f7
6 changed files with 1825 additions and 1755 deletions

383
cmap.py Normal file
View File

@ -0,0 +1,383 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from struct import pack, unpack
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, \
PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser
try:
import cdb
except ImportError:
import pycdb as cdb
## CMap
##
class CMap:
    '''
    In-memory character map.

    Holds two dictionaries: code2cid (byte string -> integer CID) and
    cid2code (integer CID -> byte string), plus the attributes
    collected while parsing (e.g. 'CMapName', 'WMode').
    '''

    def __init__(self, debug=0):
        self.debug = debug
        self.code2cid = {}
        self.cid2code = {}
        self.attrs = {}

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def update(self, code2cid=None, cid2code=None):
        '''Merges the given mappings (if any) into this map; returns self.'''
        for (table, extra) in ((self.code2cid, code2cid),
                               (self.cid2code, cid2code)):
            if extra:
                table.update(extra)
        return self

    def copycmap(self, cmap):
        '''Copies every entry of another cmap into this one; returns self.'''
        self.code2cid.update(cmap.getall_code2cid())
        self.cid2code.update(cmap.getall_cid2code())
        return self

    def register_code2cid(self, code, cid):
        '''Registers one code -> CID entry; returns self.'''
        assert isinstance(code, str)
        assert isinstance(cid, int)
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        '''
        Registers one CID -> code entry; returns self.
        A PSLiteral code is taken as a glyph name and converted to its
        2-byte big-endian Unicode value.
        '''
        # Imported on each call, exactly as the table is needed here only.
        from glyphlist import charname2unicode
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            code = pack('>H', charname2unicode[code.name])
        self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        '''
        Generates the CIDs for a byte string.
        A byte with no single-byte entry is held back and tried again
        together with the following byte; if the pair is unknown too,
        both bytes are dropped.
        '''
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        pending = ''
        for c in bytes:
            if pending:
                pair = pending+c
                if pair in self.code2cid:
                    yield self.code2cid[pair]
                pending = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            else:
                pending = c
        return

    def is_vertical(self):
        '''True if the WMode attribute equals the string "1".'''
        wmode = self.attrs.get('WMode', '0')
        return wmode == '1'

    def tocid(self, code):
        '''Returns the CID for a code, or None.'''
        return self.code2cid.get(code)

    def tocode(self, cid):
        '''Returns the code for a CID, or None.'''
        return self.cid2code.get(cid)

    def getall_attrs(self):
        return self.attrs.iteritems()

    def getall_code2cid(self):
        return self.code2cid.iteritems()

    def getall_cid2code(self):
        return self.cid2code.iteritems()
## CDBCMap
##
class CDBCMap(CMap):
    '''
    CMap backed by a cdb (constant database) file.

    Keys visible in this class:
      'c'+code            -> CID packed as '>L'
      'i'+pack('>L', cid) -> code (byte string)
      '/'+attribute name  -> attribute value in repr form
    Entries fetched by decode() are cached in self.code2cid.
    '''

    def __init__(self, cdbname, debug=0):
        CMap.__init__(self, debug=debug)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        '''Returns the integer CID for a code, or None.'''
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() returns a 1-tuple; return the integer itself so the
        # result has the same type as CMap.tocid().
        (cid,) = unpack('>L', self.db[k])
        return cid

    def tocode(self, cid):
        '''Returns the code (byte string) for a CID, or None.'''
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def _each(self, prefix):
        '''Generates (key without prefix, raw value) for matching records.'''
        # NOTE(review): db.each() looks like a single shared cursor, so
        # two of these generators should not be interleaved -- confirm
        # against the cdb module documentation.
        while 1:
            x = self.db.each()
            if not x: break
            (k,v) = x
            if k.startswith(prefix):
                yield (k[1:], v)
        return

    def getall(self, c):
        '''
        Generates (key, number) pairs for every record whose key starts
        with c, the value being decoded as a '>L' integer.
        '''
        for (k,v) in self._each(c):
            yield (k, unpack('>L', v)[0])
        return

    def getall_attrs(self):
        '''Generates (name, value) pairs for the '/' attribute records.'''
        for (k,v) in self._each('/'):
            # NOTE(review): eval() runs whatever expression the database
            # holds; only safe while the .cdb files are trusted.
            yield (k, eval(v)[0])
        return

    def getall_cid2code(self):
        '''Generates (cid, code) pairs, as CMap.getall_cid2code() does.'''
        # For 'i' records the packed integer is the KEY and the value is
        # the code string (see tocode), so getall() cannot be reused.
        for (k,v) in self._each('i'):
            yield (unpack('>L', k)[0], v)
        return

    def getall_code2cid(self):
        '''Generates (code, cid) pairs.'''
        return self.getall('c')

    def decode(self, bytes):
        '''
        Generates the CIDs for a byte string, consulting the cache first
        and the database second. Unknown single bytes are retried
        together with the following byte.
        '''
        if self.debug:
            print >>stderr, 'decode: %r, %r' % (self, bytes)
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return
## CMapDB
##
class CMapDB:
    '''
    Process-wide registry of CMaps, loaded on demand and cached by name.
    Call initialize() before the first get_cmap().
    '''

    # Optional name -> name substitutions applied before lookup.
    CMAP_ALIAS = {
    }

    debug = 0
    dirname = None
    cdbdirname = None
    # Cache: cmap name -> CMap / CDBCMap instance.
    cmapdb = {}

    @classmethod
    def initialize(klass, dirname, cdbdirname=None, debug=0):
        '''
        Sets the directory of plain CMap files and the directory of
        compiled ".cmap.cdb" files (defaults to dirname).
        '''
        klass.dirname = dirname
        klass.cdbdirname = cdbdirname or dirname
        klass.debug = debug
        return

    @classmethod
    def get_cmap(klass, cmapname):
        '''
        Returns the CMap called cmapname, loading it on first use.
        A compiled cdb file is preferred over the plain CMap file.
        Raises KeyError if neither file exists.
        '''
        import os.path
        cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in klass.cmapdb:
            return klass.cmapdb[cmapname]
        fname = os.path.join(klass.dirname, cmapname)
        cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
        if os.path.exists(cdbname):
            if 1 <= klass.debug:
                print >>stderr, 'Opening: CDBCMap %r...' % cdbname
            cmap = CDBCMap(cdbname)
        elif os.path.exists(fname):
            if 1 <= klass.debug:
                print >>stderr, 'Reading: CMap %r...' % fname
            cmap = CMap()
            fp = file(fname)
            try:
                CMapParser(cmap, fp).parse()
            finally:
                # Release the handle even when parsing fails.
                fp.close()
        else:
            # Previously this fell through to an unbound local variable.
            raise KeyError(cmapname)
        klass.cmapdb[cmapname] = cmap
        return cmap
## CMapParser
##
class CMapParser(PSStackParser):
    '''
    Stack parser that fills a CMap object from CMap source text.
    Only operators between "begincmap" and "endcmap" are interpreted.
    '''

    # Operators that merely open a section: the operand stack is cleared.
    SECTION_OPENERS = ('begincodespacerange', 'begincidrange',
                       'begincidchar', 'beginbfrange',
                       'beginbfchar', 'beginnotdefrange')

    def __init__(self, cmap, fp, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.cmap = cmap
        self.in_cmap = False

    def do_token(self, _, token):
        '''Handles one keyword token; operands are taken from self.partobj.'''
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
            return
        if name == 'endcmap':
            self.in_cmap = False
            return
        if not self.in_cmap:
            return

        if name in self.SECTION_OPENERS:
            self.popall()

        elif name == 'def':
            # "key value def" -> attribute of the cmap.
            try:
                (k,v) = self.pop(2)
                self.cmap.attrs[literal_name(k)] = v
            except PSSyntaxError:
                pass

        elif name == 'usecmap':
            # "/Name usecmap" -> start from a copy of another cmap.
            try:
                (cmapname,) = self.pop(1)
                self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
            except PSSyntaxError:
                pass

        elif name == 'endcodespacerange':
            if 1 <= self.debug:
                print >>stderr, 'codespace: %r' % self.partobj
            self.popall()

        elif name == 'endcidrange':
            # Triples (first code, last code, first cid). Only the last
            # four bytes of a code vary; anything before is a fixed prefix.
            for (start,end,cid) in choplist(3, self.partobj):
                assert isinstance(start, str)
                assert isinstance(end, str)
                assert isinstance(cid, int)
                assert len(start) == len(end)
                (sprefix, svar) = (start[:-4], start[-4:])
                (eprefix, evar) = (end[:-4], end[-4:])
                assert sprefix == eprefix
                first = nunpack(svar)
                last = nunpack(evar)
                vlen = len(svar)
                assert first <= last
                for i in xrange(last-first+1):
                    code = sprefix+pack('>L',first+i)[-vlen:]
                    self.cmap.register_code2cid(code, cid+i)
            self.popall()

        elif name == 'endcidchar':
            # Pairs of strings; the second is registered as the code and
            # the first, decoded as a number, as its cid.
            for (cid,code) in choplist(2, self.partobj):
                assert isinstance(code, str)
                assert isinstance(cid, str)
                self.cmap.register_code2cid(code, nunpack(cid))
            self.popall()

        elif name == 'endbfrange':
            # Triples (first cid, last cid, destination). The destination
            # is either a list with one code per cid, or a single code
            # whose last four bytes are incremented.
            for (s,e,code) in choplist(3, self.partobj):
                assert isinstance(s, str)
                assert isinstance(e, str)
                assert len(s) == len(e)
                first = nunpack(s)
                last = nunpack(e)
                assert first <= last
                if isinstance(code, list):
                    for cid in xrange(first, last+1):
                        self.cmap.register_cid2code(cid, code[cid-first])
                else:
                    (prefix, var) = (code[:-4], code[-4:])
                    base = nunpack(var)
                    vlen = len(var)
                    for i in xrange(last-first+1):
                        dest = prefix+pack('>L',base+i)[-vlen:]
                        self.cmap.register_cid2code(first+i, dest)
            self.popall()

        elif name == 'endbfchar':
            for (cid,code) in choplist(2, self.partobj):
                assert isinstance(cid, str)
                assert isinstance(code, str)
                self.cmap.register_cid2code(nunpack(cid), code)
            self.popall()

        elif name == 'endnotdefrange':
            if 1 <= self.debug:
                print >>stderr, 'notdefrange: %r' % self.partobj
            self.popall()

        return
## FontMetricsDB
##
class FontMetricsDB:
    '''Access to the built-in font metrics table of the fontmetrics module.'''

    from fontmetrics import FONT_METRICS

    @classmethod
    def get_metrics(cls, fontname):
        '''
        Returns the FONT_METRICS entry for fontname. The lookup is a
        plain index operation, so a missing name raises whatever
        FONT_METRICS raises (callers in this code base catch KeyError).
        '''
        table = cls.FONT_METRICS
        return table[fontname]
## EncodingDB
##
class EncodingDB:
    '''
    Character-code -> unicode tables for the four named encodings,
    built once (at class creation) from latin_enc.ENCODING.
    '''

    from glyphlist import charname2unicode
    from latin_enc import ENCODING

    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    # Each ENCODING row: glyph name and its code in the four encodings
    # (a false value means "not present in that encoding").
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(charname2unicode[name])
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        '''
        Returns the code -> unicode table for the named encoding
        (StandardEncoding when the name is unknown).

        diff, if given, is a sequence of integers and PSLiterals: an
        integer sets the current code, each literal is a glyph name
        assigned to the current code, which is then advanced by one.
        The shared table is copied before being modified.
        '''
        base = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return base
        cid2unicode = base.copy()
        cid = 0
        for x in diff:
            if isinstance(x, int):
                cid = x
                continue
            if not isinstance(x, PSLiteral):
                continue
            try:
                cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name])
            except KeyError:
                # unknown glyph name: leave the slot as it was.
                pass
            cid += 1
        return cid2unicode

111
pdf2txt.py Executable file
View File

@ -0,0 +1,111 @@
#!/usr/bin/env python
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser
from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix
from cmap import CMapDB
## TextConverter
##
class TextConverter(PDFDevice):
def __init__(self, outfp, rsrc, codec):
PDFDevice.__init__(self, rsrc)
self.outfp = outfp
self.codec = codec
return
def close(self):
self.outfp.write('\n')
return
def begin_block(self, name):
self.outfp.write('<block name="%s">\n' % name)
return
def end_block(self):
self.outfp.write('</block>\n')
return
def render_string(self, textstate, textmatrix, size, seq):
font = textstate.font
spwidth = int(-font.char_width(32) * 0.6) # space width
buf = ''
for x in seq:
if isinstance(x, int) or isinstance(x, float):
if not font.is_vertical() and x <= spwidth:
buf += ' '
else:
chars = font.decode(x)
for cid in chars:
try:
char = font.to_unicode(cid)
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
char = u'[%s:%d]' % (cidcoding, cid)
buf += char
(a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
skewed = (b != 0 or c != 0)
if font.is_vertical():
size = -size
tag = 'vtext'
else:
tag = 'htext'
if skewed:
tag += ' skewed'
s = buf.encode(self.codec, 'xmlcharrefreplace')
(w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
def f(x): return '%.03f' % x
self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
(tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
return
# pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
    '''
    Converts one PDF file to text markup on outfp.

    pages is a collection of 0-based page indexes to process; when it
    is empty every page is processed.
    '''
    device = TextConverter(outfp, rsrc, codec)
    doc = PDFDocument(debug=debug)
    # PDF is a binary format: never let the platform translate newlines.
    fp = file(fname, 'rb')
    try:
        # The parser object is not used directly below.
        # NOTE(review): presumably constructing it attaches the file to
        # doc -- confirm in pdfparser.
        parser = PDFParser(doc, fp, debug=debug)
        interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
        for (i,page) in enumerate(doc.get_pages(debug=debug)):
            if pages and (i not in pages): continue
            interpreter.process_page(page)
    finally:
        # Close the input even if a page fails to process.
        fp.close()
    device.close()
    return
# main
def main(argv):
    '''
    Command line entry point.

    Options: -d (more debug output, repeatable), -c codec, -p page
    (0-based, repeatable), -o output file. -v is accepted and ignored.
    Returns 100 on a usage error, None otherwise.
    '''
    import getopt
    def usage():
        print('usage: %s [-d] [-v] [-c codec] [-p pages] [-o output] file ...' % argv[0])
        return 100
    try:
        # 'o:' was missing, so the -o branch below could never run.
        (opts, args) = getopt.getopt(argv[1:], 'dvp:c:o:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    outfp = stdout
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pages.add(int(v))
        elif k == '-o': outfp = file(v, 'wb')
        elif k == '-c': codec = v
    #
    CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
    rsrc = PDFResourceManager(debug=debug)
    try:
        for fname in args:
            pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug)
    finally:
        # Only close what was opened here; never close stdout.
        if outfp is not stdout:
            outfp.close()
    return
if __name__ == '__main__': sys.exit(main(sys.argv))

827
pdfinterp.py Normal file
View File

@ -0,0 +1,827 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from struct import pack, unpack
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from psparser import PSException, PSSyntaxError, PSTypeError, \
PSStackParser, PSLiteral, PSKeyword, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
from pdfparser import resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value, PDFException
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
## Exceptions
##
# Not raised anywhere in the visible code (only mentioned in a
# commented-out line of PDFResourceManager.get_procset).
class PDFResourceError(PDFException): pass
# Raised for an undefined font id, an undefined xobject id or an
# unknown operator while executing a content stream.
class PDFInterpreterError(PDFException): pass
# Raised when a font dictionary lacks a required entry or has an
# unsupported subtype.
class PDFFontError(PDFException): pass
# Raised by to_unicode() with args (cidcoding or None, cid) when a
# character has no unicode mapping.
class PDFUnicodeNotDefined(PDFFontError): pass
## Constants
##
# Interned literals, compared against values found in PDF dictionaries.
LITERAL_PDF = PSLiteralTable.intern('PDF')
LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')
# Matrices are 6-tuples (a, b, c, d, e, f); see mult_matrix/apply_matrix.
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
# Value returned by cs_params() for each colour space family
# (ICCBased and DeviceN are computed there instead).
CS_COMPONENTS = {
PSLiteralTable.intern('CalRGB'): 3,
PSLiteralTable.intern('CalGray'): 1,
PSLiteralTable.intern('Lab'): 3,
PSLiteralTable.intern('DeviceRGB'): 3,
PSLiteralTable.intern('DeviceCMYK'): 4,
PSLiteralTable.intern('DeviceGray'): 1,
PSLiteralTable.intern('Separation'): 1,
PSLiteralTable.intern('Indexed'): 1,
PSLiteralTable.intern('Pattern'): 1,
}
## Matrix operations
##
def mult_matrix(m1, m0):
    '''
    Multiplies two matrices.

    Both arguments are 6-element sequences (a, b, c, d, e, f). The
    result is the 6-tuple that applies m1 first and m0 second.
    '''
    # Explicit unpacking instead of tuple parameters, which were removed
    # from the language by PEP 3113; positional callers are unaffected.
    (a1,b1,c1,d1,e1,f1) = m1
    (a0,b0,c0,d0,e0,f0) = m0
    return (a0*a1+c0*b1, b0*a1+d0*b1,
            a0*c1+c0*d1, b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
def apply_matrix(m, v):
    '''
    Applies a matrix to a coordinate.

    m is a 6-element sequence (a, b, c, d, e, f) and v a pair (x, y);
    returns the transformed pair (a*x+c*y+e, b*x+d*y+f).
    '''
    # Explicit unpacking instead of tuple parameters (PEP 3113).
    (a,b,c,d,e,f) = m
    (x,y) = v
    return (a*x+c*y+e, b*x+d*y+f)
def cs_params(cs):
    '''
    Returns the operand count for a colour space specification (a
    sequence whose first item is the family literal): the N entry of
    the stream for ICCBased, the length of the name list for DeviceN,
    and the CS_COMPONENTS entry otherwise (KeyError if unlisted).
    '''
    family = cs[0]
    if family == LITERAL_ICC_BASED:
        count = stream_value(cs[1]).dic['N']
    elif family == LITERAL_DEVICE_N:
        count = len(list_value(cs[1]))
    else:
        count = CS_COMPONENTS[family]
    return count
## Fonts
##
# PDFFont
class PDFFont:
    '''
    Base class of all fonts: keeps the descriptor values and the width
    table, and treats every byte of a string as one character id.
    '''

    def __init__(self, fontid, descriptor, widths, default_width=None):
        '''
        descriptor must provide FontName, Ascent, Descent and FontBBox;
        MissingWidth and Leading are optional. widths maps character
        ids to widths. A false default_width falls back to MissingWidth.
        '''
        self.fontid = fontid
        self.descriptor = descriptor
        self.widths = widths
        fontname = descriptor['FontName']
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname
        self.ascent = descriptor['Ascent']
        self.descent = descriptor['Descent']
        self.default_width = default_width or descriptor.get('MissingWidth', 0)
        self.leading = descriptor.get('Leading', 0)
        self.bbox = descriptor['FontBBox']

    def __repr__(self):
        return '<PDFFont: fontid=%r>' % (self.fontid,)

    def is_vertical(self):
        '''Base fonts are always horizontal.'''
        return False

    def decode(self, bytes):
        '''Returns the character ids of a string: one per byte.'''
        return [ ord(c) for c in bytes ]

    def char_width(self, cid):
        '''Width of one character, or the default width if unlisted.'''
        return self.widths.get(cid, self.default_width)

    def char_disp(self, cid):
        '''Displacement of one character; always 0 here.'''
        return 0

    def string_width(self, s):
        '''Sum of the widths of all characters of s.'''
        total = 0
        for cid in self.decode(s):
            total += self.char_width(cid)
        return total
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
    '''
    Single-byte font. Unicode conversion uses the ToUnicode stream when
    the font has one, and the (possibly modified) named encoding
    otherwise.
    '''

    def __init__(self, fontid, descriptor, widths, spec):
        # The Encoding entry is either the name of a built-in encoding
        # or a dictionary describing differences from a base encoding.
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        if not isinstance(encoding, dict):
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base = encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)
            differences = encoding.get('Differences', None)
            self.encoding = EncodingDB.get_encoding(literal_name(base),
                                                    differences)
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
        PDFFont.__init__(self, fontid, descriptor, widths)

    def to_unicode(self, cid):
        '''
        Returns the unicode text of one character id.
        Raises PDFUnicodeNotDefined(None, cid) when there is no mapping.
        '''
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
            if not code:
                raise PDFUnicodeNotDefined(None, cid)
            # The code is a sequence of big-endian 16-bit values.
            chars = unpack('>%dH' % (len(code)/2), code)
            return ''.join( unichr(c) for c in chars )
        try:
            return self.encoding[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
def __init__(self, fontid, spec):
if 'BaseFont' not in spec:
raise PDFFontError('BaseFont is missing')
self.basefont = literal_name(spec['BaseFont'])
try:
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
except KeyError:
try:
descriptor = dict_value(spec['FontDescriptor'])
firstchar = int_value(spec['FirstChar'])
lastchar = int_value(spec['LastChar'])
widths = dict( (i+firstchar,w) for (i,w)
in enumerate(list_value(spec['Widths'])) )
except KeyError, k:
raise PDFFontError('%s is missing' % k)
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
return
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
    '''TrueType font; handled exactly like a Type 1 font.'''
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
def __init__(self, fontid, spec):
try:
firstchar = int_value(spec['FirstChar'])
lastchar = int_value(spec['LastChar'])
widths = dict( (i+firstchar,w) for (i,w)
in enumerate(list_value(spec['Widths'])) )
except KeyError, k:
raise PDFFontError('%s is missing' % k)
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
return
# PDFCIDFont
## TrueTypeFont
##
class TrueTypeFont:
    '''
    Minimal reader for an embedded TrueType font: reads the table
    directory and can turn the 'cmap' table into a CMap object.
    '''

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        '''
        name is used to build the name of the generated cmap; fp is a
        seekable file object positioned at the start of the font data.
        '''
        self.name = name
        self.fp = fp
        # table tag -> (offset, length)
        self.tables = {}
        fonttype = fp.read(4)
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for i in xrange(ntables):
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        '''
        Builds a CMap from the font's 'cmap' table: character code ->
        glyph id in code2cid, glyph id -> 2-byte big-endian character
        code in cid2code. Raises TrueTypeFont.CMapNotFound if the font
        has no 'cmap' table.
        '''
        if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for i in xrange(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in xrange(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in xrange(entcount):
                        # unpack() returns a 1-tuple; the old code kept
                        # the tuple and then failed on "gid += delta".
                        (gid,) = unpack('>H', fp.read(2))
                        if gid:
                            # 0 means "missing glyph" and is left alone.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (i,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the position of its
                        # OWN entry (pos + 2*i), not from the start of
                        # the idRangeOffset array.
                        fp.seek(pos+2*i+idr)
                        for c in xrange(sc, ec+1):
                            (gid,) = unpack('>H', fp.read(2))
                            if gid:
                                # idDelta applies to real glyphs only.
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.iteritems() )
        # CMap() takes a debug level, not a name: record the name as an
        # attribute instead of passing it to the constructor.
        cmap = CMap()
        cmap.attrs['CMapName'] = 'Adobe-Identity-UCS-%s' % self.name
        return cmap.update(char2gid, gid2char)
class PDFCIDFont(PDFFont):
    '''
    CID-keyed font: strings are decoded through the CMap named by the
    Encoding entry, and CIDs are converted to unicode through a second
    map (self.ucs2_cmap), which may be missing.
    '''

    def __init__(self, fontid, spec):
        if 'BaseFont' not in spec:
            raise PDFFontError('BaseFont is missing')
        try:
            self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
            self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
                                        self.cidsysteminfo['Ordering'])
        except KeyError:
            raise PDFFontError('CIDSystemInfo not properly defined.')
        self.basefont = literal_name(spec['BaseFont'])
        self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
        descriptor = dict_value(spec['FontDescriptor'])
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        # CID -> unicode map, in order of preference: the ToUnicode
        # stream; for Adobe-Identity the cmap table of the embedded
        # TrueType font (if any); otherwise the "<coding>-UCS2" CMap.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)

        def get_width(seq):
            # Flattens a W/W2 array into {cid: value}. Two layouts are
            # understood: "first [v0 v1 ...]" and "first last v".
            table = {}
            first = last = None
            for v in seq:
                if first is None:
                    first = v
                    continue
                if last is None and isinstance(v, int):
                    last = v
                    continue
                if last is None:
                    for (i,w) in enumerate(v):
                        table[first+i] = w
                else:
                    for cid in xrange(first, last+1):
                        table[cid] = v
                first = last = None
            return table

        self.vertical = self.cmap.is_vertical()
        if not self.vertical:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        else:
            # writing mode: vertical; each entry is a (disp, width) pair.
            pairs = get_width(list_value(spec.get('W2', [])))
            widths = dict( (cid,w) for (cid,(d,w)) in pairs.iteritems() )
            self.disps = dict( (cid,d) for (cid,(d,w)) in pairs.iteritems() )
            (d,w) = spec.get('DW2', [880, -1000])
            default_width = w
            self.default_disp = d
        PDFFont.__init__(self, fontid, descriptor, widths, default_width)

    def is_vertical(self):
        return self.vertical

    def decode(self, bytes):
        '''Returns the CIDs of a string, as produced by the font's CMap.'''
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        '''
        Returns the unicode text of one CID.
        Raises PDFUnicodeNotDefined(cidcoding, cid) when there is no
        unicode map or the CID is not in it.
        '''
        code = None
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        # The code is a sequence of big-endian 16-bit values.
        chars = unpack('>%dH' % (len(code)/2), code)
        return ''.join( unichr(c) for c in chars )
## Resource Manager
##
class PDFResourceManager:
    '''
    ResourceManager facilitates reuse of shared resources
    such as fonts, images and cmaps so that large objects are not
    allocated multiple times.
    '''

    def __init__(self, debug=0):
        self.debug = debug
        # font id -> font object
        self.fonts = {}

    def get_procset(self, procs):
        '''Examines a ProcSet list; nothing is done with any entry.'''
        for proc in procs:
            if proc == LITERAL_PDF:
                continue
            if proc == LITERAL_TEXT:
                continue
            #raise PDFResourceError('ProcSet %r is not supported.' % proc)
        return

    def get_cmap(self, name):
        return CMapDB.get_cmap(name)

    def get_font(self, fontid, spec):
        '''
        Returns the font object registered under fontid, creating it
        from the font dictionary spec on first use. Raises PDFFontError
        for a missing or unsupported Subtype.
        '''
        if fontid in self.fonts:
            return self.fonts[fontid]
        spec = dict_value(spec)
        assert spec['Type'] == LITERAL_FONT
        # Create a Font object.
        if 'Subtype' not in spec:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = literal_name(spec['Subtype'])
        if subtype in ('Type1', 'MMType1'):
            font = PDFType1Font(fontid, spec)
        elif subtype == 'TrueType':
            font = PDFTrueTypeFont(fontid, spec)
        elif subtype == 'Type3':
            font = PDFType3Font(fontid, spec)
        elif subtype in ('CIDFontType0', 'CIDFontType2'):
            font = PDFCIDFont(fontid, spec)
        elif subtype == 'Type0':
            # Composite font: build its single descendant, letting the
            # parent's Encoding and ToUnicode entries take precedence.
            dfonts = list_value(spec['DescendantFonts'])
            assert len(dfonts) == 1
            subspec = dict_value(dfonts[0]).copy()
            for k in ('Encoding', 'ToUnicode'):
                if k in spec:
                    subspec[k] = resolve1(spec[k])
            font = self.get_font(fontid, subspec)
        else:
            raise PDFFontError('Invalid Font: %r' % spec)
        self.fonts[fontid] = font
        return font
## PDFDevice
##
class PDFDevice:
    '''
    Base class of output devices. The interpreter reports the current
    transformation matrix, block boundaries and strings to it;
    subclasses must implement render_string().
    '''

    def __init__(self, rsrc):
        self.rsrc = rsrc
        self.ctm = None

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        '''Called once when output is finished; does nothing here.'''
        return None

    def set_ctm(self, ctm):
        '''Records the current transformation matrix.'''
        self.ctm = ctm

    def begin_block(self, name):
        '''Called before a page or form is rendered; does nothing here.'''
        return None

    def end_block(self):
        '''Called after a page or form is rendered; does nothing here.'''
        return None

    def render_string(self, textstate, textmatrix, size, seq):
        '''Must be overridden: outputs one sequence of text.'''
        raise NotImplementedError
## Interpreter
##
class PDFPageInterpreter:
    '''
    Executes the operators of a content stream and reports text to a
    device.

    Every operator is a method called do_<name>, where '*', '"' and "'"
    in the operator name are spelled '_a', '_w' and '_q'. The number of
    declared parameters of the method is the number of operands taken
    from the argument stack (see execute).
    '''

    class TextState:
        '''Text parameters plus the text matrix and the line position.'''

        def __init__(self):
            self.font = None
            self.fontsize = 0
            self.charspace = 0
            self.wordspace = 0
            self.scaling = 100
            self.leading = 0
            self.render = 0
            self.rise = 0
            self.reset()
            return

        def __repr__(self):
            return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
                    ' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
                    ' render=%r, rise=%r>' %
                    (self.font, self.fontsize, self.matrix,
                     self.charspace, self.wordspace, self.scaling, self.leading,
                     self.render, self.rise))

        def reset(self):
            # matrix: text matrix; linematrix: (x, y) advance within
            # the current line, added to the matrix when rendering.
            self.matrix = MATRIX_IDENTITY
            self.linematrix = (0, 0)
            return

    def __init__(self, rsrc, device, debug=0):
        self.rsrc = rsrc
        self.device = device
        self.debug = debug
        return

    def initpage(self, ctm):
        '''Resets all per-page state and installs ctm on the device.'''
        self.fontmap = {}
        self.xobjmap = {}
        self.csmap = {}
        # gstack: stack for graphical states.
        self.gstack = []
        self.ctm = ctm
        self.device.set_ctm(self.ctm)
        self.textstate = PDFPageInterpreter.TextState()
        # argstack: stack for command arguments.
        self.argstack = []
        # set some global states.
        self.scs = None
        self.ncs = None
        return

    def push(self, obj):
        self.argstack.append(obj)
        return

    def pop(self, n):
        '''Removes and returns the last n operands (n must not be 0).'''
        x = self.argstack[-n:]
        self.argstack = self.argstack[:-n]
        return x

    def get_current_state(self):
        # NOTE(review): the TextState object itself is saved, not a
        # copy, so later changes to it are visible after a restore.
        return (self.ctm, self.textstate)

    def set_current_state(self, state):
        (self.ctm, self.textstate) = state
        self.device.set_ctm(self.ctm)
        return

    # gsave
    def do_q(self):
        self.gstack.append(self.get_current_state())
        return
    # grestore
    def do_Q(self):
        if self.gstack:
            self.set_current_state(self.gstack.pop())
        return

    # concat-matrix
    def do_cm(self, a1, b1, c1, d1, e1, f1):
        self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
        self.device.set_ctm(self.ctm)
        return

    # Graphics state and path operators: the operands are consumed and
    # otherwise ignored.
    # setlinewidth
    def do_w(self, width): return
    # setlinecap
    def do_J(self, cap): return
    # setlinejoin
    def do_j(self, join): return
    # setmiterlimit
    def do_M(self, limit): return
    # setdash
    def do_d(self, dash, phase): return
    # setintent
    def do_ri(self, intent): return
    # setflatness
    def do_i(self, flatness): return
    # savedict
    def do_gs(self, name): return

    # moveto
    def do_m(self, x, y): return
    # lineto
    def do_l(self, x, y): return
    # curveto
    def do_c(self, x1, y1, x2, y2, x3, y3): return
    # curveto (first control point omitted)
    def do_v(self, x2, y2, x3, y3): return
    # curveto (second control point omitted)
    def do_y(self, x1, y1, x3, y3): return
    # closepath
    def do_h(self): return
    # rectangle
    def do_re(self, x, y, w, h): return

    # stroke
    def do_S(self): return
    # close-and-stroke
    def do_s(self): return
    # fill
    def do_f(self): return
    # fill (obsolete)
    do_F = do_f
    # fill-even-odd
    def do_f_a(self): return
    # fill-and-stroke
    def do_B(self): return
    # fill-and-stroke-even-odd
    def do_B_a(self): return
    # close-fill-and-stroke
    def do_b(self): return
    # close-fill-and-stroke-even-odd
    def do_b_a(self): return
    # close-only
    def do_n(self): return
    # clip
    def do_W(self): return
    # clip-even-odd
    def do_W_a(self): return

    # setcolorspace-stroking
    def do_CS(self, name):
        self.scs = self.csmap.get(literal_name(name), None)
        return
    # setcolorspace-non-stroking
    def do_cs(self, name):
        self.ncs = self.csmap.get(literal_name(name), None)
        return
    # setgray-stroking
    def do_G(self, gray):
        self.do_CS(LITERAL_DEVICE_GRAY)
        return
    # setgray-non-stroking
    def do_g(self, gray):
        self.do_cs(LITERAL_DEVICE_GRAY)
        return
    # setrgb-stroking
    def do_RG(self, r, g, b):
        self.do_CS(LITERAL_DEVICE_RGB)
        return
    # setrgb-non-stroking
    def do_rg(self, r, g, b):
        self.do_cs(LITERAL_DEVICE_RGB)
        return
    # setcmyk-stroking
    def do_K(self, c, m, y, k):
        self.do_CS(LITERAL_DEVICE_CMYK)
        return
    # setcmyk-non-stroking
    def do_k(self, c, m, y, k):
        self.do_cs(LITERAL_DEVICE_CMYK)
        return

    # setcolor: the operand count depends on the current colour space,
    # so these declare no parameters and pop the operands themselves.
    def do_SCN(self):
        n = cs_params(self.scs)
        self.pop(n)
        return
    def do_scn(self):
        n = cs_params(self.ncs)
        self.pop(n)
        return
    def do_SC(self):
        self.do_SCN()
        return
    def do_sc(self):
        self.do_scn()
        return

    # sharing-name
    def do_sh(self, name): return

    # begin-text
    def do_BT(self):
        self.textstate.reset()
        return
    # end-text
    def do_ET(self):
        return

    # begin-compat
    def do_BX(self): return
    # end-compat
    def do_EX(self): return

    # marked content operators
    def do_MP(self, tag): return
    def do_DP(self, tag, props): return
    def do_BMC(self, tag): return
    def do_BDC(self, tag, props): return
    def do_EMC(self): return

    # setcharspace
    def do_Tc(self, space):
        self.textstate.charspace = space
        return
    # setwordspace
    def do_Tw(self, space):
        self.textstate.wordspace = space
        return
    # textscale
    def do_Tz(self, scale):
        self.textstate.scaling = scale
        return
    # setleading
    def do_TL(self, leading):
        self.textstate.leading = leading
        return
    # selectfont
    def do_Tf(self, fontid, fontsize):
        try:
            self.textstate.font = self.fontmap[literal_name(fontid)]
        except KeyError:
            raise PDFInterpreterError('Undefined font id: %r' % fontid)
        self.textstate.fontsize = fontsize
        return
    # setrendering
    def do_Tr(self, render):
        self.textstate.render = render
        return
    # settextrise
    def do_Ts(self, rise):
        self.textstate.rise = rise
        return

    # text-move
    def do_Td(self, tx, ty):
        (a,b,c,d,e,f) = self.textstate.matrix
        self.textstate.matrix = (a,b,c,d,e+tx,f+ty)
        self.textstate.linematrix = (0, 0)
        return
    # text-move, also setting the leading to -ty
    def do_TD(self, tx, ty):
        (a,b,c,d,e,f) = self.textstate.matrix
        self.textstate.matrix = (a,b,c,d,e+tx,f+ty)
        self.textstate.leading = -ty
        self.textstate.linematrix = (0, 0)
        return
    # textmatrix
    def do_Tm(self, a,b,c,d,e,f):
        self.textstate.matrix = (a,b,c,d,e,f)
        self.textstate.linematrix = (0, 0)
        return
    # nextline
    def do_T_a(self):
        # T* is "0 -leading Td". The leading is stored as a positive
        # distance (do_TD stores -ty), so it must be subtracted.
        (a,b,c,d,e,f) = self.textstate.matrix
        self.textstate.matrix = (a,b,c,d,e,f-self.textstate.leading)
        self.textstate.linematrix = (0, 0)
        return

    # show-pos
    def do_TJ(self, seq):
        '''
        Shows a sequence of strings and numeric adjustments: computes
        the advance of the whole sequence, hands it to the device and
        moves the line position by that advance.
        '''
        textstate = self.textstate
        font = textstate.font
        (a,b,c,d,e,f) = textstate.matrix
        (lx,ly) = textstate.linematrix
        s = ''.join( x for x in seq if isinstance(x, str) )
        n = sum( x for x in seq if not isinstance(x, str) )
        w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
             len(s) * textstate.charspace +
             s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
        self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
        if font.is_vertical():
            ly += w
        else:
            lx += w
        textstate.linematrix = (lx,ly)
        return
    # show
    def do_Tj(self, s):
        self.do_TJ([s])
        return
    # quote
    def do__q(self, s):
        self.do_T_a()
        self.do_TJ([s])
        return
    # doublequote
    def do__w(self, aw, ac, s):
        # '"' sets the word and character spacing and then behaves like
        # "'", i.e. it also moves to the next line before showing s.
        self.do_Tw(aw)
        self.do_Tc(ac)
        self.do__q(s)
        return

    # inline image
    def do_BI(self): # never called
        return
    def do_ID(self): # never called
        return
    def do_EI(self, obj):
        return

    # invoke an XObject
    def do_Do(self, xobjid):
        '''
        Renders a form XObject with a nested interpreter; other kinds
        of XObject are ignored. Raises PDFInterpreterError for an
        unknown id.
        '''
        xobjid = literal_name(xobjid)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
        if xobj.dic['Subtype'] == LITERAL_FORM:
            if 1 <= self.debug:
                print >>stderr, 'Processing xobj: %r' % xobj
            interpreter = PDFPageInterpreter(self.rsrc, self.device,
                                             debug=self.debug)
            # The form matrix is concatenated with the CTM in effect
            # at the point of invocation.
            matrix = xobj.dic.get('Matrix', MATRIX_IDENTITY)
            interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],
                                        mult_matrix(matrix, self.ctm))
            # The nested interpreter installed its own CTM on the
            # shared device; put ours back.
            self.device.set_ctm(self.ctm)
        return

    def process_page(self, page):
        if 1 <= self.debug:
            print >>stderr, 'Processing page: %r' % page
        self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
        return

    def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
        '''
        Renders content streams as one block named contid: registers
        the fonts, colour spaces and XObjects of the resource
        dictionary, then executes every stream in order.
        '''
        self.initpage(ctm)
        self.device.begin_block(contid)
        # Handle resource declarations.
        for (k,v) in dict_value(resources).iteritems():
            if 1 <= self.debug:
                print >>stderr, 'Resource: %r: %r' % (k,v)
            if k == 'Font':
                for (fontid,fontrsrc) in dict_value(v).iteritems():
                    self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
            elif k == 'ColorSpace':
                for (csid,csspec) in dict_value(v).iteritems():
                    self.csmap[csid] = list_value(csspec)
            elif k == 'ProcSet':
                self.rsrc.get_procset(list_value(v))
            elif k == 'XObject':
                for (xobjid,xobjstrm) in dict_value(v).iteritems():
                    self.xobjmap[xobjid] = xobjstrm
        for stream in list_value(contents):
            self.execute(stream_value(stream))
        self.device.end_block()
        return

    def execute(self, stream):
        '''
        Runs one content stream: keywords are dispatched to the do_*
        methods, everything else is pushed as an operand. An operator
        is skipped silently when the stack holds too few operands.
        Raises PDFInterpreterError for an unknown operator.
        '''
        for obj in stream.parse_data(inline=True, debug=self.debug):
            if isinstance(obj, PSKeyword):
                name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
                if hasattr(self, name):
                    func = getattr(self, name)
                    # operand count = declared parameters minus self
                    nargs = func.func_code.co_argcount-1
                    if nargs:
                        args = self.pop(nargs)
                        if 1 <= self.debug:
                            print >>stderr, 'exec: %s %r' % (obj.name, args)
                        if len(args) == nargs:
                            func(*args)
                    else:
                        if 1 <= self.debug:
                            print >>stderr, 'exec: %s' % (obj.name)
                        func()
                else:
                    raise PDFInterpreterError('unknown operator: %r' % obj.name)
            else:
                self.push(obj)
        return

File diff suppressed because it is too large Load Diff

396
psparser.py Normal file
View File

@ -0,0 +1,396 @@
#!/usr/bin/env python
import sys, re
stderr = sys.stderr
from utils import choplist
## PS Exceptions
##
# Base class of the exceptions defined in this module.
class PSException(Exception): pass
# Caught by CMapParser around its stack pops.
class PSSyntaxError(PSException): pass
# Raised by literal_name() / keyword_name() for an object of the wrong kind.
class PSTypeError(PSException): pass
# Not raised in the part of the file visible here.
class PSValueError(PSException): pass
## PostScript Types
##
class PSLiteral:
    '''
    A PostScript literal name, written "/Name" in source text.

    Do not instantiate directly: obtain instances through
    PSLiteralTable.intern() so that equal names share one object.
    '''

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        text = '/%s' % self.name
        return text
class PSKeyword:
    '''
    A PostScript keyword (an executable name such as "showpage").

    Do not instantiate directly: obtain instances through
    PSKeywordTable.intern() so that equal names share one object.
    '''

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        text = self.name
        return text
class PSSymbolTable:
    '''
    Interning table: hands out exactly one object per name, created on
    first request by calling the class given to the constructor.
    '''

    def __init__(self, classe):
        # dic: name -> interned object; classe: factory called with the name.
        self.dic = {}
        self.classe = classe

    def intern(self, name):
        '''Returns the object for name, creating it on first request.'''
        try:
            return self.dic[name]
        except KeyError:
            pass
        lit = self.classe(name)
        self.dic[name] = lit
        return lit
# Module-wide interning tables: one for literals, one for keywords.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
def literal_name(x):
    '''
    Returns the name of the PSLiteral x.
    Raises PSTypeError when x is not a PSLiteral.
    '''
    if isinstance(x, PSLiteral):
        return x.name
    raise PSTypeError('literal required: %r' % x)
def keyword_name(x):
    '''
    Returns the name of the PSKeyword x.
    Raises PSTypeError when x is not a PSKeyword.
    '''
    if isinstance(x, PSKeyword):
        return x.name
    raise PSTypeError('keyword required: %r' % x)
## PSBaseParser
##
class PSBaseParser:
    '''
    PostScript parser that performs only basic tokenization.

    Data is read from a seekable file object in chunks of
    self.bufsize, cut into lines that end with CR, LF or CR LF,
    and every line is turned into (position, token) pairs by parse().
    '''

    def __init__(self, fp, debug=0):
        '''
        fp:    seekable file object to read from.
        debug: verbosity; messages are written to stderr when 2 <= debug.
        '''
        self.fp = fp
        self.debug = debug
        # number of characters requested from fp per read.
        self.bufsize = 4096
        self.seek(0)
        return

    def __repr__(self):
        return '<PSBaseParser: %r>' % (self.fp,)

    def seek(self, pos):
        '''
        Seeks to the given pos and discards all buffered data.
        '''
        if 2 <= self.debug:
            print >>stderr, 'seek:', pos
        self.fp.seek(pos)
        # linepos: file offset of the line that nextline() returns next.
        self.linepos = pos
        # linebuf/curpos: current chunk and the read offset inside it.
        self.linebuf = None
        self.curpos = 0
        # line: the line currently being tokenized by parse().
        self.line = ''
        return

    EOLCHAR = re.compile(r'[\r\n]')

    def nextline(self):
        '''
        Fetches the next line, including its terminator (CR, LF or
        CR LF).  Returns an empty string at the end of the file.
        '''
        line = ''
        eol = None
        while 1:
            if not self.linebuf or len(self.linebuf) <= self.curpos:
                # fetch next chunk.
                self.linebuf = self.fp.read(self.bufsize)
                if not self.linebuf:
                    # at EOF.
                    break
                self.curpos = 0
            if eol:
                # a terminator was already seen; only a LF directly
                # after a CR still belongs to this line.
                c = self.linebuf[self.curpos]
                # handle '\r\n'
                if (eol == '\r' and c == '\n'):
                    line += c
                    self.curpos += 1
                break
            m = self.EOLCHAR.search(self.linebuf, self.curpos)
            if m:
                i = m.end(0)
                line += self.linebuf[self.curpos:i]
                eol = self.linebuf[i-1]
                self.curpos = i
            else:
                # no terminator in this chunk: take the rest and
                # fetch further.
                line += self.linebuf[self.curpos:]
                self.linebuf = None
        self.linepos += len(line)
        return line

    def revreadlines(self):
        '''
        Fetches lines backward, starting from the end of the file.
        Used to locate trailers.

        Every yielded string starts with the CR or LF character that
        precedes the line in the file.  The very first line of the
        file has no such character in front of it and is not yielded.
        '''
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        # buf holds the tail of a line whose beginning lies in a chunk
        # that has not been read yet.
        buf = ''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.bufsize)
            self.fp.seek(pos)
            # read only up to the start of the previous chunk, so the
            # short chunk at the beginning of the file does not
            # overlap data that was already processed.
            s = self.fp.read(prevpos-pos)
            if not s: break
            while 1:
                n = max(s.rfind('\r'), s.rfind('\n'))
                if n == -1:
                    buf = s + buf
                    break
                # the part found in this (earlier) chunk comes first.
                yield s[n:]+buf
                s = s[:n]
                buf = ''
        return

    SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
    TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
    LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
    NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
    STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
    STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
    STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
    STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
    # whitespace that may separate the digits of a hex string.
    STRING_HEX_SPC = re.compile(r'\s+')

    def parse(self):
        '''
        Yields (pos, token) pairs for the basic tokens: keywords,
        literals, strings, numbers, booleans and parentheses.
        pos is the file offset at which the token starts.
        Comments are skipped.  Parsing stops at a line starting with
        "%%EOF" or at the end of the file.
        Nested objects (i.e. arrays and dictionaries) are not handled.

        Raises PSSyntaxError on an unterminated string.
        '''
        while 1:
            # do not strip line! we need to distinguish last '\n' or '\r'
            linepos0 = self.linepos
            self.line = self.nextline()
            if not self.line: break
            if 2 <= self.debug:
                print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
            # do this before removing comment
            if self.line.startswith('%%EOF'): break
            charpos = 0
            # tokenize
            while 1:
                m = self.TOKEN.search(self.line, charpos)
                if not m: break
                t = m.group(0)
                pos = linepos0 + m.start(0)
                charpos = m.end(0)
                if t == '%':
                    # skip comment
                    if 2 <= self.debug:
                        print >>stderr, 'comment: %r' % self.line[charpos:]
                    break
                elif t == '/':
                    # literal object
                    mn = self.LITERAL.match(self.line, charpos)
                    if mn:
                        name = mn.group(0)
                        charpos = mn.end(0)
                    else:
                        # a lone slash denotes the empty name.
                        name = ''
                    lit = PSLiteralTable.intern(name)
                    if 2 <= self.debug:
                        print >>stderr, 'name: %r' % lit
                    yield (pos, lit)
                elif t == '(':
                    # normal string object
                    s = ''
                    while 1:
                        ms = self.STRING_NORM.match(self.line, charpos)
                        if not ms: break
                        s1 = ms.group(0)
                        charpos = ms.end(0)
                        if len(s1) == 1 and s1[-1] == '\\':
                            s += s1[-1:]
                            # the string continues on the next line;
                            # keep linepos0 in step with self.line so
                            # later tokens get correct positions.
                            linepos0 = self.linepos
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        elif charpos == len(self.line):
                            s += s1
                            linepos0 = self.linepos
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        else:
                            s += s1
                            break
                    # slicing (not indexing) so that a missing paren
                    # at the end of the line is a PSSyntaxError too.
                    if self.line[charpos:charpos+1] != ')':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    def convesc(m):
                        # "\ooo" is an octal character code; any other
                        # escaped character stands for itself.
                        x = m.group(0)
                        if x[1:].isdigit():
                            return chr(int(x[1:], 8))
                        else:
                            return x[1]
                    s = self.STRING_NORM_SUB.sub(convesc, s)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)
                elif t == '<':
                    # hex string object
                    ms = self.STRING_HEX.match(self.line, charpos)
                    if ms:
                        hexdigits = ms.group(0)
                        charpos = ms.end(0)
                    else:
                        # '<>' is an empty string.
                        hexdigits = ''
                    if self.line[charpos:charpos+1] != '>':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    def convhex(m1):
                        return chr(int(m1.group(0), 16))
                    # whitespace between the digits is not part of the
                    # string value.
                    hexdigits = self.STRING_HEX_SPC.sub('', hexdigits)
                    s = self.STRING_HEX_SUB.sub(convhex, hexdigits)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)
                elif self.NUMBER.match(t):
                    # number
                    if '.' in t:
                        n = float(t)
                    else:
                        n = int(t)
                    if 2 <= self.debug:
                        print >>stderr, 'number: %r' % n
                    yield (pos, n)
                elif t in ('true','false'):
                    # boolean
                    if 2 <= self.debug:
                        print >>stderr, 'boolean: %r' % t
                    yield (pos, (t == 'true'))
                else:
                    # other token
                    if 2 <= self.debug:
                        print >>stderr, 'keyword: %r' % t
                    yield (pos, PSKeywordTable.intern(t))
        return
## PSStackParser
##
class PSStackParser(PSBaseParser):
    '''
    PostScript parser that recognizes compound objects
    such as arrays and dictionaries.
    '''

    def __init__(self, fp, debug=0):
        PSBaseParser.__init__(self, fp, debug=debug)
        # context: stack of (type, partobj) pairs saved while a nested
        # object is being collected.
        self.context = []
        # partobj: objects collected so far for the innermost open
        # object; it is None until parse() starts.
        self.partobj = None
        return

    def do_token(self, pos, token):
        '''
        Handles special tokens.
        Returns true if the token denotes the end of an object.

        This implementation handles nothing and always returns False.
        '''
        return False

    def push(self, obj):
        '''
        Push an object to the stack.
        '''
        self.partobj.append(obj)
        return

    def pop(self, n):
        '''
        Pop N objects from the stack and return them as a list,
        oldest first.  Popping zero objects returns an empty list and
        leaves the stack untouched.

        Raises PSSyntaxError when fewer than N objects are available.
        '''
        if len(self.partobj) < n:
            raise PSSyntaxError('stack too short < %d' % n)
        # cut at an explicit index: partobj[-n:] would return the
        # whole stack (and partobj[:-n] would empty it) for n == 0.
        cut = len(self.partobj) - n
        r = self.partobj[cut:]
        self.partobj = self.partobj[:cut]
        return r

    def popall(self):
        '''
        Discards all the objects on the stack.
        '''
        self.partobj = []
        return

    def parse(self):
        '''
        Returns a list of objects: keywords, literals, strings,
        numbers, arrays and dictionaries. Arrays and dictionaries
        are represented as Python sequence and dictionaries.

        The braces "{" and "}" are pushed as keywords.  Every other
        keyword is handed to do_token(); parsing stops as soon as
        do_token() returns true.

        Raises PSTypeError on mismatched brackets or on a dictionary
        with an odd number of elements.
        '''
        def startobj(objtype):
            # save the enclosing object and start collecting a new one.
            self.context.append((objtype, self.partobj))
            self.partobj = []
            return
        def endobj(type1):
            # finish the innermost object and restore its parent.
            assert self.context
            obj = self.partobj
            (type0, self.partobj) = self.context.pop()
            if type0 != type1:
                raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
                                  (type0, self.partobj, type1, obj))
            return obj

        startobj('o')
        for (pos,t) in PSBaseParser.parse(self):
            if isinstance(t, (int, float, str, PSLiteral)):
                # plain values go onto the stack as they are.
                self.push(t)
                continue
            c = keyword_name(t)
            if c == '{' or c == '}':
                self.push(t)
            elif c == '[':
                # begin array
                if 2 <= self.debug:
                    print >>stderr, 'start array'
                startobj('a')
            elif c == ']':
                # end array
                a = endobj('a')
                if 2 <= self.debug:
                    print >>stderr, 'end array: %r' % a
                self.push(a)
            elif c == '<<':
                # begin dictionary
                if 2 <= self.debug:
                    print >>stderr, 'start dict'
                startobj('d')
            elif c == '>>':
                # end dictionary
                objs = endobj('d')
                if len(objs) % 2 != 0:
                    raise PSTypeError('invalid dictionary construct: %r' % objs)
                # the elements alternate between literal key and value.
                d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
                if 2 <= self.debug:
                    print >>stderr, 'end dict: %r' % d
                self.push(d)
            elif self.do_token(pos, t):
                break
        return endobj('o')

29
utils.py Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env python
## Utilities
##
def choplist(n, seq):
    '''
    Groups every n elements of seq into a tuple.
    Trailing elements that do not fill a whole group are dropped.
    '''
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []
    return
def nunpack(s, default=0):
    '''
    Unpacks up to 4 bytes as a big-endian unsigned integer.

    s:       byte string of length 0 to 4.
    default: value returned when s is empty.

    Raises TypeError when s is longer than 4 bytes.
    '''
    # imported here because this module has no import of its own
    # for struct.
    from struct import unpack
    l = len(s)
    if not l:
        return default
    if 4 < l:
        raise TypeError('invalid length: %d' % l)
    # read the bytes one by one and combine them, most significant
    # byte first; this covers every length from 1 to 4 uniformly.
    n = 0
    for byte in unpack('%dB' % l, s):
        n = (n << 8) | byte
    return n