split files.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@4 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
60d291d08b
commit
6d93b4a7f7
|
@ -0,0 +1,383 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
from utils import choplist, nunpack
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||
PSLiteral, PSKeyword, literal_name, keyword_name, \
|
||||
PSStackParser
|
||||
try:
|
||||
import cdb
|
||||
except ImportError:
|
||||
import pycdb as cdb
|
||||
|
||||
|
||||
## CMap
|
||||
##
|
||||
class CMap:
    '''In-memory character map.

    code2cid maps character-code strings to integer CIDs, cid2code maps
    integer CIDs to code strings, and attrs holds the key/value pairs
    collected while parsing (e.g. CMapName, WMode).
    '''

    def __init__(self, debug=0):
        self.debug = debug
        self.code2cid = {}
        self.cid2code = {}
        self.attrs = {}
        return

    def __repr__(self):
        return '<CMap: %s>' % self.attrs.get('CMapName')

    def update(self, code2cid=None, cid2code=None):
        '''Merges the given dictionaries into this map and returns self.'''
        if code2cid:
            self.code2cid.update(code2cid)
        if cid2code:
            self.cid2code.update(cid2code)
        return self

    def copycmap(self, cmap):
        '''Copies every code2cid/cid2code entry of another cmap into this one.'''
        self.code2cid.update(cmap.getall_code2cid())
        self.cid2code.update(cmap.getall_cid2code())
        return self

    def register_code2cid(self, code, cid):
        '''Registers a single code string -> CID entry and returns self.'''
        assert isinstance(code, str)
        assert isinstance(cid, int)
        self.code2cid[code] = cid
        return self

    def register_cid2code(self, cid, code):
        '''Registers a single CID -> code entry and returns self.

        A PSLiteral code is treated as a glyph name and converted to a
        two-byte big-endian string through the glyph list.
        '''
        from glyphlist import charname2unicode
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            code = pack('>H', charname2unicode[code.name])
        self.cid2code[cid] = code
        return self

    def decode(self, bytes):
        '''Generates the CIDs for a string of one- or two-byte codes.

        A byte that is not a known single-byte code is held back and tried
        again together with the following byte; an unknown pair is dropped.
        '''
        if self.debug:
            stderr.write('decode: %r, %r\n' % (self, bytes))
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            else:
                x = c
        return

    def is_vertical(self):
        '''Returns True if the WMode attribute selects vertical writing.'''
        # attrs values are stored exactly as the parser produced them, so a
        # "/WMode 1 def" line may yield the integer 1 rather than the
        # string '1'; accept both spellings.
        return self.attrs.get('WMode', '0') in ('1', 1)

    def tocid(self, code):
        '''Returns the CID for a code string, or None if unmapped.'''
        return self.code2cid.get(code)

    def tocode(self, cid):
        '''Returns the code string for a CID, or None if unmapped.'''
        return self.cid2code.get(cid)

    def getall_attrs(self):
        return self.attrs.iteritems()

    def getall_code2cid(self):
        return self.code2cid.iteritems()

    def getall_cid2code(self):
        return self.cid2code.iteritems()
|
||||
|
||||
|
||||
## CDBCMap
|
||||
##
|
||||
class CDBCMap(CMap):
    '''CMap whose tables live in a cdb (constant database) file.

    Record layout, as used by the accessors below:
      'c' + code            -> CID packed as '>L'
      'i' + '>L'-packed CID -> code string
      '/' + attribute name  -> attribute value
    Entries found by decode() are cached in self.code2cid.
    '''

    def __init__(self, cdbname, debug=0):
        CMap.__init__(self, debug=debug)
        self.cdbname = cdbname
        self.db = cdb.init(cdbname)
        return

    def __repr__(self):
        return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)

    def tocid(self, code):
        '''Returns the integer CID for a code string, or None if unmapped.'''
        k = 'c'+code
        if not self.db.has_key(k):
            return None
        # unpack() always returns a tuple; return the bare integer so the
        # result has the same shape as CMap.tocid() and decode().
        (cid,) = unpack('>L', self.db[k])
        return cid

    def tocode(self, cid):
        '''Returns the code string for a CID, or None if unmapped.'''
        k = 'i'+pack('>L', cid)
        if not self.db.has_key(k):
            return None
        return self.db[k]

    def is_vertical(self):
        return (self.db.has_key('/WMode') and
                self.db['/WMode'] == '1')

    def _each_record(self, prefix):
        '''Generates (key-without-prefix, raw value) for matching records.'''
        while 1:
            x = self.db.each()
            if not x: break
            (k, v) = x
            if k.startswith(prefix):
                yield (k[1:], v)
        return

    def getall(self, c):
        '''Generates (key, integer) pairs for records whose value is a
        '>L'-packed integer, i.e. the 'c' (code -> CID) records.'''
        for (k, v) in self._each_record(c):
            yield (k, unpack('>L', v)[0])
        return

    def getall_attrs(self):
        for (k, v) in self._each_record('/'):
            # NOTE(review): eval() on database content executes arbitrary
            # expressions; only safe while the .cdb files are trusted.
            yield (k, eval(v)[0])
        return

    def getall_cid2code(self):
        '''Generates (CID, code string) pairs.'''
        # In an 'i' record the packed integer is the key and the code
        # string is the value (see tocode), the reverse of a 'c' record.
        for (k, v) in self._each_record('i'):
            yield (unpack('>L', k)[0], v)
        return

    def getall_code2cid(self):
        '''Generates (code string, CID) pairs.'''
        return self.getall('c')

    def decode(self, bytes):
        '''Generates the CIDs for a string of one- or two-byte codes,
        consulting the in-memory cache first and then the database.'''
        if self.debug:
            stderr.write('decode: %r, %r\n' % (self, bytes))
        x = ''
        for c in bytes:
            if x:
                if x+c in self.code2cid:
                    yield self.code2cid[x+c]
                elif self.db.has_key('c'+x+c):
                    (dest,) = unpack('>L', self.db['c'+x+c])
                    self.code2cid[x+c] = dest
                    yield dest
                x = ''
            elif c in self.code2cid:
                yield self.code2cid[c]
            elif self.db.has_key('c'+c):
                (dest,) = unpack('>L', self.db['c'+c])
                self.code2cid[c] = dest
                yield dest
            else:
                x = c
        return
|
||||
|
||||
|
||||
## CMapDB
|
||||
##
|
||||
class CMapDB:
    '''Process-wide registry that loads CMaps by name and caches them.'''

    class CMapNotFound(Exception): pass

    CMAP_ALIAS = {
    }

    debug = 0
    dirname = None
    cdbdirname = None
    cmapdb = {}

    @classmethod
    def initialize(klass, dirname, cdbdirname=None, debug=0):
        '''Sets the directories searched for CMap files and .cmap.cdb files.

        cdbdirname defaults to dirname.
        '''
        klass.dirname = dirname
        klass.cdbdirname = cdbdirname or dirname
        klass.debug = debug
        return

    @classmethod
    def get_cmap(klass, cmapname):
        '''Returns the CMap called cmapname, loading it on first use.

        A compiled '<name>.cmap.cdb' file is preferred over the plain CMap
        file. Raises CMapDB.CMapNotFound if neither file exists.
        '''
        import os.path
        cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
        if cmapname in klass.cmapdb:
            return klass.cmapdb[cmapname]
        fname = os.path.join(klass.dirname, cmapname)
        cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
        if os.path.exists(cdbname):
            if 1 <= klass.debug:
                stderr.write('Opening: CDBCMap %r...\n' % cdbname)
            cmap = CDBCMap(cdbname)
        elif os.path.exists(fname):
            if 1 <= klass.debug:
                stderr.write('Reading: CMap %r...\n' % fname)
            cmap = CMap()
            fp = open(fname)
            try:
                CMapParser(cmap, fp).parse()
            finally:
                # Release the handle even when parsing fails.
                fp.close()
        else:
            # Previously fell through to an UnboundLocalError on `cmap`.
            raise klass.CMapNotFound(cmapname)
        klass.cmapdb[cmapname] = cmap
        return cmap
|
||||
|
||||
|
||||
## CMapParser
|
||||
##
|
||||
class CMapParser(PSStackParser):
    '''Parser that fills a CMap object from CMap source text.

    Operators are handled only between `begincmap` and `endcmap`.
    '''

    def __init__(self, cmap, fp, debug=0):
        PSStackParser.__init__(self, fp, debug=debug)
        self.cmap = cmap
        self.in_cmap = False
        return

    def do_token(self, _, token):
        '''Dispatches one keyword token to the matching section handler.'''
        name = token.name
        if name == 'begincmap':
            self.in_cmap = True
            self.popall()
            return
        if name == 'endcmap':
            self.in_cmap = False
            return
        if not self.in_cmap:
            return

        # Every "begin..." operator only discards what is on the stack.
        if name in ('begincodespacerange', 'begincidrange', 'begincidchar',
                    'beginbfrange', 'beginbfchar', 'beginnotdefrange'):
            self.popall()
        elif name == 'def':
            self._do_def()
        elif name == 'usecmap':
            self._do_usecmap()
        elif name == 'endcodespacerange':
            # Code space ranges are not used; only reported when debugging.
            if 1 <= self.debug:
                stderr.write('codespace: %r\n' % self.partobj)
            self.popall()
        elif name == 'endcidrange':
            self._do_endcidrange()
        elif name == 'endcidchar':
            self._do_endcidchar()
        elif name == 'endbfrange':
            self._do_endbfrange()
        elif name == 'endbfchar':
            self._do_endbfchar()
        elif name == 'endnotdefrange':
            # notdef ranges are not used; only reported when debugging.
            if 1 <= self.debug:
                stderr.write('notdefrange: %r\n' % self.partobj)
            self.popall()
        return

    def _do_def(self):
        '''Stores a "key value def" pair in the CMap attributes.'''
        try:
            (k, v) = self.pop(2)
            self.cmap.attrs[literal_name(k)] = v
        except PSSyntaxError:
            pass

    def _do_usecmap(self):
        '''Copies the entries of the named CMap into the current one.'''
        try:
            (cmapname,) = self.pop(1)
            self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
        except PSSyntaxError:
            pass

    def _do_endcidrange(self):
        '''Registers code -> CID entries for (start, end, first CID) triples.'''
        for (start, end, cid) in choplist(3, self.partobj):
            assert isinstance(start, str)
            assert isinstance(end, str)
            assert isinstance(cid, int)
            assert len(start) == len(end)
            # Only the last (up to) four bytes vary across the range.
            (head, tail) = (start[:-4], start[-4:])
            assert head == end[:-4]
            first = nunpack(tail)
            last = nunpack(end[-4:])
            width = len(tail)
            assert first <= last
            for offset in xrange(last-first+1):
                code = head+pack('>L', first+offset)[-width:]
                self.cmap.register_code2cid(code, cid+offset)
        self.popall()

    def _do_endcidchar(self):
        '''Registers code -> CID entries for pairs of strings.'''
        # NOTE(review): both operands are required to be strings and the
        # first one is the one converted to the integer CID; confirm this
        # operand order against the CMap files actually in use.
        for (cid, code) in choplist(2, self.partobj):
            assert isinstance(code, str)
            assert isinstance(cid, str)
            self.cmap.register_code2cid(code, nunpack(cid))
        self.popall()

    def _do_endbfrange(self):
        '''Registers CID -> code entries for (start, end, target) triples.

        The target is either a list holding one code per CID or a single
        code string whose trailing bytes are incremented across the range.
        '''
        for (start, end, code) in choplist(3, self.partobj):
            assert isinstance(start, str)
            assert isinstance(end, str)
            assert len(start) == len(end)
            first = nunpack(start)
            last = nunpack(end)
            assert first <= last
            count = last-first+1
            if isinstance(code, list):
                for offset in xrange(count):
                    self.cmap.register_cid2code(first+offset, code[offset])
                continue
            (head, tail) = (code[:-4], code[-4:])
            base = nunpack(tail)
            width = len(tail)
            for offset in xrange(count):
                target = head+pack('>L', base+offset)[-width:]
                self.cmap.register_cid2code(first+offset, target)
        self.popall()

    def _do_endbfchar(self):
        '''Registers CID -> code entries for pairs of strings.'''
        for (cid, code) in choplist(2, self.partobj):
            assert isinstance(cid, str)
            assert isinstance(code, str)
            self.cmap.register_cid2code(nunpack(cid), code)
        self.popall()
|
||||
|
||||
|
||||
## FontMetricsDB
|
||||
##
|
||||
class FontMetricsDB:
    '''Lookup table of built-in font metrics, keyed by font name.'''

    from fontmetrics import FONT_METRICS

    @classmethod
    def get_metrics(klass, fontname):
        '''Returns the FONT_METRICS entry for fontname.

        Raises KeyError if the font has no built-in metrics.
        '''
        metrics = klass.FONT_METRICS
        return metrics[fontname]
|
||||
|
||||
|
||||
## EncodingDB
|
||||
##
|
||||
class EncodingDB:
    '''Character-code -> unicode tables for the four Latin encodings.'''

    from glyphlist import charname2unicode
    from latin_enc import ENCODING

    # One table per column of ENCODING; a false code means "not encoded".
    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    for (name,std,mac,win,pdf) in ENCODING:
        c = unichr(charname2unicode[name])
        if std:
            std2unicode[std] = c
        if mac:
            mac2unicode[mac] = c
        if win:
            win2unicode[win] = c
        if pdf:
            pdf2unicode[pdf] = c

    encodings = {
        'StandardEncoding': std2unicode,
        'MacRomanEncoding': mac2unicode,
        'WinAnsiEncoding': win2unicode,
        'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        '''Returns the code -> unicode table for an encoding name.

        Unknown names fall back to StandardEncoding. diff is a sequence
        in which an integer sets the current code and each following
        PSLiteral glyph name is assigned to consecutive codes. Glyph names
        missing from the glyph list are skipped but still use up a code.
        When diff is given a modified copy is returned; the shared base
        table is never changed.
        '''
        base = klass.encodings.get(name, klass.std2unicode)
        if not diff:
            return base
        table = base.copy()
        cid = 0
        for item in diff:
            if isinstance(item, int):
                cid = item
                continue
            if not isinstance(item, PSLiteral):
                continue
            try:
                table[cid] = unichr(EncodingDB.charname2unicode[item.name])
            except KeyError:
                pass
            cid += 1
        return table
|
|
@ -0,0 +1,111 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdfparser import PDFDocument, PDFParser
|
||||
from pdfinterp import PDFDevice, PDFResourceManager, \
|
||||
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
||||
mult_matrix, apply_matrix
|
||||
from cmap import CMapDB
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
class TextConverter(PDFDevice):
    '''Device that writes every rendered string as one tagged text line.'''

    def __init__(self, outfp, rsrc, codec):
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec

    def close(self):
        self.outfp.write('\n')

    def begin_block(self, name):
        self.outfp.write('<block name="%s">\n' % name)

    def end_block(self):
        self.outfp.write('</block>\n')

    def render_string(self, textstate, textmatrix, size, seq):
        '''Writes one htext/vtext element for a text-showing operator.

        seq mixes numbers (position adjustments) and encoded strings. A
        number at or below the space threshold becomes a blank in
        horizontal text; strings are decoded to CIDs and then to unicode.
        '''
        font = textstate.font
        # A space is assumed once the adjustment reaches 60% of the width
        # of character code 32 (negated).
        spwidth = int(-font.char_width(32) * 0.6)
        pieces = []
        for item in seq:
            if isinstance(item, (int, float)):
                if not font.is_vertical() and item <= spwidth:
                    pieces.append(' ')
                continue
            for cid in font.decode(item):
                try:
                    char = font.to_unicode(cid)
                except PDFUnicodeNotDefined:
                    # Unmapped CIDs are shown as a [coding:cid] placeholder.
                    (cidcoding, cid) = sys.exc_info()[1].args
                    char = u'[%s:%d]' % (cidcoding, cid)
                pieces.append(char)
        text = ''.join(pieces)

        (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
        if font.is_vertical():
            size = -size
            tag = 'vtext'
        else:
            tag = 'htext'
        if b != 0 or c != 0:
            tag += ' skewed'
        # NOTE(review): only characters the codec cannot encode are
        # replaced here; markup characters in the text, font name and the
        # ' skewed' suffix in the closing tag are written as they are.
        s = text.encode(self.codec, 'xmlcharrefreplace')
        (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
        (fs_s, tx_s, ty_s, w_s) = ['%.03f' % v for v in (fs, tx, ty, w)]
        self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
                         (tag, font.fontname, fs_s, tx_s, ty_s, w_s, s, tag))
|
||||
|
||||
|
||||
# pdf2txt
|
||||
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
    '''Converts one PDF file to text on outfp.

    pages is a collection of zero-based page indexes; when it is empty
    every page is processed.
    '''
    device = TextConverter(outfp, rsrc, codec)
    doc = PDFDocument(debug=debug)
    # PDF is a binary format: read it in binary mode so that no newline
    # translation is applied on any platform.
    fp = open(fname, 'rb')
    try:
        # The parser object is not used directly below; it is constructed
        # with the document and the file, as in the original code.
        parser = PDFParser(doc, fp, debug=debug)
        interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
        for (i,page) in enumerate(doc.get_pages(debug=debug)):
            if pages and (i not in pages): continue
            interpreter.process_page(page)
    finally:
        # Close the input even if parsing or rendering fails.
        fp.close()
    device.close()
    return
|
||||
|
||||
|
||||
# main
|
||||
def main(argv):
    '''Command line entry point.

    Options: -d raises the debug level (repeatable), -p adds a zero-based
    page index, -c sets the output codec, -o sets the output file and -v
    is accepted but ignored. Returns 100 after printing the usage text,
    None otherwise.
    '''
    import getopt
    def usage():
        print('usage: %s [-d] [-v] [-c codec] [-p pages] [-o output] file ...' % argv[0])
        return 100
    try:
        # 'o:' was missing, so the -o branch below could never be reached.
        (opts, args) = getopt.getopt(argv[1:], 'dvp:c:o:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    outfp = stdout
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pages.add(int(v))
        elif k == '-o': outfp = open(v, 'wb')
        elif k == '-c': codec = v
    #
    CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
    rsrc = PDFResourceManager(debug=debug)
    for fname in args:
        pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug)
    return
|
||||
|
||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
|
@ -0,0 +1,827 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||
PSStackParser, PSLiteral, PSKeyword, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||
from pdfparser import resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value, PDFException
|
||||
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||
|
||||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFResourceError(PDFException):
    '''Error related to a PDF resource.'''

class PDFInterpreterError(PDFException):
    '''Error raised by the page interpreter.'''

class PDFFontError(PDFException):
    '''Error in a font specification.'''

class PDFUnicodeNotDefined(PDFFontError):
    '''Raised as (cidcoding, cid) when a CID has no unicode mapping.'''
|
||||
|
||||
|
||||
## Constants
|
||||
##
|
||||
# Interned literals compared against names found in PDF objects.
LITERAL_PDF = PSLiteralTable.intern('PDF')
LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')

# (a, b, c, d, e, f) of the identity transformation.
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)

# Number of colour operands for each colour space with a fixed count.
CS_COMPONENTS = dict(
    (PSLiteralTable.intern(csname), ncomponents)
    for (csname, ncomponents) in (
        ('CalRGB', 3),
        ('CalGray', 1),
        ('Lab', 3),
        ('DeviceRGB', 3),
        ('DeviceCMYK', 4),
        ('DeviceGray', 1),
        ('Separation', 1),
        ('Indexed', 1),
        ('Pattern', 1),
    ))
|
||||
|
||||
|
||||
## Matrix operations
|
||||
##
|
||||
def mult_matrix(m1, m0):
    '''Multiplies two matrices.

    Both arguments are 6-tuples (a, b, c, d, e, f). The result is the
    transformation that applies m1 first and m0 second.
    '''
    # The tuples are unpacked here rather than in the signature: tuple
    # parameters were removed from the language in Python 3, and this
    # form is accepted by every Python version.
    (a1, b1, c1, d1, e1, f1) = m1
    (a0, b0, c0, d0, e0, f0) = m0
    return (a0*a1+c0*b1,    b0*a1+d0*b1,
            a0*c1+c0*d1,    b0*c1+d0*d1,
            a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||
|
||||
def apply_matrix(m, v):
    '''Applies a matrix to a coordinate.

    m is a 6-tuple (a, b, c, d, e, f) and v is a pair (x, y); the
    transformed (x, y) pair is returned.
    '''
    # Unpacked in the body because tuple parameters do not exist in
    # Python 3; callers still pass the same two positional tuples.
    (a, b, c, d, e, f) = m
    (x, y) = v
    return (a*x+c*y+e, b*x+d*y+f)
|
||||
|
||||
def cs_params(cs):
    '''Returns the number of colour operands for a colour space.

    cs is a sequence whose first item is the colour space name. ICCBased
    reads the count from the 'N' entry of its stream dictionary, DeviceN
    counts the items of its second element, and every other name is
    looked up in CS_COMPONENTS (KeyError if it is not listed).
    '''
    family = cs[0]
    if family == LITERAL_ICC_BASED:
        return stream_value(cs[1]).dic['N']
    if family == LITERAL_DEVICE_N:
        return len(list_value(cs[1]))
    return CS_COMPONENTS[family]
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
|
||||
# PDFFont
|
||||
class PDFFont:
    '''Base class of all fonts: descriptor values plus a width table.'''

    def __init__(self, fontid, descriptor, widths, default_width=None):
        '''Stores the descriptor fields and the cid -> width dictionary.

        descriptor must contain FontName, Ascent, Descent and FontBBox.
        default_width is used for characters missing from widths; when it
        is None the descriptor's MissingWidth (or 0) is used instead.
        '''
        self.fontid = fontid
        self.descriptor = descriptor
        self.widths = widths
        self.fontname = descriptor['FontName']
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.ascent = descriptor['Ascent']
        self.descent = descriptor['Descent']
        # Test against None explicitly: `default_width or ...` threw away
        # a default width of 0 that the caller had passed on purpose.
        if default_width is None:
            default_width = descriptor.get('MissingWidth', 0)
        self.default_width = default_width
        self.leading = descriptor.get('Leading', 0)
        self.bbox = descriptor['FontBBox']
        return

    def __repr__(self):
        return '<PDFFont: fontid=%r>' % (self.fontid,)

    def is_vertical(self):
        return False

    def decode(self, bytes):
        '''Returns the list of character codes, one per byte.'''
        return [ ord(c) for c in bytes ]

    def char_width(self, cid):
        '''Returns the width of a character, or the default width.'''
        return self.widths.get(cid, self.default_width)

    def char_disp(self, cid):
        return 0

    def string_width(self, s):
        '''Returns the sum of the widths of the decoded characters of s.'''
        return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
    '''Font whose character codes are single bytes.'''

    def __init__(self, fontid, descriptor, widths, spec):
        # The Encoding entry is either the name of a built-in encoding or
        # a dictionary naming a base encoding plus its differences; the
        # standard encoding applies when the entry is absent.
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        diff = None
        if isinstance(encoding, dict):
            diff = encoding.get('Differences', None)
            encoding = encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)
        self.encoding = EncodingDB.get_encoding(literal_name(encoding), diff)
        # A ToUnicode stream, when present, takes precedence in to_unicode.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
        PDFFont.__init__(self, fontid, descriptor, widths)
        return

    def to_unicode(self, cid):
        '''Returns the unicode text for a character code.

        Raises PDFUnicodeNotDefined(None, cid) if there is no mapping.
        '''
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
            if not code:
                raise PDFUnicodeNotDefined(None, cid)
            # The code is a string of big-endian 16-bit values.
            chars = unpack('>%dH' % (len(code)//2), code)
            return ''.join( unichr(c) for c in chars )
        try:
            return self.encoding[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
    '''Type 1 font; built-in metrics are preferred over the font spec.'''

    def __init__(self, fontid, spec):
        if 'BaseFont' not in spec:
            raise PDFFontError('BaseFont is missing')
        self.basefont = literal_name(spec['BaseFont'])
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # No built-in metrics: take everything from the font dictionary.
            try:
                descriptor = dict_value(spec['FontDescriptor'])
                firstchar = int_value(spec['FirstChar'])
                # LastChar is looked up (and therefore required) but its
                # value is not used.
                int_value(spec['LastChar'])
                widths = {}
                for (i, w) in enumerate(list_value(spec['Widths'])):
                    widths[firstchar+i] = w
            except KeyError:
                missing = sys.exc_info()[1]
                raise PDFFontError('%s is missing' % missing)
        PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
        return
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
    '''TrueType font; handled exactly like a Type 1 font.'''
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
    '''Type 3 font; the FontDescriptor entry is optional.'''

    def __init__(self, fontid, spec):
        try:
            firstchar = int_value(spec['FirstChar'])
            # LastChar is looked up (and therefore required) but its value
            # is not used.
            int_value(spec['LastChar'])
            widths = {}
            for (i, w) in enumerate(list_value(spec['Widths'])):
                widths[firstchar+i] = w
        except KeyError:
            missing = sys.exc_info()[1]
            raise PDFFontError('%s is missing' % missing)
        if 'FontDescriptor' not in spec:
            # Build a minimal descriptor from the font dictionary itself.
            descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
                          'FontBBox':spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
        return
|
||||
|
||||
# PDFCIDFont
|
||||
|
||||
## TrueTypeFont
|
||||
##
|
||||
class TrueTypeFont:
    '''Minimal reader for the table directory and cmap table of a
    TrueType font program.'''

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        '''Reads the table directory from fp, a seekable binary file.

        self.tables maps each 4-byte table tag to (offset, length).
        '''
        self.name = name
        self.fp = fp
        self.tables = {}
        fp.read(4)  # skip the 4-byte font type tag
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for _ in range(ntables):
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        '''Builds a CMap from the font's 'cmap' table.

        code2cid receives character code -> glyph id and cid2code receives
        glyph id -> character code packed as '>H'. Raises
        TrueTypeFont.CMapNotFound if the font has no 'cmap' table.
        '''
        if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for i in xrange(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in xrange(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in xrange(entcount):
                        # unpack() returns a 1-tuple; the tuple itself was
                        # used before, so `gid += delta` raised TypeError.
                        (gid,) = unpack('>H', fp.read(2))
                        if gid:
                            # Wrap to 16 bits, as the format 4 branch does.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
                    if idr:
                        # NOTE(review): every segment seeks relative to the
                        # start of the idRangeOffset array; the format
                        # defines the offset relative to the segment's own
                        # entry - confirm against the cmap specification.
                        fp.seek(pos+idr)
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.iteritems() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        # CMap() takes only a debug level; the name used to be passed in
        # its place, which switched debug output on and left the map
        # unnamed. Store it as the CMapName attribute instead.
        cmap = CMap()
        cmap.attrs['CMapName'] = cmapname
        return cmap.update(char2gid, gid2char)
|
||||
|
||||
class PDFCIDFont(PDFFont):
    '''CID-keyed font: codes are decoded through a CMap into CIDs.'''

    def __init__(self, fontid, spec):
        if 'BaseFont' not in spec:
            raise PDFFontError('BaseFont is missing')
        try:
            self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
            self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
                                        self.cidsysteminfo['Ordering'])
        except KeyError:
            raise PDFFontError('CIDSystemInfo not properly defined.')
        self.basefont = literal_name(spec['BaseFont'])
        self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
        descriptor = dict_value(spec['FontDescriptor'])

        # An embedded TrueType program can supply a unicode map below.
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))

        # CID -> unicode map, in order of preference: the ToUnicode stream,
        # the embedded TrueType cmap (Adobe-Identity only), or the
        # predefined '<coding>-UCS2' CMap.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)

        def get_width(seq):
            # Expands a width array made of "first [v1 v2 ...]" groups
            # (consecutive CIDs) and "first last v" groups (one value for
            # the whole range) into a cid -> value dictionary.
            table = {}
            first = last = None
            for item in seq:
                if first is None:
                    first = item
                    continue
                if last is None and isinstance(item, int):
                    last = item
                    continue
                if last is None:
                    for (offset, value) in enumerate(item):
                        table[first+offset] = value
                else:
                    for cid in xrange(first, last+1):
                        table[cid] = item
                first = last = None
            return table

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            # NOTE(review): each W2 value is unpacked below as a
            # (displacement, width) pair - confirm against the W2 layout.
            metrics = get_width(list_value(spec.get('W2', [])))
            widths = {}
            disps = {}
            for (cid, (disp, width)) in metrics.iteritems():
                widths[cid] = width
                disps[cid] = disp
            self.disps = disps
            (default_disp, default_width) = spec.get('DW2', [880, -1000])
            self.default_disp = default_disp
        else:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        PDFFont.__init__(self, fontid, descriptor, widths, default_width)
        return

    def is_vertical(self):
        return self.vertical

    def decode(self, bytes):
        '''Returns the CIDs for an encoded string, using the font's CMap.'''
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        '''Returns the displacement of a CID, or the default displacement.'''
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        '''Returns the unicode text for a CID.

        Raises PDFUnicodeNotDefined(cidcoding, cid) if there is no unicode
        map at all or the CID is missing from it.
        '''
        code = None
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        # The code is a string of big-endian 16-bit values.
        chars = unpack('>%dH' % (len(code)//2), code)
        return ''.join( unichr(c) for c in chars )
|
||||
|
||||
|
||||
## Resource Manager
|
||||
##
|
||||
class PDFResourceManager:

    '''
    ResourceManager facilitates reuse of shared resources
    such as fonts, images and cmaps so that large objects are not
    allocated multiple times.
    '''

    def __init__(self, debug=0):
        self.debug = debug
        # Font objects created so far, keyed by font id.
        self.fonts = {}

    def get_procset(self, procs):
        '''Walks the ProcSet names; nothing is done for any of them.'''
        for proc in procs:
            if proc == LITERAL_PDF or proc == LITERAL_TEXT:
                continue
            # Any other procedure set is ignored on purpose rather than
            # reported as a PDFResourceError.

    def get_cmap(self, name):
        '''Returns the named CMap from CMapDB.'''
        return CMapDB.get_cmap(name)

    def get_font(self, fontid, spec):
        '''Returns the font object for fontid, creating it on first use.

        Raises PDFFontError if the Subtype is missing or not supported.
        '''
        if fontid in self.fonts:
            return self.fonts[fontid]
        spec = dict_value(spec)
        assert spec['Type'] == LITERAL_FONT
        if 'Subtype' not in spec:
            raise PDFFontError('Font Subtype is not specified.')
        subtype = literal_name(spec['Subtype'])
        if subtype in ('Type1', 'MMType1'):
            font = PDFType1Font(fontid, spec)
        elif subtype == 'TrueType':
            font = PDFTrueTypeFont(fontid, spec)
        elif subtype == 'Type3':
            font = PDFType3Font(fontid, spec)
        elif subtype in ('CIDFontType0', 'CIDFontType2'):
            font = PDFCIDFont(fontid, spec)
        elif subtype == 'Type0':
            # A Type0 font wraps exactly one descendant font; Encoding and
            # ToUnicode are copied from the parent into a copy of the
            # descendant's dictionary before the font is built from it.
            dfonts = list_value(spec['DescendantFonts'])
            assert len(dfonts) == 1
            subspec = dict_value(dfonts[0]).copy()
            for k in ('Encoding', 'ToUnicode'):
                if k in spec:
                    subspec[k] = resolve1(spec[k])
            font = self.get_font(fontid, subspec)
        else:
            raise PDFFontError('Invalid Font: %r' % spec)
        self.fonts[fontid] = font
        return font
|
||||
|
||||
|
||||
## PDFDevice
|
||||
##
|
||||
class PDFDevice:
    '''Base class of output devices; every hook except render_string is
    a no-op here.'''

    def __init__(self, rsrc):
        self.rsrc = rsrc
        # Current transformation matrix; unset until set_ctm() is called.
        self.ctm = None

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        '''Called when output is finished. Does nothing here.'''
        return None

    def set_ctm(self, ctm):
        '''Remembers the current transformation matrix.'''
        self.ctm = ctm

    def begin_block(self, name):
        '''Called at the start of a named block. Does nothing here.'''
        return None

    def end_block(self):
        '''Called at the end of a block. Does nothing here.'''
        return None

    def render_string(self, textstate, textmatrix, size, seq):
        '''Must be overridden by subclasses.'''
        raise NotImplementedError
|
||||
|
||||
|
||||
## Interpreter
|
||||
##
|
||||
class PDFPageInterpreter:
|
||||
|
||||
class TextState:
|
||||
def __init__(self):
|
||||
self.font = None
|
||||
self.fontsize = 0
|
||||
self.charspace = 0
|
||||
self.wordspace = 0
|
||||
self.scaling = 100
|
||||
self.leading = 0
|
||||
self.render = 0
|
||||
self.rise = 0
|
||||
self.reset()
|
||||
return
|
||||
def __repr__(self):
|
||||
return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
|
||||
' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
|
||||
' render=%r, rise=%r>' %
|
||||
(self.font, self.fontsize, self.matrix,
|
||||
self.charspace, self.wordspace, self.scaling, self.leading,
|
||||
self.render, self.rise))
|
||||
def reset(self):
|
||||
self.matrix = MATRIX_IDENTITY
|
||||
self.linematrix = (0, 0)
|
||||
return
|
||||
|
||||
def __init__(self, rsrc, device, debug=0):
|
||||
self.rsrc = rsrc
|
||||
self.device = device
|
||||
self.debug = debug
|
||||
return
|
||||
|
||||
def initpage(self, ctm):
|
||||
self.fontmap = {}
|
||||
self.xobjmap = {}
|
||||
self.csmap = {}
|
||||
# gstack: stack for graphical states.
|
||||
self.gstack = []
|
||||
self.ctm = ctm
|
||||
self.device.set_ctm(self.ctm)
|
||||
self.textstate = PDFPageInterpreter.TextState()
|
||||
# argstack: stack for command arguments.
|
||||
self.argstack = []
|
||||
# set some global states.
|
||||
self.scs = None
|
||||
self.ncs = None
|
||||
return
|
||||
|
||||
def push(self, obj):
|
||||
self.argstack.append(obj)
|
||||
return
|
||||
|
||||
def pop(self, n):
    '''Removes the last n objects from the argument stack and returns
    them as a list, oldest first.'''
    # argstack[-0:] is the whole list and argstack[:-0] is empty, so
    # without this guard pop(0) returned and discarded every argument.
    if n == 0:
        return []
    x = self.argstack[-n:]
    self.argstack = self.argstack[:-n]
    return x
|
||||
|
||||
def get_current_state(self):
|
||||
return (self.ctm, self.textstate)
|
||||
|
||||
def set_current_state(self, state):
|
||||
(self.ctm, self.textstate) = state
|
||||
self.device.set_ctm(self.ctm)
|
||||
return
|
||||
|
||||
# gsave
|
||||
def do_q(self):
|
||||
self.gstack.append(self.get_current_state())
|
||||
return
|
||||
# grestore
|
||||
def do_Q(self):
|
||||
if self.gstack:
|
||||
self.set_current_state(self.gstack.pop())
|
||||
return
|
||||
|
||||
# concat-matrix
|
||||
def do_cm(self, a1, b1, c1, d1, e1, f1):
|
||||
self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
|
||||
self.device.set_ctm(self.ctm)
|
||||
return
|
||||
|
||||
# setlinewidth
|
||||
def do_w(self, width): return
|
||||
# setlinecap
|
||||
def do_J(self, cap): return
|
||||
# setlinejoin
|
||||
def do_j(self, join): return
|
||||
# setmiterlimit
|
||||
def do_M(self, limit): return
|
||||
# setdash
|
||||
def do_d(self, dash, phase): return
|
||||
# setintent
|
||||
def do_ri(self, intent): return
|
||||
# setflatness
|
||||
def do_i(self, flatness): return
|
||||
# savedict
|
||||
def do_gs(self, name): return
|
||||
|
||||
# moveto
|
||||
def do_m(self, x, y):
    '''m: the operands are accepted and ignored.'''
    return
# lineto
def do_l(self, x, y):
    '''l: the operands are accepted and ignored.'''
    return
# curveto
def do_c(self, x1, y1, x2, y2, x3, y3):
    '''c: the operands are accepted and ignored.'''
    return
# curveto (variant "v": four operands)
def do_v(self, x2, y2, x3, y3):
    '''v: the operands are accepted and ignored.'''
    return
# curveto (variant "y": four operands)
def do_y(self, x1, y1, x3, y3):
    '''y: the operands are accepted and ignored.'''
    return
# closepath
def do_h(self):
    '''h: does nothing.'''
    return
# rectangle
def do_re(self, x, y, w, h):
    '''re: the operands are accepted and ignored.'''
    return
|
||||
|
||||
# stroke
|
||||
def do_S(self):
    '''S: does nothing.'''
    return
# close-and-stroke
def do_s(self):
    '''s: does nothing.'''
    return
# fill
def do_f(self):
    '''f: does nothing.'''
    return
# fill (obsolete)
do_F = do_f
# fill-even-odd
def do_f_a(self):
    '''f*: does nothing.'''
    return
# fill-and-stroke
def do_B(self):
    '''B: does nothing.'''
    return
# fill-and-stroke-even-odd
def do_B_a(self):
    '''B*: does nothing.'''
    return
# close-fill-and-stroke
def do_b(self):
    '''b: does nothing.'''
    return
# close-fill-and-stroke-even-odd
def do_b_a(self):
    '''b*: does nothing.'''
    return
# close-only
def do_n(self):
    '''n: does nothing.'''
    return
# clip
def do_W(self):
    '''W: does nothing.'''
    return
# clip-even-odd
def do_W_a(self):
    '''W*: does nothing.'''
    return
|
||||
|
||||
# setcolorspace-stroking
|
||||
def do_CS(self, name):
    '''
    CS: look the named colour space up in self.csmap and remember it
    as the stroking colour space (None when the name is unknown).
    '''
    key = literal_name(name)
    self.scs = self.csmap.get(key)
    return
# setcolorspace-non-stroking
def do_cs(self, name):
    '''
    cs: look the named colour space up in self.csmap and remember it
    as the non-stroking colour space (None when the name is unknown).
    '''
    key = literal_name(name)
    self.ncs = self.csmap.get(key)
    return
|
||||
# setgray-stroking
|
||||
def do_G(self, gray):
    '''G: select DeviceGray for stroking; the colour value is ignored.'''
    self.do_CS(LITERAL_DEVICE_GRAY)
# setgray-non-stroking
def do_g(self, gray):
    '''g: select DeviceGray for non-stroking; the colour value is ignored.'''
    self.do_cs(LITERAL_DEVICE_GRAY)
# setrgb-stroking
def do_RG(self, r, g, b):
    '''RG: select DeviceRGB for stroking; the colour values are ignored.'''
    self.do_CS(LITERAL_DEVICE_RGB)
# setrgb-non-stroking
def do_rg(self, r, g, b):
    '''rg: select DeviceRGB for non-stroking; the colour values are ignored.'''
    self.do_cs(LITERAL_DEVICE_RGB)
# setcmyk-stroking
def do_K(self, c, m, y, k):
    '''K: select DeviceCMYK for stroking; the colour values are ignored.'''
    self.do_CS(LITERAL_DEVICE_CMYK)
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
    '''k: select DeviceCMYK for non-stroking; the colour values are ignored.'''
    self.do_cs(LITERAL_DEVICE_CMYK)
|
||||
|
||||
# setcolor
|
||||
def do_SCN(self):
    '''
    SCN: discard the colour operands from the argument stack; their
    number is whatever cs_params() reports for the stroking colour space.
    '''
    self.pop(cs_params(self.scs))
def do_scn(self):
    '''
    scn: discard the colour operands from the argument stack; their
    number is whatever cs_params() reports for the non-stroking
    colour space.
    '''
    self.pop(cs_params(self.ncs))
def do_SC(self):
    '''SC: handled exactly like SCN.'''
    self.do_SCN()
def do_sc(self):
    '''sc: handled exactly like scn.'''
    self.do_scn()
|
||||
|
||||
# sharing-name
|
||||
def do_sh(self, name):
    '''sh: the operand is accepted and ignored.'''
    return
|
||||
|
||||
# begin-text
|
||||
def do_BT(self):
    '''BT: start a text object by resetting the text state.'''
    textstate = self.textstate
    textstate.reset()
    return
# end-text
def do_ET(self):
    '''ET: does nothing.'''
    return
|
||||
|
||||
# begin-compat
|
||||
def do_BX(self):
    '''BX: does nothing.'''
    return
# end-compat
def do_EX(self):
    '''EX: does nothing.'''
    return

# marked content operators
def do_MP(self, tag):
    '''MP: the operand is accepted and ignored.'''
    return
def do_DP(self, tag, props):
    '''DP: the operands are accepted and ignored.'''
    return
def do_BMC(self, tag):
    '''BMC: the operand is accepted and ignored.'''
    return
def do_BDC(self, tag, props):
    '''BDC: the operands are accepted and ignored.'''
    return
def do_EMC(self):
    '''EMC: does nothing.'''
    return
|
||||
|
||||
# setcharspace
|
||||
def do_Tc(self, space):
    '''Tc: store the operand as textstate.charspace.'''
    ts = self.textstate
    ts.charspace = space
# setwordspace
def do_Tw(self, space):
    '''Tw: store the operand as textstate.wordspace.'''
    ts = self.textstate
    ts.wordspace = space
# textscale
def do_Tz(self, scale):
    '''Tz: store the operand as textstate.scaling.'''
    ts = self.textstate
    ts.scaling = scale
# setleading
def do_TL(self, leading):
    '''TL: store the operand as textstate.leading.'''
    ts = self.textstate
    ts.leading = leading
|
||||
# selectfont
|
||||
def do_Tf(self, fontid, fontsize):
    '''
    Tf: make the font registered under fontid in self.fontmap the
    current font and store the font size.

    Raises PDFInterpreterError when no font is registered under that
    name; in that case the text state is left unchanged.
    '''
    try:
        font = self.fontmap[literal_name(fontid)]
    except KeyError:
        raise PDFInterpreterError('Undefined font id: %r' % fontid)
    ts = self.textstate
    ts.font = font
    ts.fontsize = fontsize
    return
|
||||
# setrendering
|
||||
def do_Tr(self, render):
    '''Tr: store the operand as textstate.render.'''
    ts = self.textstate
    ts.render = render
# settextrise
def do_Ts(self, rise):
    '''Ts: store the operand as textstate.rise.'''
    ts = self.textstate
    ts.rise = rise
|
||||
|
||||
# text-move
|
||||
def do_Td(self, tx, ty):
    '''
    Td: add (tx, ty) to the translation part of the text matrix and
    reset the in-line offset to (0, 0).

    NOTE(review): the offsets are added to e/f as they are, without
    being scaled by the a..d components of the matrix -- confirm that
    this is intended.
    '''
    ts = self.textstate
    (xx, xy, yx, yy, ox, oy) = ts.matrix
    ts.matrix = (xx, xy, yx, yy, ox+tx, oy+ty)
    ts.linematrix = (0, 0)
# text-move
def do_TD(self, tx, ty):
    '''
    TD: same movement as Td, and additionally stores -ty as the
    leading.
    '''
    ts = self.textstate
    (xx, xy, yx, yy, ox, oy) = ts.matrix
    ts.matrix = (xx, xy, yx, yy, ox+tx, oy+ty)
    ts.leading = -ty
    ts.linematrix = (0, 0)
# textmatrix
def do_Tm(self, a,b,c,d,e,f):
    '''
    Tm: replace the text matrix with the six operands and reset the
    in-line offset to (0, 0).
    '''
    ts = self.textstate
    ts.matrix = (a,b,c,d,e,f)
    ts.linematrix = (0, 0)
|
||||
# nextline
|
||||
def do_T_a(self):
    '''
    T*: move to the start of the next line.

    The vertical offset of the text matrix is reduced by the current
    leading and the in-line offset is reset to (0, 0).
    '''
    (a,b,c,d,e,f) = self.textstate.matrix
    # The leading is kept as a positive distance (do_TD stores -ty for
    # its downward move), so the next line is reached by subtracting
    # it; adding it would move the text up the page.
    self.textstate.matrix = (a,b,c,d,e,f-self.textstate.leading)
    self.textstate.linematrix = (0, 0)
    return
|
||||
|
||||
# show-pos
|
||||
def do_TJ(self, seq):
    '''
    TJ: show a sequence of strings mixed with numeric adjustments.

    The advance width of the whole sequence is computed from the font
    metrics, the adjustments, the character and word spacing and the
    horizontal scaling.  The sequence is handed to the device together
    with the text matrix shifted by the in-line offset, and the in-line
    offset is then advanced by that width (vertically for vertical
    fonts, horizontally otherwise).
    '''
    ts = self.textstate
    font = ts.font
    (a,b,c,d,e,f) = ts.matrix
    (lx,ly) = ts.linematrix
    # separate the strings from the numeric adjustments.
    chunks = []
    adjust = 0
    for item in seq:
        if isinstance(item, str):
            chunks.append(item)
        else:
            adjust = adjust + item
    text = ''.join(chunks)
    glyph_width = (font.string_width(text)-adjust)/1000.0 * ts.fontsize
    char_extra = len(text) * ts.charspace
    word_extra = text.count(' ')*ts.wordspace
    w = (glyph_width + char_extra + word_extra) * ts.scaling / 100.0
    self.device.render_string(ts, (a,b,c,d,e+lx,f+ly), w, seq)
    if font.is_vertical():
        ts.linematrix = (lx, ly+w)
    else:
        ts.linematrix = (lx+w, ly)
    return
|
||||
# show
|
||||
def do_Tj(self, s):
    '''Tj: show one string; handled as a one-element TJ.'''
    seq = [s]
    self.do_TJ(seq)
# quote
def do__q(self, s):
    """': move to the next line (T*), then show the string."""
    self.do_T_a()
    seq = [s]
    self.do_TJ(seq)
|
||||
# doublequote
|
||||
def do__w(self, aw, ac, s):
    '''
    ": set the word spacing to aw and the character spacing to ac,
    then move to the next line and show the string s.
    '''
    self.do_Tw(aw)
    self.do_Tc(ac)
    # The PDF reference defines this operator as "aw Tw ac Tc string '",
    # so the string is shown through the quote operator, which moves to
    # the next line first.
    self.do__q(s)
    return
|
||||
|
||||
# inline image
|
||||
def do_BI(self):
    '''BI: does nothing (never called).'''
    return None
def do_ID(self):
    '''ID: does nothing (never called).'''
    return None
def do_EI(self, obj):
    '''EI: the operand is accepted and ignored.'''
    return None
|
||||
|
||||
# invoke an XObject
|
||||
def do_Do(self, xobjid):
    '''
    Do: invoke the XObject registered under the given name.

    Only XObjects whose Subtype is LITERAL_FORM are processed: their
    content is rendered by a fresh PDFPageInterpreter that shares this
    interpreter's resource manager and device.  Any other subtype is
    silently ignored.

    Raises PDFInterpreterError when the name is not in self.xobjmap.
    '''
    xobjid = literal_name(xobjid)
    try:
        xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
        raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
    if xobj.dic['Subtype'] == LITERAL_FORM:
        if 1 <= self.debug:
            print >>stderr, 'Processing xobj: %r' % xobj
        # NOTE(review): the nested interpreter is created without this
        # interpreter's debug level -- confirm whether that is intended.
        interpreter = PDFPageInterpreter(self.rsrc, self.device)
        # NOTE(review): 'Resources' is read with [] and raises KeyError
        # when the form has no such entry; 'Matrix' falls back to the
        # identity matrix.
        interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],
                                    xobj.dic.get('Matrix', MATRIX_IDENTITY))
    return
|
||||
|
||||
def process_page(self, page):
    '''
    Render one page: its contents are interpreted with its resources
    under the block id 'page-<pageid>'.
    '''
    if self.debug >= 1:
        print >>stderr, 'Processing page: %r' % page
    contid = 'page-%d' % page.pageid
    self.render_contents(contid, page.resources, page.contents)
    return
|
||||
|
||||
def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
    '''
    Interpret a list of content streams as one block.

    contid:    identifier passed to device.begin_block().
    resources: resource dictionary; its Font, ColorSpace, ProcSet and
               XObject entries are registered before execution, other
               keys are ignored.
    contents:  list of content streams, executed in order.
    ctm:       initial transformation matrix passed to initpage().
    '''
    self.initpage(ctm)
    self.device.begin_block(contid)
    # Handle resource declarations.
    for (k,v) in dict_value(resources).iteritems():
        if 1 <= self.debug:
            print >>stderr, 'Resource: %r: %r' % (k,v)
        if k == 'Font':
            # fonts are obtained from the resource manager.
            for (fontid,fontrsrc) in dict_value(v).iteritems():
                self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
        elif k == 'ColorSpace':
            for (csid,csspec) in dict_value(v).iteritems():
                self.csmap[csid] = list_value(csspec)
        elif k == 'ProcSet':
            self.rsrc.get_procset(list_value(v))
        elif k == 'XObject':
            # XObjects are stored as given; do_Do resolves them on use.
            for (xobjid,xobjstrm) in dict_value(v).iteritems():
                self.xobjmap[xobjid] = xobjstrm
    for stream in list_value(contents):
        self.execute(stream_value(stream))
    self.device.end_block()
    return
|
||||
|
||||
def execute(self, stream):
    '''
    Run every operator found in a content stream.

    Non-keyword objects are pushed onto the argument stack.  A keyword
    is dispatched to the method named do_<keyword>, where '*', '"' and
    "'" in the keyword are spelled '_a', '_w' and '_q'.  The number of
    operands popped for an operator is the number of parameters that
    method declares besides self.

    Raises PDFInterpreterError for a keyword without a do_ method.
    '''
    for obj in stream.parse_data(inline=True, debug=self.debug):
        if isinstance(obj, PSKeyword):
            name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
            if hasattr(self, name):
                func = getattr(self, name)
                # declared parameter count, not counting self.
                nargs = func.func_code.co_argcount-1
                if nargs:
                    args = self.pop(nargs)
                    if 1 <= self.debug:
                        print >>stderr, 'exec: %s %r' % (obj.name, args)
                    # with too few operands on the stack the operator
                    # is skipped without an error.
                    if len(args) == nargs:
                        func(*args)
                else:
                    if 1 <= self.debug:
                        print >>stderr, 'exec: %s' % (obj.name)
                    func()
            else:
                raise PDFInterpreterError('unknown operator: %r' % obj.name)
        else:
            self.push(obj)
    return
|
1834
pdfparser.py
1834
pdfparser.py
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,396 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, re
|
||||
stderr = sys.stderr
|
||||
from utils import choplist
|
||||
|
||||
|
||||
## PS Exceptions
|
||||
##
|
||||
class PSException(Exception):
    '''Base class of the exceptions defined in this module.'''
    pass

class PSSyntaxError(PSException):
    '''Syntax error.'''
    pass

class PSTypeError(PSException):
    '''Type error.'''
    pass

class PSValueError(PSException):
    '''Value error.'''
    pass
|
||||
|
||||
|
||||
## PostScript Types
|
||||
##
|
||||
class PSLiteral:

    '''
    A PostScript literal name such as "/Name".

    Caution: do not instantiate this class yourself;
    obtain the object from PSLiteralTable.intern().
    '''

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        text = '/%s' % self.name
        return text
|
||||
|
||||
class PSKeyword:

    '''
    A PostScript keyword such as "showpage".

    Caution: do not instantiate this class yourself;
    obtain the object from PSKeywordTable.intern().
    '''

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        text = self.name
        return text
|
||||
|
||||
class PSSymbolTable:

    '''
    Table of interned symbols (PSLiteral or PSKeyword objects),
    holding one object per distinct name.
    '''

    def __init__(self, classe):
        # classe: callable that builds the symbol object from a name.
        self.classe = classe
        self.dic = {}

    def intern(self, name):
        '''
        Return the symbol registered under name, creating and
        registering it on first use.
        '''
        try:
            return self.dic[name]
        except KeyError:
            pass
        symbol = self.classe(name)
        self.dic[name] = symbol
        return symbol
|
||||
|
||||
# Module-level symbol tables: intern() on these returns a single shared
# PSLiteral / PSKeyword object for each distinct name.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||
|
||||
|
||||
def literal_name(x):
    '''Return the name of a PSLiteral; raise PSTypeError for anything else.'''
    if isinstance(x, PSLiteral):
        return x.name
    raise PSTypeError('literal required: %r' % x)


def keyword_name(x):
    '''Return the name of a PSKeyword; raise PSTypeError for anything else.'''
    if isinstance(x, PSKeyword):
        return x.name
    raise PSTypeError('keyword required: %r' % x)
|
||||
|
||||
|
||||
## PSBaseParser
|
||||
##
|
||||
class PSBaseParser:

    '''
    PostScript parser that performs only basic tokenization.

    Input is read line by line from a seekable file-like object and
    turned into (position, token) pairs by parse().
    '''

    def __init__(self, fp, debug=0):
        '''
        fp:    seekable file-like object to read from.
        debug: verbosity; diagnostics are printed to stderr when it is
               2 or more.
        '''
        self.fp = fp
        self.debug = debug
        # size of the chunks read from fp.
        self.bufsize = 4096
        self.seek(0)
        return

    def __repr__(self):
        return '<PSBaseParser: %r>' % (self.fp,)

    def seek(self, pos):
        '''
        seeks to the given pos and discards all buffered input.
        '''
        if 2 <= self.debug:
            print >>stderr, 'seek:', pos
        self.fp.seek(pos)
        # linepos: offset in fp at which the next line starts.
        self.linepos = pos
        # linebuf/curpos: the current chunk and the read offset in it.
        self.linebuf = None
        self.curpos = 0
        self.line = ''
        return

    EOLCHAR = re.compile(r'[\r\n]')
    def nextline(self):
        '''
        fetches the next line that ends either with \\r, \\n or \\r\\n.
        The line terminator is kept.  Returns '' at the end of input.
        '''
        line = ''
        eol = None
        while 1:
            if not self.linebuf or len(self.linebuf) <= self.curpos:
                # fetch next chunk.
                self.linebuf = self.fp.read(self.bufsize)
                if not self.linebuf:
                    # at EOF.
                    break
                self.curpos = 0
            if eol:
                # a terminator was found in the previous round; look at
                # the following character (possibly in a new chunk).
                c = self.linebuf[self.curpos]
                # handle '\r\n'
                if (eol == '\r' and c == '\n'):
                    line += c
                    self.curpos += 1
                break
            m = self.EOLCHAR.search(self.linebuf, self.curpos)
            if m:
                i = m.end(0)
                line += self.linebuf[self.curpos:i]
                eol = self.linebuf[i-1]
                self.curpos = i
            else:
                # fetch further
                line += self.linebuf[self.curpos:]
                self.linebuf = None
        self.linepos += len(line)
        return line

    def revreadlines(self):
        '''
        fetches lines backward. used to locate trailers.

        Each yielded line starts with the end-of-line character that
        precedes it in the file.  The text before the first end-of-line
        character of the file is not yielded.
        '''
        self.fp.seek(0, 2)
        pos = self.fp.tell()
        # buf: the tail of a line whose start lies in an earlier chunk.
        buf = ''
        while 0 < pos:
            prevpos = pos
            pos = max(0, pos-self.bufsize)
            self.fp.seek(pos)
            # read only up to the chunk handled before, so that the
            # last (short) chunk does not repeat bytes already seen.
            s = self.fp.read(prevpos-pos)
            if not s: break
            while 1:
                n = max(s.rfind('\r'), s.rfind('\n'))
                if n == -1:
                    buf = s + buf
                    break
                # the remainder of this chunk comes first, followed by
                # what was carried over from the chunk after it.
                yield s[n:]+buf
                s = s[:n]
                buf = ''
        return

    SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
    TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
    LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
    # digits with an optional fraction, or a fraction alone (".5").
    NUMBER = re.compile(r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)$')
    STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
    # octal escape, backslash + line end (continuation), other escape.
    STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\\r\n|\\[\r\n]|\\.')
    STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
    STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
    # single-letter escapes that stand for a control character.
    STRING_ESCAPES = { 'n':'\n', 'r':'\r', 't':'\t', 'b':'\b', 'f':'\f' }

    def parse(self):
        '''
        Yields (pos, token) pairs for the basic tokens: keywords,
        literals, strings, numbers and booleans.  pos is the offset of
        the token in the input.  Comments are skipped.
        Nested objects (i.e. arrays and dictionaries) are not handled:
        their delimiters are yielded as keywords.
        Parentheses nested inside a string are not supported; a string
        ends at the first unescaped ")".
        Raises PSSyntaxError for an unterminated string.
        '''
        def convesc(m):
            x = m.group(0)[1:]
            if x[0] in '01234567':
                # octal character code; overflow above 8 bits is dropped.
                return chr(int(x, 8) & 255)
            if x[0] in '\r\n':
                # backslash at the end of a line: line continuation.
                return ''
            # known control escapes; any other character stands for itself.
            return self.STRING_ESCAPES.get(x, x)
        def convhex(m1):
            return chr(int(m1.group(0), 16))

        while 1:
            # do not strip line! we need to distinguish last '\n' or '\r'
            linepos0 = self.linepos
            self.line = self.nextline()
            if not self.line: break
            if 2 <= self.debug:
                print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
            # do this before removing comment
            if self.line.startswith('%%EOF'): break
            charpos = 0

            # tokenize
            while 1:
                m = self.TOKEN.search(self.line, charpos)
                if not m: break
                t = m.group(0)
                pos = linepos0 + m.start(0)
                charpos = m.end(0)

                if t == '%':
                    # skip comment
                    if 2 <= self.debug:
                        print >>stderr, 'comment: %r' % self.line[charpos:]
                    break

                elif t == '/':
                    # literal object
                    mn = self.LITERAL.match(self.line, charpos)
                    if mn:
                        name = mn.group(0)
                        charpos = mn.end(0)
                    else:
                        # a slash that is not followed by any name
                        # character denotes the empty name.
                        name = ''
                    lit = PSLiteralTable.intern(name)
                    yield (pos, lit)
                    if 2 <= self.debug:
                        print >>stderr, 'name: %r' % lit

                elif t == '(':
                    # normal string object
                    s = ''
                    while 1:
                        ms = self.STRING_NORM.match(self.line, charpos)
                        if not ms: break
                        s1 = ms.group(0)
                        charpos = ms.end(0)
                        if len(s1) == 1 and s1[-1] == '\\':
                            s += s1[-1:]
                            # keep the offset in step with the line
                            # that is about to be read.
                            linepos0 = self.linepos
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        elif charpos == len(self.line):
                            # the string continues on the next line.
                            s += s1
                            linepos0 = self.linepos
                            self.line = self.nextline()
                            if not self.line:
                                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                    (self.linepos, self.line))
                            charpos = 0
                        else:
                            s += s1
                            break
                    if self.line[charpos] != ')':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    s = self.STRING_NORM_SUB.sub(convesc, s)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)

                elif t == '<':
                    # hex string object
                    hexdigits = ''
                    while 1:
                        ms = self.STRING_HEX.match(self.line, charpos)
                        if ms:
                            hexdigits += ms.group(0)
                            charpos = ms.end(0)
                        if charpos < len(self.line): break
                        # the string continues on the next line.
                        linepos0 = self.linepos
                        self.line = self.nextline()
                        if not self.line:
                            raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                                (self.linepos, self.line))
                        charpos = 0
                    if self.line[charpos] != '>':
                        raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                            (self.linepos, self.line))
                    charpos += 1
                    # white space between the digits is not significant.
                    hexdigits = ''.join(hexdigits.split())
                    if len(hexdigits) % 2:
                        # a missing final digit is taken to be 0.
                        hexdigits += '0'
                    s = self.STRING_HEX_SUB.sub(convhex, hexdigits)
                    if 2 <= self.debug:
                        print >>stderr, 'str: %r' % s
                    yield (pos, s)

                elif self.NUMBER.match(t):
                    # number
                    if '.' in t:
                        n = float(t)
                    else:
                        n = int(t)
                    if 2 <= self.debug:
                        print >>stderr, 'number: %r' % n
                    yield (pos, n)

                elif t in ('true','false'):
                    # boolean
                    if 2 <= self.debug:
                        print >>stderr, 'boolean: %r' % t
                    yield (pos, (t == 'true'))

                else:
                    # other token
                    if 2 <= self.debug:
                        print >>stderr, 'keyword: %r' % t
                    yield (pos, PSKeywordTable.intern(t))

        return
|
||||
|
||||
|
||||
## PSStackParser
|
||||
##
|
||||
class PSStackParser(PSBaseParser):

    '''
    PostScript parser that recognizes compound objects
    such as arrays and dictionaries.
    '''

    def __init__(self, fp, debug=0):
        PSBaseParser.__init__(self, fp, debug=debug)
        # context: stack of (type, enclosing partobj) pairs, one for
        # each compound object that is currently open.
        self.context = []
        # partobj: the objects collected so far for the innermost open
        # object; it is None until parse() has started.
        self.partobj = None
        return

    def do_token(self, pos, token):
        '''
        Handles special tokens.
        Returns true if the token denotes the end of an object.
        This implementation ignores every token and returns False.
        '''
        return False

    def push(self, obj):
        '''
        Push an object to the stack.
        '''
        self.partobj.append(obj)
        return

    def pop(self, n):
        '''
        Pop N objects from the stack and return them as a list.
        Raises PSSyntaxError when fewer than N objects are available.
        Popping zero (or fewer) objects returns an empty list and
        leaves the stack untouched.
        '''
        if n <= 0:
            # partobj[-0:] is the whole list and partobj[:-0] is empty,
            # so pop(0) would otherwise return and drop everything.
            return []
        if len(self.partobj) < n:
            raise PSSyntaxError('stack too short < %d' % n)
        r = self.partobj[-n:]
        self.partobj = self.partobj[:-n]
        return r

    def popall(self):
        '''
        Discards all the objects on the stack.
        '''
        self.partobj = []
        return

    def parse(self):
        '''
        Returns a list of objects: keywords, literals, strings,
        numbers, arrays and dictionaries. Arrays and dictionaries
        are represented as Python sequence and dictionaries.

        Parsing stops at the end of the input or as soon as do_token()
        returns true for a keyword.
        Raises PSTypeError for mismatched delimiters, for a dictionary
        with an odd number of items and for a dictionary key that is
        not a literal.
        '''

        def startobj(type):
            # open a nested object, remembering the enclosing one.
            self.context.append((type, self.partobj))
            self.partobj = []
            return

        def endobj(type1):
            # close the innermost object and return its items.
            assert self.context
            obj = self.partobj
            (type0, self.partobj) = self.context.pop()
            if type0 != type1:
                raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
                                  (type0, self.partobj, type1, obj))
            return obj

        startobj('o')

        for (pos,t) in PSBaseParser.parse(self):
            if isinstance(t, int) or isinstance(t, float):
                self.push(t)
            elif isinstance(t, str):
                self.push(t)
            elif isinstance(t, PSLiteral):
                self.push(t)
            else:
                c = keyword_name(t)
                if c == '{' or c == '}':
                    self.push(t)
                elif c == '[':
                    # begin array
                    if 2 <= self.debug:
                        print >>stderr, 'start array'
                    startobj('a')
                elif c == ']':
                    # end array
                    a = endobj('a')
                    if 2 <= self.debug:
                        print >>stderr, 'end array: %r' % a
                    self.push(a)
                elif c == '<<':
                    # begin dictionary
                    if 2 <= self.debug:
                        print >>stderr, 'start dict'
                    startobj('d')
                elif c == '>>':
                    # end dictionary
                    objs = endobj('d')
                    if len(objs) % 2 != 0:
                        raise PSTypeError('invalid dictionary construct: %r' % objs)
                    # the items alternate between key and value.
                    d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
                    if 2 <= self.debug:
                        print >>stderr, 'end dict: %r' % d
                    self.push(d)
                elif self.do_token(pos, t):
                    break

        return endobj('o')
|
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
## Utilities
|
||||
##
|
||||
def choplist(n, seq):
    '''
    Yields the elements of seq in consecutive tuples of n.
    Trailing elements that do not fill a whole tuple are dropped.
    '''
    group = []
    for item in seq:
        group.append(item)
        if len(group) != n:
            continue
        yield tuple(group)
        group = []
    return
|
||||
|
||||
def nunpack(s, default=0):
    '''
    Unpacks up to 4 bytes as one big-endian unsigned integer.

    s:       byte string of length 0 to 4.
    default: value returned for an empty string.

    Raises TypeError when s is longer than 4 bytes.
    '''
    # imported here because this module has no import of its own for it.
    from struct import unpack
    l = len(s)
    if not l:
        return default
    elif l == 1:
        return ord(s)
    elif l == 2:
        return unpack('>H', s)[0]
    elif l == 3:
        # pad to four bytes so that it can be read as a 32-bit value.
        return unpack('>L', '\x00'+s)[0]
    elif l == 4:
        return unpack('>L', s)[0]
    else:
        # this error used to be returned to the caller instead of raised.
        raise TypeError('invalid length: %d' % l)
|
Loading…
Reference in New Issue