tmp
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
24bdd33557
commit
c41c279321
|
@ -2,10 +2,53 @@
|
||||||
import sys
|
import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
|
from pdflib.pdffont import PDFUnicodeNotDefined
|
||||||
from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
||||||
|
|
||||||
|
|
||||||
|
## PDFDevice
|
||||||
|
##
|
||||||
|
class PDFDevice(object):
|
||||||
|
|
||||||
|
debug = 0
|
||||||
|
|
||||||
|
def __init__(self, rsrc):
|
||||||
|
self.rsrc = rsrc
|
||||||
|
self.ctm = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFDevice>'
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
def set_ctm(self, ctm):
|
||||||
|
self.ctm = ctm
|
||||||
|
return
|
||||||
|
|
||||||
|
def begin_tag(self, tag, props=None):
|
||||||
|
return
|
||||||
|
def end_tag(self):
|
||||||
|
return
|
||||||
|
def do_tag(self, tag, props=None):
|
||||||
|
return
|
||||||
|
|
||||||
|
def begin_page(self, page):
|
||||||
|
return
|
||||||
|
def end_page(self, page):
|
||||||
|
return
|
||||||
|
def begin_figure(self, name, bbox):
|
||||||
|
return
|
||||||
|
def end_figure(self, name):
|
||||||
|
return
|
||||||
|
|
||||||
|
def render_string(self, textstate, textmatrix, seq):
|
||||||
|
raise NotImplementedError
|
||||||
|
def render_image(self, stream, size, matrix):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
## PageItem
|
## PageItem
|
||||||
##
|
##
|
||||||
class PageItem(object):
|
class PageItem(object):
|
||||||
|
|
|
@ -3,10 +3,10 @@ import sys
|
||||||
stdout = sys.stdout
|
stdout = sys.stdout
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||||
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
|
from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
PDFPageInterpreter, PDFUnicodeNotDefined
|
from pdflib.pdffont import PDFUnicodeNotDefined
|
||||||
from pdflib.cmap import CMapDB
|
from pdflib.cmap import CMapDB
|
||||||
from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
|
from pdflib.page import PDFDevice, PageItem, FigureItem, TextItem, PageAggregator
|
||||||
|
|
||||||
|
|
||||||
def enc(x, codec):
|
def enc(x, codec):
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
stderr = sys.stderr
|
||||||
|
from pdflib.psparser import PSLiteralTable
|
||||||
|
|
||||||
|
|
||||||
|
## ColorSpace
|
||||||
|
##
|
||||||
|
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||||
|
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||||
|
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||||
|
|
||||||
|
class ColorSpace(object):
|
||||||
|
|
||||||
|
def __init__(self, name, ncomponents):
|
||||||
|
self.name = name
|
||||||
|
self.ncomponents = ncomponents
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||||
|
|
||||||
|
|
||||||
|
PREDEFINED_COLORSPACE = dict(
|
||||||
|
(name, ColorSpace(name,n)) for (name,n) in {
|
||||||
|
'CalRGB': 3,
|
||||||
|
'CalGray': 1,
|
||||||
|
'Lab': 3,
|
||||||
|
'DeviceRGB': 3,
|
||||||
|
'DeviceCMYK': 4,
|
||||||
|
'DeviceGray': 1,
|
||||||
|
'Separation': 1,
|
||||||
|
'Indexed': 1,
|
||||||
|
'Pattern': 1,
|
||||||
|
}.iteritems())
|
|
@ -0,0 +1,341 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys
|
||||||
|
stderr = sys.stderr
|
||||||
|
from struct import pack, unpack
|
||||||
|
try:
|
||||||
|
from cStringIO import StringIO
|
||||||
|
except ImportError:
|
||||||
|
from StringIO import StringIO
|
||||||
|
from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
|
||||||
|
literal_name, keyword_name, STRICT
|
||||||
|
from pdflib.pdftypes import PDFException, \
|
||||||
|
resolve1, int_value, float_value, num_value, \
|
||||||
|
str_value, list_value, dict_value, stream_value
|
||||||
|
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||||
|
|
||||||
|
|
||||||
|
## Fonts
|
||||||
|
##
|
||||||
|
|
||||||
|
class PDFFontError(PDFException): pass
|
||||||
|
class PDFUnicodeNotDefined(PDFFontError): pass
|
||||||
|
|
||||||
|
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
||||||
|
|
||||||
|
|
||||||
|
# PDFFont
|
||||||
|
class PDFFont(object):
|
||||||
|
|
||||||
|
def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
|
||||||
|
self.descriptor = descriptor
|
||||||
|
self.widths = widths
|
||||||
|
self.fontname = descriptor.get('FontName', 'unknown')
|
||||||
|
if isinstance(self.fontname, PSLiteral):
|
||||||
|
self.fontname = literal_name(self.fontname)
|
||||||
|
self.ascent = num_value(descriptor.get('Ascent', 0))
|
||||||
|
self.descent = num_value(descriptor.get('Descent', 0))
|
||||||
|
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||||
|
self.leading = num_value(descriptor.get('Leading', 0))
|
||||||
|
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||||
|
self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFFont>'
|
||||||
|
|
||||||
|
def is_vertical(self):
|
||||||
|
return False
|
||||||
|
|
||||||
|
def is_multibyte(self):
|
||||||
|
return False
|
||||||
|
|
||||||
|
def decode(self, bytes):
|
||||||
|
return map(ord, bytes)
|
||||||
|
|
||||||
|
def char_width(self, cid):
|
||||||
|
return self.widths.get(cid, self.default_width)
|
||||||
|
|
||||||
|
def char_disp(self, cid):
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def string_width(self, s):
|
||||||
|
return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||||
|
|
||||||
|
# PDFSimpleFont
|
||||||
|
class PDFSimpleFont(PDFFont):
|
||||||
|
|
||||||
|
def __init__(self, descriptor, widths, spec, font_matrix=None):
|
||||||
|
# Font encoding is specified either by a name of
|
||||||
|
# built-in encoding or a dictionary that describes
|
||||||
|
# the differences.
|
||||||
|
if 'Encoding' in spec:
|
||||||
|
encoding = resolve1(spec['Encoding'])
|
||||||
|
else:
|
||||||
|
encoding = LITERAL_STANDARD_ENCODING
|
||||||
|
if isinstance(encoding, dict):
|
||||||
|
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
|
||||||
|
diff = list_value(encoding.get('Differences', None))
|
||||||
|
self.encoding = EncodingDB.get_encoding(name, diff)
|
||||||
|
else:
|
||||||
|
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
|
||||||
|
self.ucs2_cmap = None
|
||||||
|
if 'ToUnicode' in spec:
|
||||||
|
strm = stream_value(spec['ToUnicode'])
|
||||||
|
self.ucs2_cmap = CMap()
|
||||||
|
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||||
|
PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
|
||||||
|
return
|
||||||
|
|
||||||
|
def to_unicode(self, cid):
|
||||||
|
if not self.ucs2_cmap:
|
||||||
|
try:
|
||||||
|
return self.encoding[cid]
|
||||||
|
except KeyError:
|
||||||
|
raise PDFUnicodeNotDefined(None, cid)
|
||||||
|
code = self.ucs2_cmap.tocode(cid)
|
||||||
|
if not code:
|
||||||
|
raise PDFUnicodeNotDefined(None, cid)
|
||||||
|
chars = unpack('>%dH' % (len(code)/2), code)
|
||||||
|
return ''.join( unichr(c) for c in chars )
|
||||||
|
|
||||||
|
|
||||||
|
# PDFType1Font
|
||||||
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
|
def __init__(self, spec):
|
||||||
|
try:
|
||||||
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('BaseFont is missing')
|
||||||
|
self.basefont = 'unknown'
|
||||||
|
try:
|
||||||
|
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
||||||
|
except KeyError:
|
||||||
|
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
||||||
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
|
lastchar = int_value(spec.get('LastChar', 255))
|
||||||
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
|
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
||||||
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
# PDFTrueTypeFont
|
||||||
|
class PDFTrueTypeFont(PDFType1Font):
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||||
|
|
||||||
|
# PDFType3Font
|
||||||
|
class PDFType3Font(PDFSimpleFont):
|
||||||
|
|
||||||
|
def __init__(self, spec):
|
||||||
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
|
lastchar = int_value(spec.get('LastChar', 0))
|
||||||
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
|
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
|
||||||
|
if 'FontDescriptor' in spec:
|
||||||
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
|
else:
|
||||||
|
descriptor = {'FontName':spec.get('Name'),
|
||||||
|
'Ascent':0, 'Descent':0,
|
||||||
|
'FontBBox':spec['FontBBox']}
|
||||||
|
PDFSimpleFont.__init__(self, descriptor, widths, spec,
|
||||||
|
font_matrix=tuple(list_value(spec.get('FontMatrix'))))
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFType3Font>'
|
||||||
|
|
||||||
|
|
||||||
|
# PDFCIDFont
|
||||||
|
|
||||||
|
## TrueTypeFont
|
||||||
|
##
|
||||||
|
class TrueTypeFont(object):
|
||||||
|
|
||||||
|
class CMapNotFound(Exception): pass
|
||||||
|
|
||||||
|
def __init__(self, name, fp):
|
||||||
|
self.name = name
|
||||||
|
self.fp = fp
|
||||||
|
self.tables = {}
|
||||||
|
fonttype = fp.read(4)
|
||||||
|
(ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
|
||||||
|
for i in xrange(ntables):
|
||||||
|
(name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
|
||||||
|
self.tables[name] = (offset, length)
|
||||||
|
return
|
||||||
|
|
||||||
|
def create_cmap(self):
|
||||||
|
if 'cmap' not in self.tables:
|
||||||
|
raise TrueTypeFont.CMapNotFound
|
||||||
|
(base_offset, length) = self.tables['cmap']
|
||||||
|
fp = self.fp
|
||||||
|
fp.seek(base_offset)
|
||||||
|
(version, nsubtables) = unpack('>HH', fp.read(4))
|
||||||
|
subtables = []
|
||||||
|
for i in xrange(nsubtables):
|
||||||
|
subtables.append(unpack('>HHL', fp.read(8)))
|
||||||
|
char2gid = {}
|
||||||
|
# Only supports subtable type 0, 2 and 4.
|
||||||
|
for (_1, _2, st_offset) in subtables:
|
||||||
|
fp.seek(base_offset+st_offset)
|
||||||
|
(fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
|
||||||
|
if fmttype == 0:
|
||||||
|
char2gid.update(enumerate(unpack('>256B', fp.read(256))))
|
||||||
|
elif fmttype == 2:
|
||||||
|
subheaderkeys = unpack('>256H', fp.read(512))
|
||||||
|
firstbytes = [0]*8192
|
||||||
|
for (i,k) in enumerate(subheaderkeys):
|
||||||
|
firstbytes[k/8] = i
|
||||||
|
nhdrs = max(subheaderkeys)/8 + 1
|
||||||
|
hdrs = []
|
||||||
|
for i in xrange(nhdrs):
|
||||||
|
(firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
|
||||||
|
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
|
||||||
|
for (i,firstcode,entcount,delta,pos) in hdrs:
|
||||||
|
if not entcount: continue
|
||||||
|
first = firstcode + (firstbytes[i] << 8)
|
||||||
|
fp.seek(pos)
|
||||||
|
for c in xrange(entcount):
|
||||||
|
gid = unpack('>H', fp.read(2))
|
||||||
|
if gid:
|
||||||
|
gid += delta
|
||||||
|
char2gid[first+c] = gid
|
||||||
|
elif fmttype == 4:
|
||||||
|
(segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
|
||||||
|
segcount /= 2
|
||||||
|
ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||||
|
fp.read(2)
|
||||||
|
scs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||||
|
idds = unpack('>%dh' % segcount, fp.read(2*segcount))
|
||||||
|
pos = fp.tell()
|
||||||
|
idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||||
|
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
|
||||||
|
if idr:
|
||||||
|
fp.seek(pos+idr)
|
||||||
|
for c in xrange(sc, ec+1):
|
||||||
|
char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
|
||||||
|
else:
|
||||||
|
for c in xrange(sc, ec+1):
|
||||||
|
char2gid[c] = (c + idd) & 0xffff
|
||||||
|
gid2char = dict( (gid, pack('>H', char))
|
||||||
|
for (char,gid) in char2gid.iteritems() )
|
||||||
|
cmapname = 'Adobe-Identity-UCS-%s' % self.name
|
||||||
|
return CMap(cmapname).update(char2gid, gid2char)
|
||||||
|
|
||||||
|
class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
|
def __init__(self, spec):
|
||||||
|
try:
|
||||||
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('BaseFont is missing')
|
||||||
|
self.basefont = 'unknown'
|
||||||
|
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||||
|
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
||||||
|
self.cidsysteminfo.get('Ordering', 'unknown'))
|
||||||
|
try:
|
||||||
|
name = literal_name(spec['Encoding'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('Encoding is unspecified')
|
||||||
|
name = 'unknown'
|
||||||
|
try:
|
||||||
|
self.cmap = CMapDB.get_cmap(name, strict=STRICT)
|
||||||
|
except CMapDB.CMapNotFound, e:
|
||||||
|
raise PDFFontError(e)
|
||||||
|
try:
|
||||||
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('FontDescriptor is missing')
|
||||||
|
descriptor = {}
|
||||||
|
ttf = None
|
||||||
|
if 'FontFile2' in descriptor:
|
||||||
|
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
||||||
|
ttf = TrueTypeFont(self.basefont,
|
||||||
|
StringIO(self.fontfile.get_data()))
|
||||||
|
self.ucs2_cmap = None
|
||||||
|
if 'ToUnicode' in spec:
|
||||||
|
strm = stream_value(spec['ToUnicode'])
|
||||||
|
self.ucs2_cmap = CMap()
|
||||||
|
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||||
|
elif self.cidcoding == 'Adobe-Identity':
|
||||||
|
if ttf:
|
||||||
|
try:
|
||||||
|
self.ucs2_cmap = ttf.create_cmap()
|
||||||
|
except TrueTypeFont.CMapNotFound:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
|
||||||
|
strict=STRICT)
|
||||||
|
except CMapDB.CMapNotFound, e:
|
||||||
|
raise PDFFontError(e)
|
||||||
|
|
||||||
|
def get_width(seq):
|
||||||
|
dic = {}
|
||||||
|
char1 = char2 = None
|
||||||
|
for v in seq:
|
||||||
|
if char1 == None:
|
||||||
|
char1 = v
|
||||||
|
elif char2 == None and isinstance(v, int):
|
||||||
|
char2 = v
|
||||||
|
else:
|
||||||
|
if char2 == None:
|
||||||
|
for (i,w) in enumerate(v):
|
||||||
|
dic[char1+i] = w
|
||||||
|
else:
|
||||||
|
for i in xrange(char1, char2+1):
|
||||||
|
dic[i] = v
|
||||||
|
char1 = char2 = None
|
||||||
|
return dic
|
||||||
|
self.vertical = self.cmap.is_vertical()
|
||||||
|
if self.vertical:
|
||||||
|
# writing mode: vertical
|
||||||
|
dic = get_width(list_value(spec.get('W2', [])))
|
||||||
|
widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
|
||||||
|
self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
|
||||||
|
(d,w) = spec.get('DW2', [880, -1000])
|
||||||
|
default_width = w
|
||||||
|
self.default_disp = d
|
||||||
|
else:
|
||||||
|
# writing mode: horizontal
|
||||||
|
widths = get_width(list_value(spec.get('W', [])))
|
||||||
|
self.disps = {}
|
||||||
|
default_width = spec.get('DW', 1000)
|
||||||
|
self.default_disp = 0
|
||||||
|
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
|
||||||
|
return
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
|
||||||
|
|
||||||
|
def is_vertical(self):
|
||||||
|
return self.vertical
|
||||||
|
|
||||||
|
def is_multibyte(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
def decode(self, bytes):
|
||||||
|
return self.cmap.decode(bytes)
|
||||||
|
|
||||||
|
def char_disp(self, cid):
|
||||||
|
return self.disps.get(cid, self.default_disp)
|
||||||
|
|
||||||
|
def to_unicode(self, cid):
|
||||||
|
if not self.ucs2_cmap:
|
||||||
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
|
code = self.ucs2_cmap.tocode(cid)
|
||||||
|
if not code:
|
||||||
|
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||||
|
chars = unpack('>%dH' % (len(code)/2), code)
|
||||||
|
return ''.join( unichr(c) for c in chars )
|
||||||
|
|
||||||
|
|
|
@ -6,33 +6,22 @@ try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
from pdflib.psparser import PSException, PSTypeError, PSEOF, \
|
||||||
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
PSStackParser, PSKeyword, STRICT
|
||||||
from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
|
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||||
int_value, float_value, num_value, \
|
resolve1, int_value, float_value, num_value, \
|
||||||
str_value, list_value, dict_value, stream_value
|
str_value, list_value, dict_value, stream_value
|
||||||
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
|
||||||
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
|
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||||
|
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
||||||
|
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||||
|
|
||||||
|
|
||||||
## Exceptions
|
## Exceptions
|
||||||
##
|
##
|
||||||
class PDFResourceError(PDFException): pass
|
class PDFResourceError(PDFException): pass
|
||||||
class PDFInterpreterError(PDFException): pass
|
class PDFInterpreterError(PDFException): pass
|
||||||
class PDFFontError(PDFException): pass
|
|
||||||
class PDFUnicodeNotDefined(PDFFontError): pass
|
|
||||||
|
|
||||||
|
|
||||||
## ColorSpace
|
|
||||||
##
|
|
||||||
class ColorSpace(object):
|
|
||||||
def __init__(self, name, ncomponents):
|
|
||||||
self.name = name
|
|
||||||
self.ncomponents = ncomponents
|
|
||||||
return
|
|
||||||
def __repr__(self):
|
|
||||||
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
|
||||||
|
|
||||||
|
|
||||||
## Constants
|
## Constants
|
||||||
|
@ -42,344 +31,6 @@ LITERAL_TEXT = PSLiteralTable.intern('Text')
|
||||||
LITERAL_FONT = PSLiteralTable.intern('Font')
|
LITERAL_FONT = PSLiteralTable.intern('Font')
|
||||||
LITERAL_FORM = PSLiteralTable.intern('Form')
|
LITERAL_FORM = PSLiteralTable.intern('Form')
|
||||||
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
||||||
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
|
||||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
|
||||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
|
||||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
|
||||||
KEYWORD_BI = PSKeywordTable.intern('BI')
|
|
||||||
KEYWORD_ID = PSKeywordTable.intern('ID')
|
|
||||||
KEYWORD_EI = PSKeywordTable.intern('EI')
|
|
||||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
|
||||||
|
|
||||||
PREDEFINED_COLORSPACE = dict(
|
|
||||||
(name, ColorSpace(name,n)) for (name,n) in {
|
|
||||||
'CalRGB': 3,
|
|
||||||
'CalGray': 1,
|
|
||||||
'Lab': 3,
|
|
||||||
'DeviceRGB': 3,
|
|
||||||
'DeviceCMYK': 4,
|
|
||||||
'DeviceGray': 1,
|
|
||||||
'Separation': 1,
|
|
||||||
'Indexed': 1,
|
|
||||||
'Pattern': 1,
|
|
||||||
}.iteritems())
|
|
||||||
|
|
||||||
|
|
||||||
## Fonts
|
|
||||||
##
|
|
||||||
|
|
||||||
# PDFFont
|
|
||||||
class PDFFont(object):
|
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, default_width=None):
|
|
||||||
self.descriptor = descriptor
|
|
||||||
self.widths = widths
|
|
||||||
self.fontname = descriptor.get('FontName', 'unknown')
|
|
||||||
if isinstance(self.fontname, PSLiteral):
|
|
||||||
self.fontname = literal_name(self.fontname)
|
|
||||||
self.ascent = num_value(descriptor.get('Ascent', 0))
|
|
||||||
self.descent = num_value(descriptor.get('Descent', 0))
|
|
||||||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
|
||||||
self.leading = num_value(descriptor.get('Leading', 0))
|
|
||||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFFont>'
|
|
||||||
|
|
||||||
def is_vertical(self):
|
|
||||||
return False
|
|
||||||
|
|
||||||
def is_multibyte(self):
|
|
||||||
return False
|
|
||||||
|
|
||||||
def decode(self, bytes):
|
|
||||||
return map(ord, bytes)
|
|
||||||
|
|
||||||
def char_width(self, cid):
|
|
||||||
return self.widths.get(cid, self.default_width)
|
|
||||||
|
|
||||||
def char_disp(self, cid):
|
|
||||||
return 0
|
|
||||||
|
|
||||||
def string_width(self, s):
|
|
||||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
|
||||||
|
|
||||||
|
|
||||||
# PDFSimpleFont
|
|
||||||
class PDFSimpleFont(PDFFont):
|
|
||||||
|
|
||||||
def __init__(self, descriptor, widths, spec):
|
|
||||||
# Font encoding is specified either by a name of
|
|
||||||
# built-in encoding or a dictionary that describes
|
|
||||||
# the differences.
|
|
||||||
if 'Encoding' in spec:
|
|
||||||
encoding = resolve1(spec['Encoding'])
|
|
||||||
else:
|
|
||||||
encoding = LITERAL_STANDARD_ENCODING
|
|
||||||
if isinstance(encoding, dict):
|
|
||||||
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
|
|
||||||
diff = list_value(encoding.get('Differences', None))
|
|
||||||
self.encoding = EncodingDB.get_encoding(name, diff)
|
|
||||||
else:
|
|
||||||
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
|
|
||||||
self.ucs2_cmap = None
|
|
||||||
if 'ToUnicode' in spec:
|
|
||||||
strm = stream_value(spec['ToUnicode'])
|
|
||||||
self.ucs2_cmap = CMap()
|
|
||||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
|
||||||
PDFFont.__init__(self, descriptor, widths)
|
|
||||||
return
|
|
||||||
|
|
||||||
def to_unicode(self, cid):
|
|
||||||
if not self.ucs2_cmap:
|
|
||||||
try:
|
|
||||||
return self.encoding[cid]
|
|
||||||
except KeyError:
|
|
||||||
raise PDFUnicodeNotDefined(None, cid)
|
|
||||||
code = self.ucs2_cmap.tocode(cid)
|
|
||||||
if not code:
|
|
||||||
raise PDFUnicodeNotDefined(None, cid)
|
|
||||||
chars = unpack('>%dH' % (len(code)/2), code)
|
|
||||||
return ''.join( unichr(c) for c in chars )
|
|
||||||
|
|
||||||
|
|
||||||
# PDFType1Font
|
|
||||||
class PDFType1Font(PDFSimpleFont):
|
|
||||||
|
|
||||||
def __init__(self, spec):
|
|
||||||
try:
|
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
|
||||||
except KeyError:
|
|
||||||
if STRICT:
|
|
||||||
raise PDFFontError('BaseFont is missing')
|
|
||||||
self.basefont = 'unknown'
|
|
||||||
try:
|
|
||||||
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
|
||||||
except KeyError:
|
|
||||||
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
|
||||||
lastchar = int_value(spec.get('LastChar', 255))
|
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
|
||||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
|
||||||
|
|
||||||
# PDFTrueTypeFont
|
|
||||||
class PDFTrueTypeFont(PDFType1Font):
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
|
||||||
|
|
||||||
# PDFType3Font
|
|
||||||
class PDFType3Font(PDFSimpleFont):
|
|
||||||
def __init__(self, spec):
|
|
||||||
firstchar = int_value(spec.get('FirstChar', 0))
|
|
||||||
lastchar = int_value(spec.get('LastChar', 0))
|
|
||||||
widths = list_value(spec.get('Widths', [0]*256))
|
|
||||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
|
|
||||||
if 'FontDescriptor' in spec:
|
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
|
||||||
else:
|
|
||||||
descriptor = {'FontName':spec.get('Name'),
|
|
||||||
'Ascent':0, 'Descent':0,
|
|
||||||
'FontBBox':spec['FontBBox']}
|
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFType3Font>'
|
|
||||||
|
|
||||||
|
|
||||||
# PDFCIDFont
|
|
||||||
|
|
||||||
## TrueTypeFont
|
|
||||||
##
|
|
||||||
class TrueTypeFont(object):
|
|
||||||
|
|
||||||
class CMapNotFound(Exception): pass
|
|
||||||
|
|
||||||
def __init__(self, name, fp):
|
|
||||||
self.name = name
|
|
||||||
self.fp = fp
|
|
||||||
self.tables = {}
|
|
||||||
fonttype = fp.read(4)
|
|
||||||
(ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
|
|
||||||
for i in xrange(ntables):
|
|
||||||
(name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
|
|
||||||
self.tables[name] = (offset, length)
|
|
||||||
return
|
|
||||||
|
|
||||||
def create_cmap(self):
|
|
||||||
if 'cmap' not in self.tables:
|
|
||||||
raise TrueTypeFont.CMapNotFound
|
|
||||||
(base_offset, length) = self.tables['cmap']
|
|
||||||
fp = self.fp
|
|
||||||
fp.seek(base_offset)
|
|
||||||
(version, nsubtables) = unpack('>HH', fp.read(4))
|
|
||||||
subtables = []
|
|
||||||
for i in xrange(nsubtables):
|
|
||||||
subtables.append(unpack('>HHL', fp.read(8)))
|
|
||||||
char2gid = {}
|
|
||||||
# Only supports subtable type 0, 2 and 4.
|
|
||||||
for (_1, _2, st_offset) in subtables:
|
|
||||||
fp.seek(base_offset+st_offset)
|
|
||||||
(fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
|
|
||||||
if fmttype == 0:
|
|
||||||
char2gid.update(enumerate(unpack('>256B', fp.read(256))))
|
|
||||||
elif fmttype == 2:
|
|
||||||
subheaderkeys = unpack('>256H', fp.read(512))
|
|
||||||
firstbytes = [0]*8192
|
|
||||||
for (i,k) in enumerate(subheaderkeys):
|
|
||||||
firstbytes[k/8] = i
|
|
||||||
nhdrs = max(subheaderkeys)/8 + 1
|
|
||||||
hdrs = []
|
|
||||||
for i in xrange(nhdrs):
|
|
||||||
(firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
|
|
||||||
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
|
|
||||||
for (i,firstcode,entcount,delta,pos) in hdrs:
|
|
||||||
if not entcount: continue
|
|
||||||
first = firstcode + (firstbytes[i] << 8)
|
|
||||||
fp.seek(pos)
|
|
||||||
for c in xrange(entcount):
|
|
||||||
gid = unpack('>H', fp.read(2))
|
|
||||||
if gid:
|
|
||||||
gid += delta
|
|
||||||
char2gid[first+c] = gid
|
|
||||||
elif fmttype == 4:
|
|
||||||
(segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
|
|
||||||
segcount /= 2
|
|
||||||
ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
|
||||||
fp.read(2)
|
|
||||||
scs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
|
||||||
idds = unpack('>%dh' % segcount, fp.read(2*segcount))
|
|
||||||
pos = fp.tell()
|
|
||||||
idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
|
||||||
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
|
|
||||||
if idr:
|
|
||||||
fp.seek(pos+idr)
|
|
||||||
for c in xrange(sc, ec+1):
|
|
||||||
char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
|
|
||||||
else:
|
|
||||||
for c in xrange(sc, ec+1):
|
|
||||||
char2gid[c] = (c + idd) & 0xffff
|
|
||||||
gid2char = dict( (gid, pack('>H', char))
|
|
||||||
for (char,gid) in char2gid.iteritems() )
|
|
||||||
cmapname = 'Adobe-Identity-UCS-%s' % self.name
|
|
||||||
return CMap(cmapname).update(char2gid, gid2char)
|
|
||||||
|
|
||||||
class PDFCIDFont(PDFFont):
|
|
||||||
|
|
||||||
def __init__(self, spec):
|
|
||||||
try:
|
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
|
||||||
except KeyError:
|
|
||||||
if STRICT:
|
|
||||||
raise PDFFontError('BaseFont is missing')
|
|
||||||
self.basefont = 'unknown'
|
|
||||||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
|
||||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
|
||||||
self.cidsysteminfo.get('Ordering', 'unknown'))
|
|
||||||
try:
|
|
||||||
name = literal_name(spec['Encoding'])
|
|
||||||
except KeyError:
|
|
||||||
if STRICT:
|
|
||||||
raise PDFFontError('Encoding is unspecified')
|
|
||||||
name = 'unknown'
|
|
||||||
try:
|
|
||||||
self.cmap = CMapDB.get_cmap(name, strict=STRICT)
|
|
||||||
except CMapDB.CMapNotFound, e:
|
|
||||||
raise PDFFontError(e)
|
|
||||||
try:
|
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
|
||||||
except KeyError:
|
|
||||||
if STRICT:
|
|
||||||
raise PDFFontError('FontDescriptor is missing')
|
|
||||||
descriptor = {}
|
|
||||||
ttf = None
|
|
||||||
if 'FontFile2' in descriptor:
|
|
||||||
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
|
||||||
ttf = TrueTypeFont(self.basefont,
|
|
||||||
StringIO(self.fontfile.get_data()))
|
|
||||||
self.ucs2_cmap = None
|
|
||||||
if 'ToUnicode' in spec:
|
|
||||||
strm = stream_value(spec['ToUnicode'])
|
|
||||||
self.ucs2_cmap = CMap()
|
|
||||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
|
||||||
elif self.cidcoding == 'Adobe-Identity':
|
|
||||||
if ttf:
|
|
||||||
try:
|
|
||||||
self.ucs2_cmap = ttf.create_cmap()
|
|
||||||
except TrueTypeFont.CMapNotFound:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
|
|
||||||
strict=STRICT)
|
|
||||||
except CMapDB.CMapNotFound, e:
|
|
||||||
raise PDFFontError(e)
|
|
||||||
|
|
||||||
def get_width(seq):
|
|
||||||
dic = {}
|
|
||||||
char1 = char2 = None
|
|
||||||
for v in seq:
|
|
||||||
if char1 == None:
|
|
||||||
char1 = v
|
|
||||||
elif char2 == None and isinstance(v, int):
|
|
||||||
char2 = v
|
|
||||||
else:
|
|
||||||
if char2 == None:
|
|
||||||
for (i,w) in enumerate(v):
|
|
||||||
dic[char1+i] = w
|
|
||||||
else:
|
|
||||||
for i in xrange(char1, char2+1):
|
|
||||||
dic[i] = v
|
|
||||||
char1 = char2 = None
|
|
||||||
return dic
|
|
||||||
self.vertical = self.cmap.is_vertical()
|
|
||||||
if self.vertical:
|
|
||||||
# writing mode: vertical
|
|
||||||
dic = get_width(list_value(spec.get('W2', [])))
|
|
||||||
widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
|
|
||||||
self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
|
|
||||||
(d,w) = spec.get('DW2', [880, -1000])
|
|
||||||
default_width = w
|
|
||||||
self.default_disp = d
|
|
||||||
else:
|
|
||||||
# writing mode: horizontal
|
|
||||||
widths = get_width(list_value(spec.get('W', [])))
|
|
||||||
self.disps = {}
|
|
||||||
default_width = spec.get('DW', 1000)
|
|
||||||
self.default_disp = 0
|
|
||||||
PDFFont.__init__(self, descriptor, widths, default_width)
|
|
||||||
return
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
|
|
||||||
|
|
||||||
def is_vertical(self):
|
|
||||||
return self.vertical
|
|
||||||
|
|
||||||
def is_multibyte(self):
|
|
||||||
return True
|
|
||||||
|
|
||||||
def decode(self, bytes):
    '''
    Decode a byte string by delegating to self.cmap.decode()
    and return whatever that call returns.
    '''
    # NOTE: the parameter name shadows the builtin `bytes`; it is kept
    # unchanged because callers may pass it by keyword.
    return self.cmap.decode(bytes)
|
|
||||||
|
|
||||||
def char_disp(self, cid):
    '''
    Return the displacement registered for CID in self.disps,
    or self.default_disp when the CID has no entry.
    '''
    return self.disps.get(cid, self.default_disp)
|
|
||||||
|
|
||||||
def to_unicode(self, cid):
    '''
    Translate a CID into a unicode string using self.ucs2_cmap.

    Raises PDFUnicodeNotDefined when no UCS2 cmap is available or
    when the cmap has no code for the given CID.
    '''
    if not self.ucs2_cmap:
        raise PDFUnicodeNotDefined(self.cidcoding, cid)
    code = self.ucs2_cmap.tocode(cid)
    if not code:
        raise PDFUnicodeNotDefined(self.cidcoding, cid)
    # The code is read as a sequence of big-endian 16-bit units.
    # Use explicit floor division so the unit count stays an integer
    # regardless of the division semantics in effect.
    chars = unpack('>%dH' % (len(code)//2), code)
    return ''.join( unichr(c) for c in chars )
|
|
||||||
|
|
||||||
|
|
||||||
## Resource Manager
|
## Resource Manager
|
||||||
|
@ -388,7 +39,7 @@ class PDFResourceManager(object):
|
||||||
|
|
||||||
'''
|
'''
|
||||||
ResourceManager facilitates reuse of shared resources
|
ResourceManager facilitates reuse of shared resources
|
||||||
such as fonts, images and cmaps so that large objects are not
|
such as fonts and images so that large objects are not
|
||||||
allocated multiple times.
|
allocated multiple times.
|
||||||
'''
|
'''
|
||||||
debug = 0
|
debug = 0
|
||||||
|
@ -399,24 +50,21 @@ class PDFResourceManager(object):
|
||||||
|
|
||||||
def get_procset(self, procs):
|
def get_procset(self, procs):
|
||||||
for proc in procs:
|
for proc in procs:
|
||||||
if proc == LITERAL_PDF:
|
if proc is LITERAL_PDF:
|
||||||
pass
|
pass
|
||||||
elif proc == LITERAL_TEXT:
|
elif proc is LITERAL_TEXT:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
#raise PDFResourceError('ProcSet %r is not supported.' % proc)
|
#raise PDFResourceError('ProcSet %r is not supported.' % proc)
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_cmap(self, name):
    '''
    Look up a cmap by name through CMapDB.get_cmap(), passing the
    module-wide STRICT flag as the `strict` argument.
    '''
    return CMapDB.get_cmap(name, strict=STRICT)
|
|
||||||
|
|
||||||
def get_font(self, objid, spec):
|
def get_font(self, objid, spec):
|
||||||
if objid and objid in self.fonts:
|
if objid and objid in self.fonts:
|
||||||
font = self.fonts[objid]
|
font = self.fonts[objid]
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
if spec['Type'] != LITERAL_FONT:
|
if spec['Type'] is not LITERAL_FONT:
|
||||||
raise PDFFontError('Type is not /Font')
|
raise PDFFontError('Type is not /Font')
|
||||||
# Create a Font object.
|
# Create a Font object.
|
||||||
if 'Subtype' in spec:
|
if 'Subtype' in spec:
|
||||||
|
@ -455,49 +103,6 @@ class PDFResourceManager(object):
|
||||||
return font
|
return font
|
||||||
|
|
||||||
|
|
||||||
## PDFDevice
|
|
||||||
##
|
|
||||||
class PDFDevice(object):
    '''
    Base class for output devices.

    It keeps a resource object and the current transformation
    matrix. All tag/page/figure hooks are no-ops here, and the two
    render_* methods raise NotImplementedError.
    '''

    debug = 0

    def __init__(self, rsrc):
        self.rsrc = rsrc
        # No transformation matrix until set_ctm() is called.
        self.ctm = None
        return

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        return

    def set_ctm(self, ctm):
        self.ctm = ctm
        return

    # Tag hooks (no-ops in the base class).
    def begin_tag(self, tag, props=None):
        return
    def end_tag(self):
        return
    def do_tag(self, tag, props=None):
        return

    # Page and figure hooks (no-ops in the base class).
    def begin_page(self, page):
        return
    def end_page(self, page):
        return
    def begin_figure(self, name, bbox):
        return
    def end_figure(self, name):
        return

    # Rendering is not provided by the base class.
    def render_string(self, textstate, textmatrix, seq):
        raise NotImplementedError
    def render_image(self, stream, size, matrix):
        raise NotImplementedError
|
|
||||||
|
|
||||||
|
|
||||||
## PDFContentParser
|
## PDFContentParser
|
||||||
##
|
##
|
||||||
class PDFContentParser(PSStackParser):
|
class PDFContentParser(PSStackParser):
|
||||||
|
@ -565,11 +170,14 @@ class PDFContentParser(PSStackParser):
|
||||||
self.add_results(*self.popall())
|
self.add_results(*self.popall())
|
||||||
return
|
return
|
||||||
|
|
||||||
|
KEYWORD_BI = PSKeywordTable.intern('BI')
|
||||||
|
KEYWORD_ID = PSKeywordTable.intern('ID')
|
||||||
|
KEYWORD_EI = PSKeywordTable.intern('EI')
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
if token == KEYWORD_BI:
|
if token is self.KEYWORD_BI:
|
||||||
# inline image within a content stream
|
# inline image within a content stream
|
||||||
self.start_type(pos, 'inline')
|
self.start_type(pos, 'inline')
|
||||||
elif token == KEYWORD_ID:
|
elif token is self.KEYWORD_ID:
|
||||||
try:
|
try:
|
||||||
(_, objs) = self.end_type('inline')
|
(_, objs) = self.end_type('inline')
|
||||||
if len(objs) % 2 != 0:
|
if len(objs) % 2 != 0:
|
||||||
|
@ -578,7 +186,7 @@ class PDFContentParser(PSStackParser):
|
||||||
(pos, data) = self.get_inline_data(pos+len('ID '))
|
(pos, data) = self.get_inline_data(pos+len('ID '))
|
||||||
obj = PDFStream(d, data)
|
obj = PDFStream(d, data)
|
||||||
self.push((pos, obj))
|
self.push((pos, obj))
|
||||||
self.push((pos, KEYWORD_EI))
|
self.push((pos, self.KEYWORD_EI))
|
||||||
except PSTypeError:
|
except PSTypeError:
|
||||||
if STRICT: raise
|
if STRICT: raise
|
||||||
else:
|
else:
|
||||||
|
@ -975,7 +583,7 @@ class PDFPageInterpreter(object):
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing xobj: %r' % xobj
|
print >>stderr, 'Processing xobj: %r' % xobj
|
||||||
subtype = xobj.dic.get('Subtype')
|
subtype = xobj.dic.get('Subtype')
|
||||||
if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
|
if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
|
||||||
interpreter = self.dup()
|
interpreter = self.dup()
|
||||||
(x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
|
(x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
|
||||||
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
|
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
|
||||||
|
@ -985,7 +593,7 @@ class PDFPageInterpreter(object):
|
||||||
self.device.begin_figure(xobjid, bbox)
|
self.device.begin_figure(xobjid, bbox)
|
||||||
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
|
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
||||||
(x0,y0) = apply_matrix(self.ctm, (0,0))
|
(x0,y0) = apply_matrix(self.ctm, (0,0))
|
||||||
(x1,y1) = apply_matrix(self.ctm, (1,1))
|
(x1,y1) = apply_matrix(self.ctm, (1,1))
|
||||||
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
|
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
|
||||||
|
|
|
@ -7,26 +7,22 @@
|
||||||
import sys, re
|
import sys, re
|
||||||
import md5, struct
|
import md5, struct
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist, nunpack
|
from pdflib.utils import choplist, nunpack
|
||||||
from arcfour import Arcfour
|
from pdflib.arcfour import Arcfour
|
||||||
from lzw import LZWDecoder
|
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||||
PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
STRICT
|
||||||
literal_name, keyword_name, \
|
from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
||||||
PSStackParser, STRICT
|
PDFStream, PDFObjRef, resolve1, decipher_all, \
|
||||||
|
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
|
||||||
|
|
||||||
|
|
||||||
## PDF Exceptions
|
## Exceptions
|
||||||
##
|
##
|
||||||
class PDFException(PSException): pass
|
|
||||||
class PDFSyntaxError(PDFException): pass
|
class PDFSyntaxError(PDFException): pass
|
||||||
class PDFNoValidXRef(PDFSyntaxError): pass
|
class PDFNoValidXRef(PDFSyntaxError): pass
|
||||||
class PDFEncryptionError(PDFException): pass
|
class PDFEncryptionError(PDFException): pass
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
class PDFTypeError(PDFException): pass
|
|
||||||
class PDFValueError(PDFException): pass
|
|
||||||
class PDFNotImplementedError(PSException): pass
|
|
||||||
|
|
||||||
|
|
||||||
# some predefined literals and keywords.
|
# some predefined literals and keywords.
|
||||||
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
|
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
|
||||||
|
@ -34,258 +30,10 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
|
||||||
LITERAL_PAGE = PSLiteralTable.intern('Page')
|
LITERAL_PAGE = PSLiteralTable.intern('Page')
|
||||||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
|
||||||
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
|
|
||||||
PSLiteralTable.intern('Fl'))
|
|
||||||
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
|
|
||||||
PSLiteralTable.intern('LZW'))
|
|
||||||
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
|
|
||||||
PSLiteralTable.intern('A85'))
|
|
||||||
KEYWORD_R = PSKeywordTable.intern('R')
|
|
||||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
|
||||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
|
||||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
|
||||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
|
||||||
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
|
||||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
|
||||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
|
||||||
|
|
||||||
class PDFObject(PSObject):
    '''Common base class of the PDF object types in this module.'''
    pass
|
|
||||||
|
|
||||||
|
|
||||||
## PDFObjRef
|
|
||||||
##
|
|
||||||
class PDFObjRef(PDFObject):
    '''
    A reference to an object of a document, identified by its
    object id and resolved through the document's getobj().
    '''

    def __init__(self, doc, objid, _):
        # The third argument is accepted for call compatibility
        # and is not stored.
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid
        #self.genno = genno # Never used.
        return

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self):
        '''Fetch the referenced object from the document.'''
        return self.doc.getobj(self.objid)
|
|
||||||
|
|
||||||
|
|
||||||
# resolve
|
|
||||||
def resolve1(x):
    '''
    Resolve an object. If this is an array or dictionary,
    it may still contain some indirect objects inside.
    '''
    # Follow references until something that is not a PDFObjRef shows up.
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    return x
|
|
||||||
|
|
||||||
def resolve_all(x):
    '''
    Recursively resolve X and all the internals.
    Make sure there is no indirect reference within the nested object.
    This procedure might be slow.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        x = [ resolve_all(v) for v in x ]
    elif isinstance(x, dict):
        # Walk a snapshot of the items so that the dictionary is not
        # written to while a live iterator over it is still in use.
        for (k,v) in list(x.items()):
            x[k] = resolve_all(v)
    return x
|
|
||||||
|
|
||||||
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Every str found is replaced by decipher(objid, genno, s).
    Lists are rebuilt as new lists; dictionaries are updated in place.
    Any other value is returned unchanged.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        x = [ decipher_all(decipher, objid, genno, v) for v in x ]
    elif isinstance(x, dict):
        # Walk a snapshot of the items so that the dictionary is not
        # written to while a live iterator over it is still in use.
        for (k,v) in list(x.items()):
            x[k] = decipher_all(decipher, objid, genno, v)
    return x
|
|
||||||
|
|
||||||
# Type checking
|
|
||||||
def int_value(x):
    '''
    Resolve X and return it if it is an int.

    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if isinstance(x, int):
        return x
    if STRICT:
        # Wrap the value in a 1-tuple so that a tuple-valued X is
        # formatted as a whole instead of breaking the % operator.
        raise PDFTypeError('Integer required: %r' % (x,))
    return 0
|
|
||||||
|
|
||||||
def float_value(x):
    '''
    Resolve X and return it if it is a float.

    Otherwise raise PDFTypeError when STRICT is set, or return 0.0.
    '''
    x = resolve1(x)
    if isinstance(x, float):
        return x
    if STRICT:
        # Wrap the value in a 1-tuple so that a tuple-valued X is
        # formatted as a whole instead of breaking the % operator.
        raise PDFTypeError('Float required: %r' % (x,))
    return 0.0
|
|
||||||
|
|
||||||
def num_value(x):
    '''
    Resolve X and return it if it is an int or a float.

    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if isinstance(x, (int, float)):
        return x
    if STRICT:
        # Wrap the value in a 1-tuple so that a tuple-valued X is
        # formatted as a whole instead of breaking the % operator.
        raise PDFTypeError('Int or Float required: %r' % (x,))
    return 0
|
|
||||||
|
|
||||||
def str_value(x):
    '''
    Resolve X and return it if it is a str.

    Otherwise raise PDFTypeError when STRICT is set, or return ''.
    '''
    x = resolve1(x)
    if isinstance(x, str):
        return x
    if STRICT:
        # Wrap the value in a 1-tuple so that a tuple-valued X is
        # formatted as a whole instead of breaking the % operator.
        raise PDFTypeError('String required: %r' % (x,))
    return ''
|
|
||||||
|
|
||||||
def list_value(x):
    '''
    Resolve X and return it if it is a list or a tuple.

    Otherwise raise PDFTypeError when STRICT is set, or return
    a new empty list.
    '''
    x = resolve1(x)
    if isinstance(x, (list, tuple)):
        return x
    if STRICT:
        raise PDFTypeError('List required: %r' % (x,))
    return []
|
|
||||||
|
|
||||||
def dict_value(x):
    '''
    Resolve X and return it if it is a dict.

    Otherwise raise PDFTypeError when STRICT is set, or return
    a new empty dict.
    '''
    x = resolve1(x)
    if isinstance(x, dict):
        return x
    if STRICT:
        # Wrap the value in a 1-tuple so that a tuple-valued X is
        # formatted as a whole instead of breaking the % operator.
        raise PDFTypeError('Dict required: %r' % (x,))
    return {}
|
|
||||||
|
|
||||||
def stream_value(x):
    '''
    Resolve X and return it if it is a PDFStream.

    Otherwise raise PDFTypeError when STRICT is set, or return
    a new empty PDFStream.
    '''
    x = resolve1(x)
    if isinstance(x, PDFStream):
        return x
    if STRICT:
        # Wrap the value in a 1-tuple so that a tuple-valued X is
        # formatted as a whole instead of breaking the % operator.
        raise PDFTypeError('PDFStream required: %r' % (x,))
    return PDFStream({}, '')
|
|
||||||
|
|
||||||
|
|
||||||
## PDFStream type
|
|
||||||
##
|
|
||||||
class PDFStream(PDFObject):
    '''
    A stream object: a dictionary plus a block of raw data.

    The raw data is deciphered and run through the filters named
    in the dictionary the first time get_data() is called. After
    that, only the decoded data is kept and rawdata is None.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        self.dic = dic
        self.rawdata = rawdata
        # Optional callable invoked as decipher(objid, genno, data).
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # rawdata is dropped once the stream is decoded, so report
        # the decoded length in that case instead of failing on len(None).
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)

    def decode(self):
        '''
        Decode rawdata into data. Must be called at most once.

        Raises PDFEncryptionError for the /Crypt filter,
        PDFNotImplementedError for unknown filters or predictors, and
        PDFValueError when predictor 12 is given without Columns.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f in LITERALS_FLATE_DECODE:
                import zlib
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
            elif f in LITERALS_LZW_DECODE:
                try:
                    from cStringIO import StringIO
                except ImportError:
                    from StringIO import StringIO
                data = ''.join(LZWDecoder(StringIO(data)).run())
            elif f in LITERALS_ASCII85_DECODE:
                import ascii85
                data = ascii85.ascii85decode(data)
            elif f == LITERAL_CRYPT:
                raise PDFEncryptionError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
            # apply predictors
            # NOTE(review): the original indentation was lost; this step
            # is assumed to run once per filter, inside the loop -- confirm.
            params = self.dic.get('DecodeParms', {})
            if 'Predictor' in params:
                pred = int_value(params['Predictor'])
                if pred:
                    if pred != 12:
                        raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                    if 'Columns' not in params:
                        raise PDFValueError('Columns undefined for predictor=12')
                    columns = int_value(params['Columns'])
                    # Each row is one tag byte followed by `columns` bytes.
                    rows = []
                    ent0 = '\x00' * columns
                    for i in xrange(0, len(data), columns+1):
                        tag = data[i]
                        ent1 = data[i+1:i+1+columns]
                        if tag == '\x02':
                            # Add the previous row byte-wise, modulo 256.
                            ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
                        rows.append(ent1)
                        ent0 = ent1
                    data = ''.join(rows)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the raw data (None once the stream has been decoded).'''
        return self.rawdata
|
|
||||||
|
|
||||||
|
|
||||||
## PDFPage
|
|
||||||
##
|
|
||||||
class PDFPage(object):
    '''
    A page of a document, built from the page's attribute dictionary.

    The 'Resources' and 'MediaBox' entries must be present (a missing
    key raises KeyError). 'CropBox' falls back to the media box, and
    'Contents' is always stored as a list.
    '''

    def __init__(self, doc, pageid, attrs):
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        # Resolve the entry like the other resolved attributes below,
        # so that a reference is not stored as-is.
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = self.attrs.get('Rotate', 0)
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content object is wrapped into a one-element list.
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
|
||||||
|
|
||||||
|
|
||||||
## XRefs
|
## XRefs
|
||||||
|
##
|
||||||
|
|
||||||
## PDFXRef
|
## PDFXRef
|
||||||
##
|
##
|
||||||
|
@ -296,7 +44,7 @@ class PDFXRef(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def objids(self):
|
def objids(self):
|
||||||
return self.offsets.keys()
|
return self.offsets.iterkeys()
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser):
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -330,10 +78,11 @@ class PDFXRef(object):
|
||||||
self.load_trailer(parser)
|
self.load_trailer(parser)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||||
def load_trailer(self, parser):
|
def load_trailer(self, parser):
|
||||||
try:
|
try:
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
assert kwd == KEYWORD_TRAILER
|
assert kwd is self.KEYWORD_TRAILER
|
||||||
(_,dic) = parser.nextobject()
|
(_,dic) = parser.nextobject()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
x = parser.pop(1)
|
x = parser.pop(1)
|
||||||
|
@ -350,7 +99,7 @@ class PDFXRef(object):
|
||||||
raise
|
raise
|
||||||
if use != 'n':
|
if use != 'n':
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('Unused objid=%r' % objid)
|
raise PDFSyntaxError('Unused objid=%r' % objid)
|
||||||
return (None, pos)
|
return (None, pos)
|
||||||
|
|
||||||
|
|
||||||
|
@ -367,14 +116,14 @@ class PDFXRefStream(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def objids(self):
|
def objids(self):
|
||||||
return range(self.objid0, self.objid1+1)
|
return xrange(self.objid0, self.objid1)
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser):
|
||||||
(_,objid) = parser.nexttoken() # ignored
|
(_,objid) = parser.nexttoken() # ignored
|
||||||
(_,genno) = parser.nexttoken() # ignored
|
(_,genno) = parser.nexttoken() # ignored
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
(_,stream) = parser.nextobject()
|
(_,stream) = parser.nextobject()
|
||||||
if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
|
if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
|
||||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||||
size = stream.dic['Size']
|
size = stream.dic['Size']
|
||||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||||
|
@ -402,6 +151,37 @@ class PDFXRefStream(object):
|
||||||
return (objid, index)
|
return (objid, index)
|
||||||
|
|
||||||
|
|
||||||
|
## PDFPage
|
||||||
|
##
|
||||||
|
class PDFPage(object):
    '''
    A page of a document, built from the page's attribute dictionary.

    The 'Resources' and 'MediaBox' entries must be present (a missing
    key raises KeyError). 'CropBox' falls back to the media box, and
    'Contents' is always stored as a list.
    '''

    def __init__(self, doc, pageid, attrs):
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs['Resources'])
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = self.attrs.get('Rotate', 0)
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content object is wrapped into a one-element list.
        if not isinstance(contents, list):
            contents = [ contents ]
        self.contents = contents
        return

    def __repr__(self):
        return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||||
|
|
||||||
|
|
||||||
## PDFDocument
|
## PDFDocument
|
||||||
##
|
##
|
||||||
## A PDFDocument object represents a PDF document.
|
## A PDFDocument object represents a PDF document.
|
||||||
|
@ -463,15 +243,16 @@ class PDFDocument(object):
|
||||||
def set_root(self, root):
|
def set_root(self, root):
|
||||||
self.root = root
|
self.root = root
|
||||||
self.catalog = dict_value(self.root)
|
self.catalog = dict_value(self.root)
|
||||||
if self.catalog.get('Type') != LITERAL_CATALOG:
|
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('Catalog not found!')
|
raise PDFSyntaxError('Catalog not found!')
|
||||||
return
|
return
|
||||||
|
|
||||||
# initialize(password='')
|
# initialize(password='')
|
||||||
# Perform the initialization with a given password.
|
# Perform the initialization with a given password.
|
||||||
# This step is mandatory even if there's no password associated
|
# This step is mandatory even if there's no password associated
|
||||||
# with the document.
|
# with the document.
|
||||||
|
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||||
def initialize(self, password=''):
|
def initialize(self, password=''):
|
||||||
if not self.encryption:
|
if not self.encryption:
|
||||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
|
@ -494,7 +275,7 @@ class PDFDocument(object):
|
||||||
self.is_modifiable = bool(P & 8)
|
self.is_modifiable = bool(P & 8)
|
||||||
self.is_extractable = bool(P & 16)
|
self.is_extractable = bool(P & 16)
|
||||||
# Algorithm 3.2
|
# Algorithm 3.2
|
||||||
password = (password+PASSWORD_PADDING)[:32] # 1
|
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
||||||
hash = md5.md5(password) # 2
|
hash = md5.md5(password) # 2
|
||||||
hash.update(O) # 3
|
hash.update(O) # 3
|
||||||
hash.update(struct.pack('<l', P)) # 4
|
hash.update(struct.pack('<l', P)) # 4
|
||||||
|
@ -512,7 +293,7 @@ class PDFDocument(object):
|
||||||
u1 = Arcfour(key).process(password)
|
u1 = Arcfour(key).process(password)
|
||||||
elif R == 3:
|
elif R == 3:
|
||||||
# Algorithm 3.5
|
# Algorithm 3.5
|
||||||
hash = md5.md5(PASSWORD_PADDING) # 2
|
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||||
hash.update(docid[0]) # 3
|
hash.update(docid[0]) # 3
|
||||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||||
for i in xrange(1,19+1):
|
for i in xrange(1,19+1):
|
||||||
|
@ -536,6 +317,7 @@ class PDFDocument(object):
|
||||||
key = hash.digest()[:min(len(key),16)]
|
key = hash.digest()[:min(len(key),16)]
|
||||||
return Arcfour(key).process(data)
|
return Arcfour(key).process(data)
|
||||||
|
|
||||||
|
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||||
def getobj(self, objid):
|
def getobj(self, objid):
|
||||||
if not self.ready:
|
if not self.ready:
|
||||||
raise PDFException('PDFDocument not initialized')
|
raise PDFException('PDFDocument not initialized')
|
||||||
|
@ -554,11 +336,11 @@ class PDFDocument(object):
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('Cannot locate objid=%r' % objid)
|
raise PDFSyntaxError('Cannot locate objid=%r' % objid)
|
||||||
return None
|
return None
|
||||||
if strmid:
|
if strmid:
|
||||||
stream = stream_value(self.getobj(strmid))
|
stream = stream_value(self.getobj(strmid))
|
||||||
if stream.dic['Type'] != LITERAL_OBJSTM:
|
if stream.dic['Type'] is not LITERAL_OBJSTM:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||||
try:
|
try:
|
||||||
|
@ -589,7 +371,7 @@ class PDFDocument(object):
|
||||||
(_,genno) = self.parser.nexttoken() # genno
|
(_,genno) = self.parser.nexttoken() # genno
|
||||||
#assert objid1 == objid, (objid, objid1)
|
#assert objid1 == objid, (objid, objid1)
|
||||||
(_,kwd) = self.parser.nexttoken()
|
(_,kwd) = self.parser.nexttoken()
|
||||||
if kwd != KEYWORD_OBJ:
|
if kwd is not self.KEYWORD_OBJ:
|
||||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||||
(_,obj) = self.parser.nextobject()
|
(_,obj) = self.parser.nextobject()
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
|
@ -611,13 +393,13 @@ class PDFDocument(object):
|
||||||
for (k,v) in parent.iteritems():
|
for (k,v) in parent.iteritems():
|
||||||
if k in self.INHERITABLE_ATTRS and k not in tree:
|
if k in self.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
|
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
||||||
for c in tree['Kids']:
|
for c in tree['Kids']:
|
||||||
for x in search(c, tree):
|
for x in search(c, tree):
|
||||||
yield x
|
yield x
|
||||||
elif tree.get('Type') == LITERAL_PAGE:
|
elif tree.get('Type') is LITERAL_PAGE:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Page: %r' % tree
|
print >>stderr, 'Page: %r' % tree
|
||||||
yield (obj.objid, tree)
|
yield (obj.objid, tree)
|
||||||
|
@ -683,15 +465,20 @@ class PDFParser(PSStackParser):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<PDFParser>'
|
return '<PDFParser>'
|
||||||
|
|
||||||
|
KEYWORD_R = PSKeywordTable.intern('R')
|
||||||
|
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||||
|
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||||
|
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||||
|
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||||
def do_keyword(self, pos, token):
|
def do_keyword(self, pos, token):
|
||||||
if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
|
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
|
||||||
self.add_results(*self.pop(1))
|
self.add_results(*self.pop(1))
|
||||||
return
|
return
|
||||||
if token == KEYWORD_ENDOBJ:
|
if token is self.KEYWORD_ENDOBJ:
|
||||||
self.add_results(*self.pop(4))
|
self.add_results(*self.pop(4))
|
||||||
return
|
return
|
||||||
|
|
||||||
if token == KEYWORD_R:
|
if token is self.KEYWORD_R:
|
||||||
# reference to indirect object
|
# reference to indirect object
|
||||||
try:
|
try:
|
||||||
((_,objid), (_,genno)) = self.pop(2)
|
((_,objid), (_,genno)) = self.pop(2)
|
||||||
|
@ -702,7 +489,7 @@ class PDFParser(PSStackParser):
|
||||||
pass
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
if token == KEYWORD_STREAM:
|
if token is self.KEYWORD_STREAM:
|
||||||
# stream object
|
# stream object
|
||||||
((_,dic),) = self.pop(1)
|
((_,dic),) = self.pop(1)
|
||||||
dic = dict_value(dic)
|
dic = dict_value(dic)
|
||||||
|
@ -710,7 +497,7 @@ class PDFParser(PSStackParser):
|
||||||
objlen = int_value(dic['Length'])
|
objlen = int_value(dic['Length'])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('/Length is undefined: %r' % dic)
|
raise PDFSyntaxError('/Length is undefined: %r' % dic)
|
||||||
objlen = 0
|
objlen = 0
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
try:
|
try:
|
||||||
|
@ -785,7 +572,7 @@ class PDFParser(PSStackParser):
|
||||||
xref = PDFXRefStream()
|
xref = PDFXRefStream()
|
||||||
xref.load(self)
|
xref.load(self)
|
||||||
else:
|
else:
|
||||||
if token != KEYWORD_XREF:
|
if token is not self.KEYWORD_XREF:
|
||||||
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
||||||
(pos, token))
|
(pos, token))
|
||||||
self.nextline()
|
self.nextline()
|
||||||
|
@ -835,6 +622,7 @@ class PDFParser(PSStackParser):
|
||||||
yield xref
|
yield xref
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
## PDFObjStrmParser
|
## PDFObjStrmParser
|
||||||
##
|
##
|
||||||
class PDFObjStrmParser(PDFParser):
|
class PDFObjStrmParser(PDFParser):
|
||||||
|
|
|
@ -0,0 +1,222 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
import sys, zlib
|
||||||
|
stderr = sys.stderr
|
||||||
|
from pdflib.lzw import LZWDecoder
|
||||||
|
from pdflib.psparser import PSException, PSObject, \
|
||||||
|
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
|
literal_name, keyword_name, STRICT
|
||||||
|
|
||||||
|
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||||
|
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
|
||||||
|
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
|
||||||
|
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
|
||||||
|
|
||||||
|
|
||||||
|
## PDF Objects
|
||||||
|
##
|
||||||
|
class PDFObject(PSObject):
    '''Common base class of the PDF object types in this module.'''
    pass

class PDFException(PSException):
    '''Base class of the PDF-level exceptions defined here.'''
    pass

class PDFTypeError(PDFException):
    '''Raised by the *_value() checkers when a value has the wrong type.'''
    pass

class PDFValueError(PDFException):
    '''Raised when a value is of the right type but not acceptable.'''
    pass

# NOTE: derives from PSException directly, not from PDFException,
# so an `except PDFException` clause does not catch it.
class PDFNotImplementedError(PSException):
    '''Raised for features that are not supported.'''
    pass
|
||||||
|
|
||||||
|
|
||||||
|
## PDFObjRef
|
||||||
|
##
|
||||||
|
class PDFObjRef(PDFObject):
    '''
    A reference to an object of a document, identified by its
    object id and resolved through the document's getobj().
    '''

    def __init__(self, doc, objid, _):
        # The third argument is accepted for call compatibility
        # and is not stored.
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.doc = doc
        self.objid = objid
        #self.genno = genno # Never used.
        return

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self):
        '''Fetch the referenced object from the document.'''
        return self.doc.getobj(self.objid)
|
||||||
|
|
||||||
|
|
||||||
|
# resolve
|
||||||
|
def resolve1(x):
    '''
    Resolve an object. If this is an array or dictionary,
    it may still contain some indirect objects inside.
    '''
    # Follow references until something that is not a PDFObjRef shows up.
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    return x
|
||||||
|
|
||||||
|
def resolve_all(x):
    '''
    Recursively resolve X and all the internals.
    Make sure there is no indirect reference within the nested object.
    This procedure might be slow.

    Lists are rebuilt as new lists; dictionaries are updated in place.
    '''
    while isinstance(x, PDFObjRef):
        x = x.resolve()
    if isinstance(x, list):
        x = [ resolve_all(v) for v in x ]
    elif isinstance(x, dict):
        # Walk a snapshot of the items so that the dictionary is not
        # written to while a live iterator over it is still in use.
        for (k,v) in list(x.items()):
            x[k] = resolve_all(v)
    return x
|
||||||
|
|
||||||
|
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    decipher is called as decipher(objid, genno, s) for every string
    found in X. A string yields the deciphered string; a list is
    rebuilt as a new list; a dict is updated in place and that same
    dict is returned; any other value is returned as is.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, list):
        x = [ decipher_all(decipher, objid, genno, v) for v in x ]
    elif isinstance(x, dict):
        # Walk a snapshot of the entries instead of a live iterator:
        # the values of this very dict are reassigned in the loop, and
        # items() (unlike iteritems()) exists on Python 2 and 3 alike.
        for (k,v) in list(x.items()):
            x[k] = decipher_all(decipher, objid, genno, v)
    return x
|
||||||
|
|
||||||
|
# Type checking
|
||||||
|
def int_value(x):
    '''
    Resolve X and return it if it is an integer.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.
    '''
    value = resolve1(x)
    if isinstance(value, int):
        return value
    if STRICT:
        raise PDFTypeError('Integer required: %r' % value)
    return 0
|
||||||
|
|
||||||
|
def float_value(x):
    '''
    Resolve X and return it if it is a float.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.0.
    '''
    value = resolve1(x)
    if isinstance(value, float):
        return value
    if STRICT:
        raise PDFTypeError('Float required: %r' % value)
    return 0.0
|
||||||
|
|
||||||
|
def num_value(x):
    '''
    Resolve X and return it if it is an int or a float.

    Otherwise raise PDFTypeError when STRICT is set, else return 0.
    '''
    value = resolve1(x)
    if isinstance(value, (int, float)):
        return value
    if STRICT:
        raise PDFTypeError('Int or Float required: %r' % value)
    return 0
|
||||||
|
|
||||||
|
def str_value(x):
    '''
    Resolve X and return it if it is a string.

    Otherwise raise PDFTypeError when STRICT is set, else return ''.
    '''
    value = resolve1(x)
    if isinstance(value, str):
        return value
    if STRICT:
        raise PDFTypeError('String required: %r' % value)
    return ''
|
||||||
|
|
||||||
|
def list_value(x):
    '''
    Resolve X and return it if it is a list or a tuple.

    Otherwise raise PDFTypeError when STRICT is set, else return
    a new empty list.
    '''
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
|
||||||
|
|
||||||
|
def dict_value(x):
    '''
    Resolve X and return it if it is a dict.

    Otherwise raise PDFTypeError when STRICT is set, else return
    a new empty dict.
    '''
    value = resolve1(x)
    if isinstance(value, dict):
        return value
    if STRICT:
        raise PDFTypeError('Dict required: %r' % value)
    return {}
|
||||||
|
|
||||||
|
def stream_value(x):
    '''
    Resolve X and return it if it is a PDFStream.

    Otherwise raise PDFTypeError when STRICT is set, else return
    a new empty PDFStream.
    '''
    value = resolve1(x)
    if isinstance(value, PDFStream):
        return value
    if STRICT:
        raise PDFTypeError('PDFStream required: %r' % value)
    return PDFStream({}, '')
|
||||||
|
|
||||||
|
|
||||||
|
## PDFStream type
|
||||||
|
##
|
||||||
|
class PDFStream(PDFObject):
    '''
    A PDF stream: a dictionary plus a body of raw data.

    The raw data is decoded lazily (deciphered, run through the
    filters named by the dictionary, then un-predicted) on the first
    call to get_data(). Once decoded, the raw data is dropped.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        '''
        dic      -- the stream dictionary.
        rawdata  -- the undecoded stream body.
        decipher -- optional callable (objid, genno, data) -> data
                    applied before the filters.
        '''
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        # Decoded body; stays None until decode() has run.
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        '''Record the object id and generation number (passed to decipher).'''
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # rawdata is None once the stream has been decoded, so report
        # the length of whichever body is currently held.
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)

    def decode(self):
        '''
        Decode rawdata into data and drop rawdata.

        Raises PDFEncryptionError for the /Crypt filter,
        PDFNotImplementedError for any other unsupported filter or
        predictor, and PDFValueError when predictor 12 is requested
        without Columns.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' not in self.dic:
            self.data = data
            self.rawdata = None
            return
        # The filter entry, and each element of a filter array, may be
        # stored as an indirect reference.
        filters = resolve1(self.dic['Filter'])
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            f = resolve1(f)
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
            elif f in LITERALS_LZW_DECODE:
                try:
                    from cStringIO import StringIO
                except ImportError:
                    from StringIO import StringIO
                data = ''.join(LZWDecoder(StringIO(data)).run())
            elif f in LITERALS_ASCII85_DECODE:
                import ascii85
                data = ascii85.ascii85decode(data)
            elif f == LITERAL_CRYPT:
                raise PDFEncryptionError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
        # apply predictors
        # DecodeParms may itself be an indirect reference; only a
        # dictionary can carry a Predictor entry.
        params = resolve1(self.dic.get('DecodeParms', {}))
        if isinstance(params, dict) and 'Predictor' in params:
            pred = int_value(params['Predictor'])
            if pred:
                if pred != 12:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                if 'Columns' not in params:
                    raise PDFValueError('Columns undefined for predictor=12')
                columns = int_value(params['Columns'])
                # Each row is one tag byte followed by `columns` data
                # bytes. Rows tagged '\x02' are stored as differences
                # from the previous output row; others are copied as is.
                rows = []
                prev = '\x00' * columns
                for i in xrange(0, len(data), columns+1):
                    tag = data[i]
                    row = data[i+1:i+1+columns]
                    if tag == '\x02':
                        row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev,row) )
                    rows.append(row)
                    prev = row
                # Join once at the end instead of growing a string per row.
                data = ''.join(rows)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded body, decoding it first if necessary.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the undecoded body, or None once it has been decoded.'''
        return self.rawdata
|
|
@ -1,7 +1,8 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys, re
|
import sys, re
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist
|
|
||||||
|
from pdflib.utils import choplist
|
||||||
|
|
||||||
STRICT = 0
|
STRICT = 0
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,8 @@ from struct import unpack
|
||||||
|
|
||||||
## Matrix operations
|
## Matrix operations
|
||||||
##
|
##
|
||||||
|
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||||
|
|
||||||
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||||
'''Multiplies two matrices.'''
|
'''Multiplies two matrices.'''
|
||||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
||||||
|
|
Loading…
Reference in New Issue