tmp
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67cpull/1/head
parent
24bdd33557
commit
c41c279321
|
@ -2,10 +2,53 @@
|
|||
import sys
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
|
||||
from pdflib.pdffont import PDFUnicodeNotDefined
|
||||
from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
|
||||
|
||||
|
||||
## PDFDevice
|
||||
##
|
||||
class PDFDevice(object):
    """Base class for rendering back ends.

    Every notification hook below is a no-op that returns None; the two
    render_* methods raise NotImplementedError and must be overridden by
    a subclass that actually produces output.
    """

    # Debug level; 0 in the base class.
    debug = 0

    def __init__(self, rsrc):
        """Remember *rsrc* and start without a current transformation matrix.

        rsrc is stored untouched on self.rsrc (presumably the shared
        resource manager -- confirm against the callers).
        """
        self.rsrc = rsrc
        self.ctm = None

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        """Hook for releasing the device; nothing to do here."""

    def set_ctm(self, ctm):
        """Record *ctm* as the current transformation matrix."""
        self.ctm = ctm

    # -- tag hooks: ignored by the base class --

    def begin_tag(self, tag, props=None):
        """Hook for the start of a tag; ignored."""

    def end_tag(self):
        """Hook for the end of a tag; ignored."""

    def do_tag(self, tag, props=None):
        """Hook for a stand-alone tag; ignored."""

    # -- page / figure hooks: ignored by the base class --

    def begin_page(self, page):
        """Hook for the start of *page*; ignored."""

    def end_page(self, page):
        """Hook for the end of *page*; ignored."""

    def begin_figure(self, name, bbox):
        """Hook for the start of figure *name* with bounding box *bbox*; ignored."""

    def end_figure(self, name):
        """Hook for the end of figure *name*; ignored."""

    # -- rendering: must be supplied by subclasses --

    def render_string(self, textstate, textmatrix, seq):
        """Render a text sequence. Not implemented in the base class."""
        raise NotImplementedError

    def render_image(self, stream, size, matrix):
        """Render an image stream. Not implemented in the base class."""
        raise NotImplementedError
|
||||
|
||||
|
||||
## PageItem
|
||||
##
|
||||
class PageItem(object):
|
||||
|
|
|
@ -3,10 +3,10 @@ import sys
|
|||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
|
||||
PDFPageInterpreter, PDFUnicodeNotDefined
|
||||
from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdflib.pdffont import PDFUnicodeNotDefined
|
||||
from pdflib.cmap import CMapDB
|
||||
from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
|
||||
from pdflib.page import PDFDevice, PageItem, FigureItem, TextItem, PageAggregator
|
||||
|
||||
|
||||
def enc(x, codec):
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from pdflib.psparser import PSLiteralTable
|
||||
|
||||
|
||||
## ColorSpace
|
||||
##
|
||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||
|
||||
class ColorSpace(object):
    """A color space identified by name, with its number of components."""

    def __init__(self, name, ncomponents):
        # name: the color space name; ncomponents: how many values one
        # color in this space consists of.
        self.name = name
        self.ncomponents = ncomponents

    def __repr__(self):
        fields = (self.name, self.ncomponents)
        return '<ColorSpace: %s, ncomponents=%d>' % fields
|
||||
|
||||
|
||||
# Color spaces available without any definition in the document, keyed by
# name.  Each entry pairs the name with its number of components.
# (A generator expression is used so that no loop variable is left behind
# in the module namespace.)
PREDEFINED_COLORSPACE = dict(
    (csname, ColorSpace(csname, ncomponents))
    for (csname, ncomponents) in (
        ('CalRGB', 3),
        ('CalGray', 1),
        ('Lab', 3),
        ('DeviceRGB', 3),
        ('DeviceCMYK', 4),
        ('DeviceGray', 1),
        ('Separation', 1),
        ('Indexed', 1),
        ('Pattern', 1),
    ))
|
|
@ -0,0 +1,341 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
|
||||
literal_name, keyword_name, STRICT
|
||||
from pdflib.pdftypes import PDFException, \
|
||||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
|
||||
class PDFFontError(PDFException):
    """Raised when a font dictionary cannot be turned into a font object."""


class PDFUnicodeNotDefined(PDFFontError):
    """Raised by to_unicode() when a character ID has no Unicode mapping."""
|
||||
|
||||
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
||||
|
||||
|
||||
# PDFFont
|
||||
class PDFFont(object):
    """Common state and default behavior shared by the font classes.

    A font keeps the descriptor dictionary it was built from, a mapping
    from character ID to width, and a handful of metrics read from the
    descriptor.  Subclasses override the predicates and decode() as needed.
    """

    def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
        """Initialize the metrics from *descriptor*.

        descriptor    -- dictionary read for 'FontName', 'Ascent', 'Descent',
                         'MissingWidth', 'Leading' and 'FontBBox'.
        widths        -- mapping of character ID to width.
        default_width -- width for IDs missing from *widths*.  Only None
                         falls back to the descriptor's 'MissingWidth'
                         (default 0); an explicit 0 is honored.
        font_matrix   -- font matrix; a false value selects
                         (.001, 0, 0, .001, 0, 0).
        """
        self.descriptor = descriptor
        self.widths = widths
        self.fontname = descriptor.get('FontName', 'unknown')
        if isinstance(self.fontname, PSLiteral):
            self.fontname = literal_name(self.fontname)
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        # "default_width or ..." would silently replace a legitimate width
        # of 0 with MissingWidth, so test for None explicitly.
        if default_width is None:
            default_width = descriptor.get('MissingWidth', 0)
        self.default_width = default_width
        self.leading = num_value(descriptor.get('Leading', 0))
        self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
        self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
        return

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        """Return False: the base font is not vertical."""
        return False

    def is_multibyte(self):
        """Return False: the base font is not multibyte."""
        return False

    def decode(self, bytes):
        """Map every character of *bytes* to its ordinal."""
        return map(ord, bytes)

    def char_width(self, cid):
        """Return the width of *cid*, or the default width if unknown."""
        return self.widths.get(cid, self.default_width)

    def char_disp(self, cid):
        """Return the displacement of *cid*; always 0 in the base class."""
        return 0

    def string_width(self, s):
        """Return the sum of the widths of the characters decoded from *s*."""
        return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
    """Font whose character IDs are translated through an encoding table,
    or through a ToUnicode CMap when the font dictionary supplies one."""

    def __init__(self, descriptor, widths, spec, font_matrix=None):
        """Set up the encoding and optional ToUnicode map from *spec*,
        then hand the metrics over to PDFFont."""
        # The encoding is either the name of a built-in encoding or a
        # dictionary describing differences from a base encoding; the
        # standard encoding applies when the entry is absent.
        encoding = LITERAL_STANDARD_ENCODING
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        if not isinstance(encoding, dict):
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
        else:
            base = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            differences = list_value(encoding.get('Differences', None))
            self.encoding = EncodingDB.get_encoding(base, differences)
        # An embedded ToUnicode stream, when present, is parsed into a CMap.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            tounicode = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(tounicode.get_data())).run()
        PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)

    def to_unicode(self, cid):
        """Return the Unicode text for *cid*.

        Raises PDFUnicodeNotDefined when neither the ToUnicode map nor the
        encoding table knows the character.
        """
        if self.ucs2_cmap:
            code = self.ucs2_cmap.tocode(cid)
            if not code:
                raise PDFUnicodeNotDefined(None, cid)
            # The code is a run of big-endian 16-bit units.
            units = unpack('>%dH' % (len(code) // 2), code)
            return ''.join( unichr(u) for u in units )
        try:
            return self.encoding[cid]
        except KeyError:
            raise PDFUnicodeNotDefined(None, cid)
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
    """Simple font identified by its BaseFont name."""

    def __init__(self, spec):
        """Build the font from the font dictionary *spec*.

        Raises PDFFontError when BaseFont is missing and STRICT is set.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        try:
            # Metrics known to FontMetricsDB take precedence.
            (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
        except KeyError:
            # Otherwise read the metrics from the font dictionary itself.
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            first = int_value(spec.get('FirstChar', 0))
            # The result is unused; the call is kept so that int_value still
            # sees the LastChar entry exactly as before.
            int_value(spec.get('LastChar', 255))
            width_list = list_value(spec.get('Widths', [0]*256))
            widths = dict( (first+offset, width)
                           for (offset, width) in enumerate(width_list) )
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
    """Same construction as PDFType1Font; only the repr differs."""

    def __repr__(self):
        text = '<PDFTrueTypeFont: basefont=%r>' % self.basefont
        return text
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
    """Simple font that takes its widths and font matrix from the font
    dictionary itself."""

    def __init__(self, spec):
        """Build the font from the font dictionary *spec*."""
        first = int_value(spec.get('FirstChar', 0))
        # The result is unused; the call is kept so that int_value still
        # sees the LastChar entry exactly as before.
        int_value(spec.get('LastChar', 0))
        width_list = list_value(spec.get('Widths', [0]*256))
        widths = dict( (first+offset, width)
                       for (offset, width) in enumerate(width_list) )
        if 'FontDescriptor' not in spec:
            # No descriptor: assemble a minimal one from the font dictionary.
            descriptor = {'FontName':spec.get('Name'),
                          'Ascent':0, 'Descent':0,
                          'FontBBox':spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        matrix = tuple(list_value(spec.get('FontMatrix')))
        PDFSimpleFont.__init__(self, descriptor, widths, spec,
                               font_matrix=matrix)

    def __repr__(self):
        return '<PDFType3Font>'
|
||||
|
||||
|
||||
# PDFCIDFont
|
||||
|
||||
## TrueTypeFont
|
||||
##
|
||||
class TrueTypeFont(object):
    """Reader for the table directory and the 'cmap' table of a TrueType
    font held in a seekable binary file object."""

    class CMapNotFound(Exception):
        """Raised by create_cmap() when the font has no 'cmap' table."""

    def __init__(self, name, fp):
        """Read the table directory from *fp*.

        name -- font name, used for naming the CMap built by create_cmap().
        fp   -- seekable binary file object positioned at the font start.

        self.tables maps each 4-byte table tag to (offset, length).
        """
        self.name = name
        self.fp = fp
        self.tables = {}
        fp.read(4)  # font type tag; not used
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for _ in range(ntables):
            # A separate local keeps the *name* argument from being shadowed.
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        """Build a CMap from the font's 'cmap' table.

        Raises TrueTypeFont.CMapNotFound if there is no 'cmap' table.
        """
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        char2gid = self._read_char2gid(base_offset)
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.items() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        return CMap(cmapname).update(char2gid, gid2char)

    def _read_char2gid(self, base_offset):
        """Return {character code: glyph id} from the 'cmap' table that
        starts at *base_offset*.  Only subtable formats 0, 2 and 4 are read;
        subtables of any other format are skipped."""
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for _ in range(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                # 256 single-byte glyph ids, indexed by character code.
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                # Each key is the subheader index times 8; remember which
                # first byte refers to each subheader.
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # *offset* is counted from its own 2-byte field, which
                    # is the one just read.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        # unpack() always returns a tuple; take the value
                        # out before doing arithmetic on it.
                        gid = unpack('>H', fp.read(2))[0]
                        if gid:
                            # The delta is applied modulo 65536, as in the
                            # format 4 branch below.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)  # padding between the end codes and start codes
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
                    if idr:
                        # NOTE(review): the OpenType spec counts this offset
                        # from each segment's own entry, whereas this seeks
                        # from the start of the array for every segment --
                        # confirm against real fonts before changing.
                        fp.seek(pos+idr)
                        for c in range(sc, ec+1):
                            char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        return char2gid
|
||||
|
||||
class PDFCIDFont(PDFFont):
    """Font whose byte strings are decoded into character IDs by a CMap
    and whose Unicode text comes from a second, CID-to-Unicode CMap."""

    def __init__(self, spec):
        """Build the font from the font dictionary *spec*.

        Raises PDFFontError when a required entry is missing under STRICT,
        or when a CMap cannot be found.
        """
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                    self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        try:
            self.cmap = CMapDB.get_cmap(name, strict=STRICT)
        except CMapDB.CMapNotFound as e:
            raise PDFFontError(e)
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        # Pick the CID-to-Unicode map: an embedded ToUnicode stream first,
        # then the embedded TrueType font (Adobe-Identity only), otherwise
        # the '<cidcoding>-UCS2' CMap from CMapDB.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf is not None:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    # Deliberately best-effort: without a 'cmap' table the
                    # font simply has no Unicode mapping.
                    pass
        else:
            try:
                self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
                                                 strict=STRICT)
            except CMapDB.CMapNotFound as e:
                raise PDFFontError(e)

        def get_width(seq):
            """Expand a width array into {cid: value}.

            Two forms are recognized: "first [v0 v1 ...]" assigns the listed
            values to consecutive IDs starting at *first*, and
            "first last v" assigns *v* to every ID from first to last.
            """
            dic = {}
            char1 = char2 = None
            for v in seq:
                if char1 is None:
                    char1 = v
                elif char2 is None and isinstance(v, int):
                    char2 = v
                else:
                    if char2 is None:
                        for (i,w) in enumerate(v):
                            dic[char1+i] = w
                    else:
                        for i in range(char1, char2+1):
                            dic[i] = v
                    char1 = char2 = None
            return dic

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            # NOTE(review): every value returned by get_width() must unpack
            # into a (displacement, width) pair here; confirm that 'W2'
            # arrays really arrive in that shape.
            dic = get_width(list_value(spec.get('W2', [])))
            widths = dict( (cid,w) for (cid,(d,w)) in dic.items() )
            self.disps = dict( (cid,d) for (cid,(d,w)) in dic.items() )
            (d,w) = spec.get('DW2', [880, -1000])
            default_width = w
            self.default_disp = d
        else:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)
        return

    def __repr__(self):
        return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

    def is_vertical(self):
        """Return whether the font's CMap reported vertical writing."""
        return self.vertical

    def is_multibyte(self):
        """Return True: CID fonts are multibyte."""
        return True

    def decode(self, bytes):
        """Decode *bytes* into character IDs through the font's CMap."""
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        """Return the displacement of *cid*, or the default displacement."""
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        """Return the Unicode text for *cid*.

        Raises PDFUnicodeNotDefined when there is no Unicode map or the map
        has no entry for *cid*.
        """
        if not self.ucs2_cmap:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        # The code is a run of big-endian 16-bit units.
        chars = unpack('>%dH' % (len(code)//2), code)
        return ''.join( unichr(c) for c in chars )
|
||||
|
||||
|
|
@ -6,33 +6,22 @@ try:
|
|||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||
from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
|
||||
int_value, float_value, num_value, \
|
||||
from pdflib.psparser import PSException, PSTypeError, PSEOF, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||
PSStackParser, PSKeyword, STRICT
|
||||
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
|
||||
resolve1, int_value, float_value, num_value, \
|
||||
str_value, list_value, dict_value, stream_value
|
||||
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
|
||||
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
|
||||
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
|
||||
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
|
||||
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
|
||||
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
|
||||
|
||||
|
||||
## Exceptions
|
||||
##
|
||||
class PDFResourceError(PDFException): pass
|
||||
class PDFInterpreterError(PDFException): pass
|
||||
class PDFFontError(PDFException): pass
|
||||
class PDFUnicodeNotDefined(PDFFontError): pass
|
||||
|
||||
|
||||
## ColorSpace
|
||||
##
|
||||
class ColorSpace(object):
|
||||
def __init__(self, name, ncomponents):
|
||||
self.name = name
|
||||
self.ncomponents = ncomponents
|
||||
return
|
||||
def __repr__(self):
|
||||
return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
|
||||
|
||||
|
||||
## Constants
|
||||
|
@ -42,344 +31,6 @@ LITERAL_TEXT = PSLiteralTable.intern('Text')
|
|||
LITERAL_FONT = PSLiteralTable.intern('Font')
|
||||
LITERAL_FORM = PSLiteralTable.intern('Form')
|
||||
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
||||
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
|
||||
KEYWORD_BI = PSKeywordTable.intern('BI')
|
||||
KEYWORD_ID = PSKeywordTable.intern('ID')
|
||||
KEYWORD_EI = PSKeywordTable.intern('EI')
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
PREDEFINED_COLORSPACE = dict(
|
||||
(name, ColorSpace(name,n)) for (name,n) in {
|
||||
'CalRGB': 3,
|
||||
'CalGray': 1,
|
||||
'Lab': 3,
|
||||
'DeviceRGB': 3,
|
||||
'DeviceCMYK': 4,
|
||||
'DeviceGray': 1,
|
||||
'Separation': 1,
|
||||
'Indexed': 1,
|
||||
'Pattern': 1,
|
||||
}.iteritems())
|
||||
|
||||
|
||||
## Fonts
|
||||
##
|
||||
|
||||
# PDFFont
|
||||
class PDFFont(object):
|
||||
|
||||
def __init__(self, descriptor, widths, default_width=None):
|
||||
self.descriptor = descriptor
|
||||
self.widths = widths
|
||||
self.fontname = descriptor.get('FontName', 'unknown')
|
||||
if isinstance(self.fontname, PSLiteral):
|
||||
self.fontname = literal_name(self.fontname)
|
||||
self.ascent = num_value(descriptor.get('Ascent', 0))
|
||||
self.descent = num_value(descriptor.get('Descent', 0))
|
||||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||
self.leading = num_value(descriptor.get('Leading', 0))
|
||||
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFFont>'
|
||||
|
||||
def is_vertical(self):
|
||||
return False
|
||||
|
||||
def is_multibyte(self):
|
||||
return False
|
||||
|
||||
def decode(self, bytes):
|
||||
return map(ord, bytes)
|
||||
|
||||
def char_width(self, cid):
|
||||
return self.widths.get(cid, self.default_width)
|
||||
|
||||
def char_disp(self, cid):
|
||||
return 0
|
||||
|
||||
def string_width(self, s):
|
||||
return sum( self.char_width(cid) for cid in self.decode(s) )
|
||||
|
||||
|
||||
# PDFSimpleFont
|
||||
class PDFSimpleFont(PDFFont):
|
||||
|
||||
def __init__(self, descriptor, widths, spec):
|
||||
# Font encoding is specified either by a name of
|
||||
# built-in encoding or a dictionary that describes
|
||||
# the differences.
|
||||
if 'Encoding' in spec:
|
||||
encoding = resolve1(spec['Encoding'])
|
||||
else:
|
||||
encoding = LITERAL_STANDARD_ENCODING
|
||||
if isinstance(encoding, dict):
|
||||
name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
|
||||
diff = list_value(encoding.get('Differences', None))
|
||||
self.encoding = EncodingDB.get_encoding(name, diff)
|
||||
else:
|
||||
self.encoding = EncodingDB.get_encoding(literal_name(encoding))
|
||||
self.ucs2_cmap = None
|
||||
if 'ToUnicode' in spec:
|
||||
strm = stream_value(spec['ToUnicode'])
|
||||
self.ucs2_cmap = CMap()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||
PDFFont.__init__(self, descriptor, widths)
|
||||
return
|
||||
|
||||
def to_unicode(self, cid):
|
||||
if not self.ucs2_cmap:
|
||||
try:
|
||||
return self.encoding[cid]
|
||||
except KeyError:
|
||||
raise PDFUnicodeNotDefined(None, cid)
|
||||
code = self.ucs2_cmap.tocode(cid)
|
||||
if not code:
|
||||
raise PDFUnicodeNotDefined(None, cid)
|
||||
chars = unpack('>%dH' % (len(code)/2), code)
|
||||
return ''.join( unichr(c) for c in chars )
|
||||
|
||||
|
||||
# PDFType1Font
|
||||
class PDFType1Font(PDFSimpleFont):
|
||||
|
||||
def __init__(self, spec):
|
||||
try:
|
||||
self.basefont = literal_name(spec['BaseFont'])
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFFontError('BaseFont is missing')
|
||||
self.basefont = 'unknown'
|
||||
try:
|
||||
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
||||
except KeyError:
|
||||
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
||||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
lastchar = int_value(spec.get('LastChar', 255))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFType1Font: basefont=%r>' % self.basefont
|
||||
|
||||
# PDFTrueTypeFont
|
||||
class PDFTrueTypeFont(PDFType1Font):
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
|
||||
|
||||
# PDFType3Font
|
||||
class PDFType3Font(PDFSimpleFont):
|
||||
def __init__(self, spec):
|
||||
firstchar = int_value(spec.get('FirstChar', 0))
|
||||
lastchar = int_value(spec.get('LastChar', 0))
|
||||
widths = list_value(spec.get('Widths', [0]*256))
|
||||
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
|
||||
if 'FontDescriptor' in spec:
|
||||
descriptor = dict_value(spec['FontDescriptor'])
|
||||
else:
|
||||
descriptor = {'FontName':spec.get('Name'),
|
||||
'Ascent':0, 'Descent':0,
|
||||
'FontBBox':spec['FontBBox']}
|
||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFType3Font>'
|
||||
|
||||
|
||||
# PDFCIDFont
|
||||
|
||||
## TrueTypeFont
|
||||
##
|
||||
class TrueTypeFont(object):
|
||||
|
||||
class CMapNotFound(Exception): pass
|
||||
|
||||
def __init__(self, name, fp):
|
||||
self.name = name
|
||||
self.fp = fp
|
||||
self.tables = {}
|
||||
fonttype = fp.read(4)
|
||||
(ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
|
||||
for i in xrange(ntables):
|
||||
(name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
|
||||
self.tables[name] = (offset, length)
|
||||
return
|
||||
|
||||
def create_cmap(self):
|
||||
if 'cmap' not in self.tables:
|
||||
raise TrueTypeFont.CMapNotFound
|
||||
(base_offset, length) = self.tables['cmap']
|
||||
fp = self.fp
|
||||
fp.seek(base_offset)
|
||||
(version, nsubtables) = unpack('>HH', fp.read(4))
|
||||
subtables = []
|
||||
for i in xrange(nsubtables):
|
||||
subtables.append(unpack('>HHL', fp.read(8)))
|
||||
char2gid = {}
|
||||
# Only supports subtable type 0, 2 and 4.
|
||||
for (_1, _2, st_offset) in subtables:
|
||||
fp.seek(base_offset+st_offset)
|
||||
(fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
|
||||
if fmttype == 0:
|
||||
char2gid.update(enumerate(unpack('>256B', fp.read(256))))
|
||||
elif fmttype == 2:
|
||||
subheaderkeys = unpack('>256H', fp.read(512))
|
||||
firstbytes = [0]*8192
|
||||
for (i,k) in enumerate(subheaderkeys):
|
||||
firstbytes[k/8] = i
|
||||
nhdrs = max(subheaderkeys)/8 + 1
|
||||
hdrs = []
|
||||
for i in xrange(nhdrs):
|
||||
(firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
|
||||
hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
|
||||
for (i,firstcode,entcount,delta,pos) in hdrs:
|
||||
if not entcount: continue
|
||||
first = firstcode + (firstbytes[i] << 8)
|
||||
fp.seek(pos)
|
||||
for c in xrange(entcount):
|
||||
gid = unpack('>H', fp.read(2))
|
||||
if gid:
|
||||
gid += delta
|
||||
char2gid[first+c] = gid
|
||||
elif fmttype == 4:
|
||||
(segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
|
||||
segcount /= 2
|
||||
ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||
fp.read(2)
|
||||
scs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||
idds = unpack('>%dh' % segcount, fp.read(2*segcount))
|
||||
pos = fp.tell()
|
||||
idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
|
||||
for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
|
||||
if idr:
|
||||
fp.seek(pos+idr)
|
||||
for c in xrange(sc, ec+1):
|
||||
char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
|
||||
else:
|
||||
for c in xrange(sc, ec+1):
|
||||
char2gid[c] = (c + idd) & 0xffff
|
||||
gid2char = dict( (gid, pack('>H', char))
|
||||
for (char,gid) in char2gid.iteritems() )
|
||||
cmapname = 'Adobe-Identity-UCS-%s' % self.name
|
||||
return CMap(cmapname).update(char2gid, gid2char)
|
||||
|
||||
class PDFCIDFont(PDFFont):
|
||||
|
||||
def __init__(self, spec):
|
||||
try:
|
||||
self.basefont = literal_name(spec['BaseFont'])
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFFontError('BaseFont is missing')
|
||||
self.basefont = 'unknown'
|
||||
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
||||
self.cidsysteminfo.get('Ordering', 'unknown'))
|
||||
try:
|
||||
name = literal_name(spec['Encoding'])
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFFontError('Encoding is unspecified')
|
||||
name = 'unknown'
|
||||
try:
|
||||
self.cmap = CMapDB.get_cmap(name, strict=STRICT)
|
||||
except CMapDB.CMapNotFound, e:
|
||||
raise PDFFontError(e)
|
||||
try:
|
||||
descriptor = dict_value(spec['FontDescriptor'])
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFFontError('FontDescriptor is missing')
|
||||
descriptor = {}
|
||||
ttf = None
|
||||
if 'FontFile2' in descriptor:
|
||||
self.fontfile = stream_value(descriptor.get('FontFile2'))
|
||||
ttf = TrueTypeFont(self.basefont,
|
||||
StringIO(self.fontfile.get_data()))
|
||||
self.ucs2_cmap = None
|
||||
if 'ToUnicode' in spec:
|
||||
strm = stream_value(spec['ToUnicode'])
|
||||
self.ucs2_cmap = CMap()
|
||||
CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
|
||||
elif self.cidcoding == 'Adobe-Identity':
|
||||
if ttf:
|
||||
try:
|
||||
self.ucs2_cmap = ttf.create_cmap()
|
||||
except TrueTypeFont.CMapNotFound:
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
|
||||
strict=STRICT)
|
||||
except CMapDB.CMapNotFound, e:
|
||||
raise PDFFontError(e)
|
||||
|
||||
def get_width(seq):
|
||||
dic = {}
|
||||
char1 = char2 = None
|
||||
for v in seq:
|
||||
if char1 == None:
|
||||
char1 = v
|
||||
elif char2 == None and isinstance(v, int):
|
||||
char2 = v
|
||||
else:
|
||||
if char2 == None:
|
||||
for (i,w) in enumerate(v):
|
||||
dic[char1+i] = w
|
||||
else:
|
||||
for i in xrange(char1, char2+1):
|
||||
dic[i] = v
|
||||
char1 = char2 = None
|
||||
return dic
|
||||
self.vertical = self.cmap.is_vertical()
|
||||
if self.vertical:
|
||||
# writing mode: vertical
|
||||
dic = get_width(list_value(spec.get('W2', [])))
|
||||
widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
|
||||
self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
|
||||
(d,w) = spec.get('DW2', [880, -1000])
|
||||
default_width = w
|
||||
self.default_disp = d
|
||||
else:
|
||||
# writing mode: horizontal
|
||||
widths = get_width(list_value(spec.get('W', [])))
|
||||
self.disps = {}
|
||||
default_width = spec.get('DW', 1000)
|
||||
self.default_disp = 0
|
||||
PDFFont.__init__(self, descriptor, widths, default_width)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
|
||||
|
||||
def is_vertical(self):
|
||||
return self.vertical
|
||||
|
||||
def is_multibyte(self):
|
||||
return True
|
||||
|
||||
def decode(self, bytes):
|
||||
return self.cmap.decode(bytes)
|
||||
|
||||
def char_disp(self, cid):
|
||||
return self.disps.get(cid, self.default_disp)
|
||||
|
||||
def to_unicode(self, cid):
|
||||
if not self.ucs2_cmap:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
code = self.ucs2_cmap.tocode(cid)
|
||||
if not code:
|
||||
raise PDFUnicodeNotDefined(self.cidcoding, cid)
|
||||
chars = unpack('>%dH' % (len(code)/2), code)
|
||||
return ''.join( unichr(c) for c in chars )
|
||||
|
||||
|
||||
## Resource Manager
|
||||
|
@ -388,7 +39,7 @@ class PDFResourceManager(object):
|
|||
|
||||
'''
|
||||
ResourceManager facilitates reuse of shared resources
|
||||
such as fonts, images and cmaps so that large objects are not
|
||||
such as fonts and images so that large objects are not
|
||||
allocated multiple times.
|
||||
'''
|
||||
debug = 0
|
||||
|
@ -399,24 +50,21 @@ class PDFResourceManager(object):
|
|||
|
||||
def get_procset(self, procs):
|
||||
for proc in procs:
|
||||
if proc == LITERAL_PDF:
|
||||
if proc is LITERAL_PDF:
|
||||
pass
|
||||
elif proc == LITERAL_TEXT:
|
||||
elif proc is LITERAL_TEXT:
|
||||
pass
|
||||
else:
|
||||
#raise PDFResourceError('ProcSet %r is not supported.' % proc)
|
||||
pass
|
||||
return
|
||||
|
||||
def get_cmap(self, name):
|
||||
return CMapDB.get_cmap(name, strict=STRICT)
|
||||
|
||||
def get_font(self, objid, spec):
|
||||
if objid and objid in self.fonts:
|
||||
font = self.fonts[objid]
|
||||
else:
|
||||
if STRICT:
|
||||
if spec['Type'] != LITERAL_FONT:
|
||||
if spec['Type'] is not LITERAL_FONT:
|
||||
raise PDFFontError('Type is not /Font')
|
||||
# Create a Font object.
|
||||
if 'Subtype' in spec:
|
||||
|
@ -455,49 +103,6 @@ class PDFResourceManager(object):
|
|||
return font
|
||||
|
||||
|
||||
## PDFDevice
|
||||
##
|
||||
class PDFDevice(object):
|
||||
|
||||
debug = 0
|
||||
|
||||
def __init__(self, rsrc):
|
||||
self.rsrc = rsrc
|
||||
self.ctm = None
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFDevice>'
|
||||
|
||||
def close(self):
|
||||
return
|
||||
|
||||
def set_ctm(self, ctm):
|
||||
self.ctm = ctm
|
||||
return
|
||||
|
||||
def begin_tag(self, tag, props=None):
|
||||
return
|
||||
def end_tag(self):
|
||||
return
|
||||
def do_tag(self, tag, props=None):
|
||||
return
|
||||
|
||||
def begin_page(self, page):
|
||||
return
|
||||
def end_page(self, page):
|
||||
return
|
||||
def begin_figure(self, name, bbox):
|
||||
return
|
||||
def end_figure(self, name):
|
||||
return
|
||||
|
||||
def render_string(self, textstate, textmatrix, seq):
|
||||
raise NotImplementedError
|
||||
def render_image(self, stream, size, matrix):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
## PDFContentParser
|
||||
##
|
||||
class PDFContentParser(PSStackParser):
|
||||
|
@ -565,11 +170,14 @@ class PDFContentParser(PSStackParser):
|
|||
self.add_results(*self.popall())
|
||||
return
|
||||
|
||||
KEYWORD_BI = PSKeywordTable.intern('BI')
|
||||
KEYWORD_ID = PSKeywordTable.intern('ID')
|
||||
KEYWORD_EI = PSKeywordTable.intern('EI')
|
||||
def do_keyword(self, pos, token):
|
||||
if token == KEYWORD_BI:
|
||||
if token is self.KEYWORD_BI:
|
||||
# inline image within a content stream
|
||||
self.start_type(pos, 'inline')
|
||||
elif token == KEYWORD_ID:
|
||||
elif token is self.KEYWORD_ID:
|
||||
try:
|
||||
(_, objs) = self.end_type('inline')
|
||||
if len(objs) % 2 != 0:
|
||||
|
@ -578,7 +186,7 @@ class PDFContentParser(PSStackParser):
|
|||
(pos, data) = self.get_inline_data(pos+len('ID '))
|
||||
obj = PDFStream(d, data)
|
||||
self.push((pos, obj))
|
||||
self.push((pos, KEYWORD_EI))
|
||||
self.push((pos, self.KEYWORD_EI))
|
||||
except PSTypeError:
|
||||
if STRICT: raise
|
||||
else:
|
||||
|
@ -975,7 +583,7 @@ class PDFPageInterpreter(object):
|
|||
if 1 <= self.debug:
|
||||
print >>stderr, 'Processing xobj: %r' % xobj
|
||||
subtype = xobj.dic.get('Subtype')
|
||||
if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
|
||||
if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
|
||||
interpreter = self.dup()
|
||||
(x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
|
||||
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
|
||||
|
@ -985,7 +593,7 @@ class PDFPageInterpreter(object):
|
|||
self.device.begin_figure(xobjid, bbox)
|
||||
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
|
||||
self.device.end_figure(xobjid)
|
||||
elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
||||
elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
||||
(x0,y0) = apply_matrix(self.ctm, (0,0))
|
||||
(x1,y1) = apply_matrix(self.ctm, (1,1))
|
||||
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
|
||||
|
|
|
@ -7,26 +7,22 @@
|
|||
import sys, re
|
||||
import md5, struct
|
||||
stderr = sys.stderr
|
||||
from utils import choplist, nunpack
|
||||
from arcfour import Arcfour
|
||||
from lzw import LZWDecoder
|
||||
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||
PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, \
|
||||
PSStackParser, STRICT
|
||||
from pdflib.utils import choplist, nunpack
|
||||
from pdflib.arcfour import Arcfour
|
||||
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||
STRICT
|
||||
from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
|
||||
PDFStream, PDFObjRef, resolve1, decipher_all, \
|
||||
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
|
||||
|
||||
|
||||
## PDF Exceptions
|
||||
## Exceptions
|
||||
##
|
||||
class PDFException(PSException): pass
|
||||
class PDFSyntaxError(PDFException): pass
|
||||
class PDFNoValidXRef(PDFSyntaxError): pass
|
||||
class PDFEncryptionError(PDFException): pass
|
||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
class PDFNotImplementedError(PSException): pass
|
||||
|
||||
|
||||
# some predefined literals and keywords.
|
||||
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
|
||||
|
@ -34,258 +30,10 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
|
|||
LITERAL_PAGE = PSLiteralTable.intern('Page')
|
||||
LITERAL_PAGES = PSLiteralTable.intern('Pages')
|
||||
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
|
||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
|
||||
PSLiteralTable.intern('Fl'))
|
||||
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
|
||||
PSLiteralTable.intern('LZW'))
|
||||
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
|
||||
PSLiteralTable.intern('A85'))
|
||||
KEYWORD_R = PSKeywordTable.intern('R')
|
||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
|
||||
class PDFObject(PSObject): pass
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
##
|
||||
class PDFObjRef(PDFObject):
|
||||
|
||||
def __init__(self, doc, objid, _):
|
||||
if objid == 0:
|
||||
if STRICT:
|
||||
raise PDFValueError('PDF object id cannot be 0.')
|
||||
self.doc = doc
|
||||
self.objid = objid
|
||||
#self.genno = genno # Never used.
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFObjRef:%d>' % (self.objid)
|
||||
|
||||
def resolve(self):
|
||||
return self.doc.getobj(self.objid)
|
||||
|
||||
|
||||
# resolve
|
||||
def resolve1(x):
|
||||
'''
|
||||
Resolve an object. If this is an array or dictionary,
|
||||
it may still contains some indirect objects inside.
|
||||
'''
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
return x
|
||||
|
||||
def resolve_all(x):
|
||||
'''
|
||||
Recursively resolve X and all the internals.
|
||||
Make sure there is no indirect reference within the nested object.
|
||||
This procedure might be slow.
|
||||
'''
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
if isinstance(x, list):
|
||||
x = [ resolve_all(v) for v in x ]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
x[k] = resolve_all(v)
|
||||
return x
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
|
||||
'''
|
||||
Recursively decipher X.
|
||||
'''
|
||||
if isinstance(x, str):
|
||||
return decipher(objid, genno, x)
|
||||
if isinstance(x, list):
|
||||
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
x[k] = decipher_all(decipher, objid, genno, v)
|
||||
return x
|
||||
|
||||
# Type cheking
|
||||
def int_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, int):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Integer required: %r' % x)
|
||||
return 0
|
||||
return x
|
||||
|
||||
def float_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, float):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Float required: %r' % x)
|
||||
return 0.0
|
||||
return x
|
||||
|
||||
def num_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, int) or isinstance(x, float)):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Int or Float required: %r' % x)
|
||||
return 0
|
||||
return x
|
||||
|
||||
def str_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, str):
|
||||
if STRICT:
|
||||
raise PDFTypeError('String required: %r' % x)
|
||||
return ''
|
||||
return x
|
||||
|
||||
def list_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||
if STRICT:
|
||||
raise PDFTypeError('List required: %r' % x)
|
||||
return []
|
||||
return x
|
||||
|
||||
def dict_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, dict):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Dict required: %r' % x)
|
||||
return {}
|
||||
return x
|
||||
|
||||
def stream_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, PDFStream):
|
||||
if STRICT:
|
||||
raise PDFTypeError('PDFStream required: %r' % x)
|
||||
return PDFStream({}, '')
|
||||
return x
|
||||
|
||||
|
||||
## PDFStream type
|
||||
##
|
||||
class PDFStream(PDFObject):
|
||||
|
||||
def __init__(self, dic, rawdata, decipher=None):
|
||||
self.dic = dic
|
||||
self.rawdata = rawdata
|
||||
self.decipher = decipher
|
||||
self.data = None
|
||||
self.objid = None
|
||||
self.genno = None
|
||||
return
|
||||
|
||||
def set_objid(self, objid, genno):
|
||||
self.objid = objid
|
||||
self.genno = genno
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||
|
||||
def decode(self):
|
||||
assert self.data == None and self.rawdata != None
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
data = self.decipher(self.objid, self.genno, data)
|
||||
if 'Filter' not in self.dic:
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
filters = self.dic['Filter']
|
||||
if not isinstance(filters, list):
|
||||
filters = [ filters ]
|
||||
for f in filters:
|
||||
if f in LITERALS_FLATE_DECODE:
|
||||
import zlib
|
||||
# will get errors if the document is encrypted.
|
||||
data = zlib.decompress(data)
|
||||
elif f in LITERALS_LZW_DECODE:
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
data = ''.join(LZWDecoder(StringIO(data)).run())
|
||||
elif f in LITERALS_ASCII85_DECODE:
|
||||
import ascii85
|
||||
data = ascii85.ascii85decode(data)
|
||||
elif f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError('/Crypt filter is unsupported')
|
||||
else:
|
||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||
# apply predictors
|
||||
params = self.dic.get('DecodeParms', {})
|
||||
if 'Predictor' in params:
|
||||
pred = int_value(params['Predictor'])
|
||||
if pred:
|
||||
if pred != 12:
|
||||
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
||||
if 'Columns' not in params:
|
||||
raise PDFValueError('Columns undefined for predictor=12')
|
||||
columns = int_value(params['Columns'])
|
||||
buf = ''
|
||||
ent0 = '\x00' * columns
|
||||
for i in xrange(0, len(data), columns+1):
|
||||
pred = data[i]
|
||||
ent1 = data[i+1:i+1+columns]
|
||||
if pred == '\x02':
|
||||
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
|
||||
buf += ent1
|
||||
ent0 = ent1
|
||||
data = buf
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
|
||||
def get_data(self):
|
||||
if self.data == None:
|
||||
self.decode()
|
||||
return self.data
|
||||
|
||||
def get_rawdata(self):
|
||||
return self.rawdata
|
||||
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):
|
||||
|
||||
def __init__(self, doc, pageid, attrs):
|
||||
self.doc = doc
|
||||
self.pageid = pageid
|
||||
self.attrs = dict_value(attrs)
|
||||
self.lastmod = self.attrs.get('LastModified')
|
||||
self.resources = resolve1(self.attrs['Resources'])
|
||||
self.mediabox = resolve1(self.attrs['MediaBox'])
|
||||
if 'CropBox' in self.attrs:
|
||||
self.cropbox = resolve1(self.attrs['CropBox'])
|
||||
else:
|
||||
self.cropbox = self.mediabox
|
||||
self.rotate = self.attrs.get('Rotate', 0)
|
||||
self.annots = self.attrs.get('Annots')
|
||||
self.beads = self.attrs.get('B')
|
||||
if 'Contents' in self.attrs:
|
||||
contents = resolve1(self.attrs['Contents'])
|
||||
else:
|
||||
contents = []
|
||||
if not isinstance(contents, list):
|
||||
contents = [ contents ]
|
||||
self.contents = contents
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
|
||||
|
||||
## XRefs
|
||||
##
|
||||
|
||||
## PDFXRef
|
||||
##
|
||||
|
@ -296,7 +44,7 @@ class PDFXRef(object):
|
|||
return
|
||||
|
||||
def objids(self):
|
||||
return self.offsets.keys()
|
||||
return self.offsets.iterkeys()
|
||||
|
||||
def load(self, parser):
|
||||
while 1:
|
||||
|
@ -330,10 +78,11 @@ class PDFXRef(object):
|
|||
self.load_trailer(parser)
|
||||
return
|
||||
|
||||
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
|
||||
def load_trailer(self, parser):
|
||||
try:
|
||||
(_,kwd) = parser.nexttoken()
|
||||
assert kwd == KEYWORD_TRAILER
|
||||
assert kwd is self.KEYWORD_TRAILER
|
||||
(_,dic) = parser.nextobject()
|
||||
except PSEOF:
|
||||
x = parser.pop(1)
|
||||
|
@ -350,7 +99,7 @@ class PDFXRef(object):
|
|||
raise
|
||||
if use != 'n':
|
||||
if STRICT:
|
||||
raise PDFValueError('Unused objid=%r' % objid)
|
||||
raise PDFSyntaxError('Unused objid=%r' % objid)
|
||||
return (None, pos)
|
||||
|
||||
|
||||
|
@ -367,14 +116,14 @@ class PDFXRefStream(object):
|
|||
return
|
||||
|
||||
def objids(self):
|
||||
return range(self.objid0, self.objid1+1)
|
||||
return xrange(self.objid0, self.objid1)
|
||||
|
||||
def load(self, parser):
|
||||
(_,objid) = parser.nexttoken() # ignored
|
||||
(_,genno) = parser.nexttoken() # ignored
|
||||
(_,kwd) = parser.nexttoken()
|
||||
(_,stream) = parser.nextobject()
|
||||
if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
|
||||
if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
|
||||
raise PDFNoValidXRef('Invalid PDF stream spec.')
|
||||
size = stream.dic['Size']
|
||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||
|
@ -402,6 +151,37 @@ class PDFXRefStream(object):
|
|||
return (objid, index)
|
||||
|
||||
|
||||
## PDFPage
|
||||
##
|
||||
class PDFPage(object):
|
||||
|
||||
def __init__(self, doc, pageid, attrs):
|
||||
self.doc = doc
|
||||
self.pageid = pageid
|
||||
self.attrs = dict_value(attrs)
|
||||
self.lastmod = resolve1(self.attrs.get('LastModified'))
|
||||
self.resources = resolve1(self.attrs['Resources'])
|
||||
self.mediabox = resolve1(self.attrs['MediaBox'])
|
||||
if 'CropBox' in self.attrs:
|
||||
self.cropbox = resolve1(self.attrs['CropBox'])
|
||||
else:
|
||||
self.cropbox = self.mediabox
|
||||
self.rotate = self.attrs.get('Rotate', 0)
|
||||
self.annots = self.attrs.get('Annots')
|
||||
self.beads = self.attrs.get('B')
|
||||
if 'Contents' in self.attrs:
|
||||
contents = resolve1(self.attrs['Contents'])
|
||||
else:
|
||||
contents = []
|
||||
if not isinstance(contents, list):
|
||||
contents = [ contents ]
|
||||
self.contents = contents
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
|
||||
|
||||
|
||||
## PDFDocument
|
||||
##
|
||||
## A PDFDocument object represents a PDF document.
|
||||
|
@ -463,15 +243,16 @@ class PDFDocument(object):
|
|||
def set_root(self, root):
|
||||
self.root = root
|
||||
self.catalog = dict_value(self.root)
|
||||
if self.catalog.get('Type') != LITERAL_CATALOG:
|
||||
if self.catalog.get('Type') is not LITERAL_CATALOG:
|
||||
if STRICT:
|
||||
raise PDFValueError('Catalog not found!')
|
||||
raise PDFSyntaxError('Catalog not found!')
|
||||
return
|
||||
|
||||
# initialize(password='')
|
||||
# Perform the initialization with a given password.
|
||||
# This step is mandatory even if there's no password associated
|
||||
# with the document.
|
||||
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
|
||||
def initialize(self, password=''):
|
||||
if not self.encryption:
|
||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||
|
@ -494,7 +275,7 @@ class PDFDocument(object):
|
|||
self.is_modifiable = bool(P & 8)
|
||||
self.is_extractable = bool(P & 16)
|
||||
# Algorithm 3.2
|
||||
password = (password+PASSWORD_PADDING)[:32] # 1
|
||||
password = (password+self.PASSWORD_PADDING)[:32] # 1
|
||||
hash = md5.md5(password) # 2
|
||||
hash.update(O) # 3
|
||||
hash.update(struct.pack('<l', P)) # 4
|
||||
|
@ -512,7 +293,7 @@ class PDFDocument(object):
|
|||
u1 = Arcfour(key).process(password)
|
||||
elif R == 3:
|
||||
# Algorithm 3.5
|
||||
hash = md5.md5(PASSWORD_PADDING) # 2
|
||||
hash = md5.md5(self.PASSWORD_PADDING) # 2
|
||||
hash.update(docid[0]) # 3
|
||||
x = Arcfour(key).process(hash.digest()[:16]) # 4
|
||||
for i in xrange(1,19+1):
|
||||
|
@ -536,6 +317,7 @@ class PDFDocument(object):
|
|||
key = hash.digest()[:min(len(key),16)]
|
||||
return Arcfour(key).process(data)
|
||||
|
||||
KEYWORD_OBJ = PSKeywordTable.intern('obj')
|
||||
def getobj(self, objid):
|
||||
if not self.ready:
|
||||
raise PDFException('PDFDocument not initialized')
|
||||
|
@ -554,11 +336,11 @@ class PDFDocument(object):
|
|||
pass
|
||||
else:
|
||||
if STRICT:
|
||||
raise PDFValueError('Cannot locate objid=%r' % objid)
|
||||
raise PDFSyntaxError('Cannot locate objid=%r' % objid)
|
||||
return None
|
||||
if strmid:
|
||||
stream = stream_value(self.getobj(strmid))
|
||||
if stream.dic['Type'] != LITERAL_OBJSTM:
|
||||
if stream.dic['Type'] is not LITERAL_OBJSTM:
|
||||
if STRICT:
|
||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||
try:
|
||||
|
@ -589,7 +371,7 @@ class PDFDocument(object):
|
|||
(_,genno) = self.parser.nexttoken() # genno
|
||||
#assert objid1 == objid, (objid, objid1)
|
||||
(_,kwd) = self.parser.nexttoken()
|
||||
if kwd != KEYWORD_OBJ:
|
||||
if kwd is not self.KEYWORD_OBJ:
|
||||
raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
|
||||
(_,obj) = self.parser.nextobject()
|
||||
if isinstance(obj, PDFStream):
|
||||
|
@ -611,13 +393,13 @@ class PDFDocument(object):
|
|||
for (k,v) in parent.iteritems():
|
||||
if k in self.INHERITABLE_ATTRS and k not in tree:
|
||||
tree[k] = v
|
||||
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
|
||||
if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Pages: Kids=%r' % tree['Kids']
|
||||
for c in tree['Kids']:
|
||||
for x in search(c, tree):
|
||||
yield x
|
||||
elif tree.get('Type') == LITERAL_PAGE:
|
||||
elif tree.get('Type') is LITERAL_PAGE:
|
||||
if 1 <= self.debug:
|
||||
print >>stderr, 'Page: %r' % tree
|
||||
yield (obj.objid, tree)
|
||||
|
@ -683,15 +465,20 @@ class PDFParser(PSStackParser):
|
|||
def __repr__(self):
|
||||
return '<PDFParser>'
|
||||
|
||||
KEYWORD_R = PSKeywordTable.intern('R')
|
||||
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
|
||||
KEYWORD_STREAM = PSKeywordTable.intern('stream')
|
||||
KEYWORD_XREF = PSKeywordTable.intern('xref')
|
||||
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
|
||||
def do_keyword(self, pos, token):
|
||||
if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
|
||||
if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
|
||||
self.add_results(*self.pop(1))
|
||||
return
|
||||
if token == KEYWORD_ENDOBJ:
|
||||
if token is self.KEYWORD_ENDOBJ:
|
||||
self.add_results(*self.pop(4))
|
||||
return
|
||||
|
||||
if token == KEYWORD_R:
|
||||
if token is self.KEYWORD_R:
|
||||
# reference to indirect object
|
||||
try:
|
||||
((_,objid), (_,genno)) = self.pop(2)
|
||||
|
@ -702,7 +489,7 @@ class PDFParser(PSStackParser):
|
|||
pass
|
||||
return
|
||||
|
||||
if token == KEYWORD_STREAM:
|
||||
if token is self.KEYWORD_STREAM:
|
||||
# stream object
|
||||
((_,dic),) = self.pop(1)
|
||||
dic = dict_value(dic)
|
||||
|
@ -710,7 +497,7 @@ class PDFParser(PSStackParser):
|
|||
objlen = int_value(dic['Length'])
|
||||
except KeyError:
|
||||
if STRICT:
|
||||
raise PDFValueError('/Length is undefined: %r' % dic)
|
||||
raise PDFSyntaxError('/Length is undefined: %r' % dic)
|
||||
objlen = 0
|
||||
self.seek(pos)
|
||||
try:
|
||||
|
@ -785,7 +572,7 @@ class PDFParser(PSStackParser):
|
|||
xref = PDFXRefStream()
|
||||
xref.load(self)
|
||||
else:
|
||||
if token != KEYWORD_XREF:
|
||||
if token is not self.KEYWORD_XREF:
|
||||
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
||||
(pos, token))
|
||||
self.nextline()
|
||||
|
@ -835,6 +622,7 @@ class PDFParser(PSStackParser):
|
|||
yield xref
|
||||
return
|
||||
|
||||
|
||||
## PDFObjStrmParser
|
||||
##
|
||||
class PDFObjStrmParser(PDFParser):
|
||||
|
|
|
@ -0,0 +1,222 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, zlib
|
||||
stderr = sys.stderr
|
||||
from pdflib.lzw import LZWDecoder
|
||||
from pdflib.psparser import PSException, PSObject, \
|
||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||
literal_name, keyword_name, STRICT
|
||||
|
||||
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
|
||||
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
|
||||
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
|
||||
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
|
||||
|
||||
|
||||
## PDF Objects
|
||||
##
|
||||
class PDFObject(PSObject): pass
|
||||
|
||||
class PDFException(PSException): pass
|
||||
class PDFTypeError(PDFException): pass
|
||||
class PDFValueError(PDFException): pass
|
||||
class PDFNotImplementedError(PSException): pass
|
||||
|
||||
|
||||
## PDFObjRef
|
||||
##
|
||||
class PDFObjRef(PDFObject):
|
||||
|
||||
def __init__(self, doc, objid, _):
|
||||
if objid == 0:
|
||||
if STRICT:
|
||||
raise PDFValueError('PDF object id cannot be 0.')
|
||||
self.doc = doc
|
||||
self.objid = objid
|
||||
#self.genno = genno # Never used.
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFObjRef:%d>' % (self.objid)
|
||||
|
||||
def resolve(self):
|
||||
return self.doc.getobj(self.objid)
|
||||
|
||||
|
||||
# resolve
|
||||
def resolve1(x):
|
||||
'''
|
||||
Resolve an object. If this is an array or dictionary,
|
||||
it may still contains some indirect objects inside.
|
||||
'''
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
return x
|
||||
|
||||
def resolve_all(x):
|
||||
'''
|
||||
Recursively resolve X and all the internals.
|
||||
Make sure there is no indirect reference within the nested object.
|
||||
This procedure might be slow.
|
||||
'''
|
||||
while isinstance(x, PDFObjRef):
|
||||
x = x.resolve()
|
||||
if isinstance(x, list):
|
||||
x = [ resolve_all(v) for v in x ]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
x[k] = resolve_all(v)
|
||||
return x
|
||||
|
||||
def decipher_all(decipher, objid, genno, x):
|
||||
'''
|
||||
Recursively decipher X.
|
||||
'''
|
||||
if isinstance(x, str):
|
||||
return decipher(objid, genno, x)
|
||||
if isinstance(x, list):
|
||||
x = [ decipher_all(decipher, objid, genno, v) for v in x ]
|
||||
elif isinstance(x, dict):
|
||||
for (k,v) in x.iteritems():
|
||||
x[k] = decipher_all(decipher, objid, genno, v)
|
||||
return x
|
||||
|
||||
# Type cheking
|
||||
def int_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, int):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Integer required: %r' % x)
|
||||
return 0
|
||||
return x
|
||||
|
||||
def float_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, float):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Float required: %r' % x)
|
||||
return 0.0
|
||||
return x
|
||||
|
||||
def num_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, int) or isinstance(x, float)):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Int or Float required: %r' % x)
|
||||
return 0
|
||||
return x
|
||||
|
||||
def str_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, str):
|
||||
if STRICT:
|
||||
raise PDFTypeError('String required: %r' % x)
|
||||
return ''
|
||||
return x
|
||||
|
||||
def list_value(x):
|
||||
x = resolve1(x)
|
||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||
if STRICT:
|
||||
raise PDFTypeError('List required: %r' % x)
|
||||
return []
|
||||
return x
|
||||
|
||||
def dict_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, dict):
|
||||
if STRICT:
|
||||
raise PDFTypeError('Dict required: %r' % x)
|
||||
return {}
|
||||
return x
|
||||
|
||||
def stream_value(x):
|
||||
x = resolve1(x)
|
||||
if not isinstance(x, PDFStream):
|
||||
if STRICT:
|
||||
raise PDFTypeError('PDFStream required: %r' % x)
|
||||
return PDFStream({}, '')
|
||||
return x
|
||||
|
||||
|
||||
## PDFStream type
|
||||
##
|
||||
class PDFStream(PDFObject):
|
||||
|
||||
def __init__(self, dic, rawdata, decipher=None):
|
||||
self.dic = dic
|
||||
self.rawdata = rawdata
|
||||
self.decipher = decipher
|
||||
self.data = None
|
||||
self.objid = None
|
||||
self.genno = None
|
||||
return
|
||||
|
||||
def set_objid(self, objid, genno):
|
||||
self.objid = objid
|
||||
self.genno = genno
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
|
||||
|
||||
def decode(self):
|
||||
assert self.data == None and self.rawdata != None
|
||||
data = self.rawdata
|
||||
if self.decipher:
|
||||
# Handle encryption
|
||||
data = self.decipher(self.objid, self.genno, data)
|
||||
if 'Filter' not in self.dic:
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
filters = self.dic['Filter']
|
||||
if not isinstance(filters, list):
|
||||
filters = [ filters ]
|
||||
for f in filters:
|
||||
if f in LITERALS_FLATE_DECODE:
|
||||
# will get errors if the document is encrypted.
|
||||
data = zlib.decompress(data)
|
||||
elif f in LITERALS_LZW_DECODE:
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
data = ''.join(LZWDecoder(StringIO(data)).run())
|
||||
elif f in LITERALS_ASCII85_DECODE:
|
||||
import ascii85
|
||||
data = ascii85.ascii85decode(data)
|
||||
elif f == LITERAL_CRYPT:
|
||||
raise PDFEncryptionError('/Crypt filter is unsupported')
|
||||
else:
|
||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||
# apply predictors
|
||||
params = self.dic.get('DecodeParms', {})
|
||||
if 'Predictor' in params:
|
||||
pred = int_value(params['Predictor'])
|
||||
if pred:
|
||||
if pred != 12:
|
||||
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
|
||||
if 'Columns' not in params:
|
||||
raise PDFValueError('Columns undefined for predictor=12')
|
||||
columns = int_value(params['Columns'])
|
||||
buf = ''
|
||||
ent0 = '\x00' * columns
|
||||
for i in xrange(0, len(data), columns+1):
|
||||
pred = data[i]
|
||||
ent1 = data[i+1:i+1+columns]
|
||||
if pred == '\x02':
|
||||
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
|
||||
buf += ent1
|
||||
ent0 = ent1
|
||||
data = buf
|
||||
self.data = data
|
||||
self.rawdata = None
|
||||
return
|
||||
|
||||
def get_data(self):
|
||||
if self.data == None:
|
||||
self.decode()
|
||||
return self.data
|
||||
|
||||
def get_rawdata(self):
|
||||
return self.rawdata
|
|
@ -1,7 +1,8 @@
|
|||
#!/usr/bin/env python
|
||||
import sys, re
|
||||
stderr = sys.stderr
|
||||
from utils import choplist
|
||||
|
||||
from pdflib.utils import choplist
|
||||
|
||||
STRICT = 0
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ from struct import unpack
|
|||
|
||||
## Matrix operations
|
||||
##
|
||||
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
|
||||
|
||||
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||
'''Multiplies two matrices.'''
|
||||
return (a0*a1+c0*b1, b0*a1+d0*b1,
|
||||
|
|
Loading…
Reference in New Issue