git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-01-10 09:14:46 +00:00
parent 24bdd33557
commit c41c279321
9 changed files with 740 additions and 700 deletions

View File

@ -2,10 +2,53 @@
import sys import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined from pdflib.pdffont import PDFUnicodeNotDefined
from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
## PDFDevice
##
class PDFDevice(object):
    """Device interface: holds a resource object and the current matrix.

    All notification hooks do nothing and return None.  The two
    ``render_*`` methods raise NotImplementedError.
    """

    debug = 0

    def __init__(self, rsrc):
        # The matrix stays None until set_ctm() is called.
        self.ctm = None
        self.rsrc = rsrc

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        """No-op hook."""
        pass

    def set_ctm(self, ctm):
        """Remember ``ctm`` as the current matrix."""
        self.ctm = ctm

    def begin_tag(self, tag, props=None):
        """No-op hook."""
        pass

    def end_tag(self):
        """No-op hook."""
        pass

    def do_tag(self, tag, props=None):
        """No-op hook."""
        pass

    def begin_page(self, page):
        """No-op hook."""
        pass

    def end_page(self, page):
        """No-op hook."""
        pass

    def begin_figure(self, name, bbox):
        """No-op hook."""
        pass

    def end_figure(self, name):
        """No-op hook."""
        pass

    def render_string(self, textstate, textmatrix, seq):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError

    def render_image(self, stream, size, matrix):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError
## PageItem ## PageItem
## ##
class PageItem(object): class PageItem(object):

View File

@ -3,10 +3,10 @@ import sys
stdout = sys.stdout stdout = sys.stdout
stderr = sys.stderr stderr = sys.stderr
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \ from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
PDFPageInterpreter, PDFUnicodeNotDefined from pdflib.pdffont import PDFUnicodeNotDefined
from pdflib.cmap import CMapDB from pdflib.cmap import CMapDB
from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator from pdflib.page import PDFDevice, PageItem, FigureItem, TextItem, PageAggregator
def enc(x, codec): def enc(x, codec):

35
pdflib/pdfcolor.py Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from pdflib.psparser import PSLiteralTable
## ColorSpace
##
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
class ColorSpace(object):
    """A color space name paired with its number of color components."""

    def __init__(self, name, ncomponents):
        self.ncomponents = ncomponents
        self.name = name

    def __repr__(self):
        fields = (self.name, self.ncomponents)
        return '<ColorSpace: %s, ncomponents=%d>' % fields
# Lookup table of the predefined color spaces: each name maps to a
# ColorSpace instance built from that name and its component count.
PREDEFINED_COLORSPACE = dict(
    (name, ColorSpace(name,n)) for (name,n) in {
        'CalRGB': 3,
        'CalGray': 1,
        'Lab': 3,
        'DeviceRGB': 3,
        'DeviceCMYK': 4,
        'DeviceGray': 1,
        'Separation': 1,
        'Indexed': 1,
        'Pattern': 1,
    }.iteritems())

341
pdflib/pdffont.py Normal file
View File

@ -0,0 +1,341 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from struct import pack, unpack
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
literal_name, keyword_name, STRICT
from pdflib.pdftypes import PDFException, \
resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
## Fonts
##
class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
# PDFFont
class PDFFont(object):
    """Font state shared by all font kinds: descriptor metrics and widths."""

    def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
        self.descriptor = descriptor
        self.widths = widths
        # FontName may arrive as a PSLiteral; store its plain name then.
        fontname = descriptor.get('FontName', 'unknown')
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        # A falsy default_width (None or 0) falls back to MissingWidth.
        if default_width:
            self.default_width = default_width
        else:
            self.default_width = descriptor.get('MissingWidth', 0)
        self.leading = num_value(descriptor.get('Leading', 0))
        self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
        # A falsy font_matrix falls back to the 1/1000 scaling matrix.
        if font_matrix:
            self.font_matrix = font_matrix
        else:
            self.font_matrix = (.001,0,0,.001,0,0)

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        """Always False for this base class."""
        return False

    def is_multibyte(self):
        """Always False for this base class."""
        return False

    def decode(self, bytes):
        """Map every character of ``bytes`` to its ordinal."""
        return map(ord, bytes)

    def char_width(self, cid):
        """Return the width stored for ``cid``, or the default width."""
        return self.widths.get(cid, self.default_width)

    def char_disp(self, cid):
        """Always 0 for this base class."""
        return 0

    def string_width(self, s):
        """Sum the widths of all codes that decode() yields for ``s``."""
        total = 0
        for cid in self.decode(s):
            total = total + self.char_width(cid)
        return total
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
    """Font whose codes map to Unicode through an encoding table or,
    when the font spec has a ``ToUnicode`` entry, through a parsed CMap."""

    def __init__(self, descriptor, widths, spec, font_matrix=None):
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        else:
            encoding = LITERAL_STANDARD_ENCODING
        if isinstance(encoding, dict):
            name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            # /Differences is optional in an encoding dictionary.  Default
            # to an empty list instead of None: list_value() rejects
            # non-list values when STRICT is set, so a missing key used to
            # abort font loading there, while the non-strict path already
            # produced an empty list.
            diff = list_value(encoding.get('Differences', []))
            self.encoding = EncodingDB.get_encoding(name, diff)
        else:
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
        return

    def to_unicode(self, cid):
        """Return the Unicode text for ``cid``.

        Without a ToUnicode CMap the encoding table is consulted.  With
        one, the CMap's code is decoded as big-endian 16-bit units.
        Raises PDFUnicodeNotDefined when no mapping is found.
        """
        if not self.ucs2_cmap:
            try:
                return self.encoding[cid]
            except KeyError:
                raise PDFUnicodeNotDefined(None, cid)
        code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(None, cid)
        chars = unpack('>%dH' % (len(code)/2), code)
        return ''.join( unichr(c) for c in chars )
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
    """Simple font that takes its metrics from FontMetricsDB when the base
    font is found there, and from the font spec otherwise."""

    def __init__(self, spec):
        try:
            basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            basefont = 'unknown'
        self.basefont = basefont
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
        except KeyError:
            # No stock metrics: build them from the spec itself.
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            firstchar = int_value(spec.get('FirstChar', 0))
            # LastChar is still passed through int_value(); the result is unused.
            int_value(spec.get('LastChar', 255))
            raw_widths = list_value(spec.get('Widths', [0]*256))
            widths = {}
            for (offset, width) in enumerate(raw_widths):
                widths[offset+firstchar] = width
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
    """Same as PDFType1Font apart from its repr()."""

    def __repr__(self):
        basefont = self.basefont
        return '<PDFTrueTypeFont: basefont=%r>' % basefont
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
    """Simple font whose widths, descriptor and font matrix all come from
    the font spec dictionary."""

    def __init__(self, spec):
        firstchar = int_value(spec.get('FirstChar', 0))
        # LastChar is still passed through int_value(); the result is unused.
        int_value(spec.get('LastChar', 0))
        raw_widths = list_value(spec.get('Widths', [0]*256))
        widths = {}
        for (offset, width) in enumerate(raw_widths):
            widths[offset+firstchar] = width
        if 'FontDescriptor' not in spec:
            # Build a stand-in descriptor from entries of the spec itself.
            descriptor = {'FontName':spec.get('Name'),
                          'Ascent':0, 'Descent':0,
                          'FontBBox':spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        matrix = tuple(list_value(spec.get('FontMatrix')))
        PDFSimpleFont.__init__(self, descriptor, widths, spec,
                               font_matrix=matrix)

    def __repr__(self):
        return '<PDFType3Font>'
# PDFCIDFont
## TrueTypeFont
##
class TrueTypeFont(object):
    """Reader for the table directory and ``cmap`` table of a TrueType font.

    ``fp`` is a seekable binary file object positioned at the start of
    the font data.  It is kept and read again by create_cmap().
    """

    class CMapNotFound(Exception):
        """Raised by create_cmap() when the font has no 'cmap' table."""
        pass

    def __init__(self, name, fp):
        self.name = name
        self.fp = fp
        self.tables = {}
        fp.read(4)  # leading 4-byte font type field; not used
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for i in xrange(ntables):
            # Named ``tag`` so the ``name`` parameter is not shadowed.
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        """Build a character-code to glyph-id mapping from the cmap table.

        Subtable formats 0, 2 and 4 are read; others are ignored.
        Returns the result of ``CMap(...).update(char2gid, gid2char)``.
        Raises TrueTypeFont.CMapNotFound if there is no 'cmap' table.
        """
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for i in xrange(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in xrange(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # The range offset counts from its own 2-byte field,
                    # which is the last field just read.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in xrange(entcount):
                        # unpack() always returns a tuple; take the integer.
                        # The old code kept the tuple, so "if gid" was
                        # always true and "gid += delta" raised TypeError.
                        gid = unpack('>H', fp.read(2))[0]
                        if gid:
                            # Delta is applied modulo 65536, as in format 4.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)  # padding between the end-code and start-code arrays
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (seg,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the location of its own
                        # array entry (pos + 2*seg), not from the array start.
                        fp.seek(pos + 2*seg + idr)
                        for c in xrange(sc, ec+1):
                            gid = unpack('>H', fp.read(2))[0]
                            if gid:
                                # Glyph 0 means "missing" and gets no delta.
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.iteritems() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        return CMap(cmapname).update(char2gid, gid2char)
class PDFCIDFont(PDFFont):
    """Multi-byte font built from a PDF font spec dictionary.

    The constructor resolves the encoding CMap, an optional code-to-Unicode
    CMap (``ucs2_cmap``) and the width tables, then initialises PDFFont.
    """

    def __init__(self, spec):
        # A missing BaseFont is an error only when STRICT is set.
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        # "<Registry>-<Ordering>", with 'unknown' for any missing part.
        self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                    self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        # CMap used by decode(); a lookup failure is reported as PDFFontError.
        try:
            self.cmap = CMapDB.get_cmap(name, strict=STRICT)
        except CMapDB.CMapNotFound, e:
            raise PDFFontError(e)
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        # Parse the FontFile2 stream as TrueType when the descriptor has one.
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        # Source of the Unicode mapping, in order of preference:
        #  1. the ToUnicode stream of the spec;
        #  2. for 'Adobe-Identity', a cmap built from the TrueType font
        #     (left as None if there is no font or no cmap table);
        #  3. otherwise the '<cidcoding>-UCS2' CMap from CMapDB.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
                                                 strict=STRICT)
            except CMapDB.CMapNotFound, e:
                raise PDFFontError(e)

        def get_width(seq):
            # Expand a flat width array into {cid: value}.  Two layouts are
            # recognised:  first [v0 v1 ...]  assigns v0, v1, ... to
            # consecutive cids starting at ``first``;  first last v
            # assigns ``v`` to every cid in first..last inclusive.
            dic = {}
            char1 = char2 = None
            for v in seq:
                if char1 == None:
                    char1 = v
                elif char2 == None and isinstance(v, int):
                    char2 = v
                else:
                    if char2 == None:
                        for (i,w) in enumerate(v):
                            dic[char1+i] = w
                    else:
                        for i in xrange(char1, char2+1):
                            dic[i] = v
                    char1 = char2 = None
            return dic

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            dic = get_width(list_value(spec.get('W2', [])))
            # NOTE(review): every value in ``dic`` is unpacked as a (d, w)
            # pair below -- confirm get_width() yields pairs for W2 input.
            widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
            self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
            (d,w) = spec.get('DW2', [880, -1000])
            default_width = w
            self.default_disp = d
        else:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        PDFFont.__init__(self, descriptor, widths, default_width=default_width)
        return

    def __repr__(self):
        return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

    def is_vertical(self):
        # Value reported by the encoding CMap at construction time.
        return self.vertical

    def is_multibyte(self):
        return True

    def decode(self, bytes):
        # Decoding is delegated to the encoding CMap.
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        # Per-cid displacement, or the default when none is recorded.
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        # Raises PDFUnicodeNotDefined when there is no Unicode CMap or the
        # CMap has no code for ``cid``; otherwise the code is decoded as
        # big-endian 16-bit units.
        if not self.ucs2_cmap:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        chars = unpack('>%dH' % (len(code)/2), code)
        return ''.join( unichr(c) for c in chars )

View File

@ -6,33 +6,22 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from StringIO import StringIO from StringIO import StringIO
from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ from pdflib.psparser import PSException, PSTypeError, PSEOF, \
PSStackParser, PSLiteral, PSKeyword, STRICT, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name PSStackParser, PSKeyword, STRICT
from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \ from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
int_value, float_value, num_value, \ resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value str_value, list_value, dict_value, stream_value
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
## Exceptions ## Exceptions
## ##
class PDFResourceError(PDFException): pass class PDFResourceError(PDFException): pass
class PDFInterpreterError(PDFException): pass class PDFInterpreterError(PDFException): pass
class PDFFontError(PDFException): pass
class PDFUnicodeNotDefined(PDFFontError): pass
## ColorSpace
##
class ColorSpace(object):
    """A color space name paired with its number of color components."""

    def __init__(self, name, ncomponents):
        self.ncomponents = ncomponents
        self.name = name

    def __repr__(self):
        fields = (self.name, self.ncomponents)
        return '<ColorSpace: %s, ncomponents=%d>' % fields
## Constants ## Constants
@ -42,344 +31,6 @@ LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_IMAGE = PSLiteralTable.intern('Image') LITERAL_IMAGE = PSLiteralTable.intern('Image')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
KEYWORD_BI = PSKeywordTable.intern('BI')
KEYWORD_ID = PSKeywordTable.intern('ID')
KEYWORD_EI = PSKeywordTable.intern('EI')
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
# Lookup table of the predefined color spaces: each name maps to a
# ColorSpace instance built from that name and its component count.
PREDEFINED_COLORSPACE = dict(
    (name, ColorSpace(name,n)) for (name,n) in {
        'CalRGB': 3,
        'CalGray': 1,
        'Lab': 3,
        'DeviceRGB': 3,
        'DeviceCMYK': 4,
        'DeviceGray': 1,
        'Separation': 1,
        'Indexed': 1,
        'Pattern': 1,
    }.iteritems())
## Fonts
##
# PDFFont
class PDFFont(object):
    """Font state shared by all font kinds: descriptor metrics and widths."""

    def __init__(self, descriptor, widths, default_width=None):
        self.descriptor = descriptor
        self.widths = widths
        # FontName may arrive as a PSLiteral; store its plain name then.
        fontname = descriptor.get('FontName', 'unknown')
        if isinstance(fontname, PSLiteral):
            fontname = literal_name(fontname)
        self.fontname = fontname
        self.ascent = num_value(descriptor.get('Ascent', 0))
        self.descent = num_value(descriptor.get('Descent', 0))
        # A falsy default_width (None or 0) falls back to MissingWidth.
        if default_width:
            self.default_width = default_width
        else:
            self.default_width = descriptor.get('MissingWidth', 0)
        self.leading = num_value(descriptor.get('Leading', 0))
        self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))

    def __repr__(self):
        return '<PDFFont>'

    def is_vertical(self):
        """Always False for this base class."""
        return False

    def is_multibyte(self):
        """Always False for this base class."""
        return False

    def decode(self, bytes):
        """Map every character of ``bytes`` to its ordinal."""
        return map(ord, bytes)

    def char_width(self, cid):
        """Return the width stored for ``cid``, or the default width."""
        return self.widths.get(cid, self.default_width)

    def char_disp(self, cid):
        """Always 0 for this base class."""
        return 0

    def string_width(self, s):
        """Sum the widths of all codes that decode() yields for ``s``."""
        total = 0
        for cid in self.decode(s):
            total = total + self.char_width(cid)
        return total
# PDFSimpleFont
class PDFSimpleFont(PDFFont):
    """Font whose codes map to Unicode through an encoding table or,
    when the font spec has a ``ToUnicode`` entry, through a parsed CMap."""

    def __init__(self, descriptor, widths, spec):
        # Font encoding is specified either by a name of
        # built-in encoding or a dictionary that describes
        # the differences.
        if 'Encoding' in spec:
            encoding = resolve1(spec['Encoding'])
        else:
            encoding = LITERAL_STANDARD_ENCODING
        if isinstance(encoding, dict):
            name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
            # /Differences is optional in an encoding dictionary.  Default
            # to an empty list instead of None: list_value() rejects
            # non-list values when STRICT is set, so a missing key used to
            # abort font loading there, while the non-strict path already
            # produced an empty list.
            diff = list_value(encoding.get('Differences', []))
            self.encoding = EncodingDB.get_encoding(name, diff)
        else:
            self.encoding = EncodingDB.get_encoding(literal_name(encoding))
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        PDFFont.__init__(self, descriptor, widths)
        return

    def to_unicode(self, cid):
        """Return the Unicode text for ``cid``.

        Without a ToUnicode CMap the encoding table is consulted.  With
        one, the CMap's code is decoded as big-endian 16-bit units.
        Raises PDFUnicodeNotDefined when no mapping is found.
        """
        if not self.ucs2_cmap:
            try:
                return self.encoding[cid]
            except KeyError:
                raise PDFUnicodeNotDefined(None, cid)
        code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(None, cid)
        chars = unpack('>%dH' % (len(code)/2), code)
        return ''.join( unichr(c) for c in chars )
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
    """Simple font that takes its metrics from FontMetricsDB when the base
    font is found there, and from the font spec otherwise."""

    def __init__(self, spec):
        try:
            basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            basefont = 'unknown'
        self.basefont = basefont
        try:
            (descriptor, widths) = FontMetricsDB.get_metrics(basefont)
        except KeyError:
            # No stock metrics: build them from the spec itself.
            descriptor = dict_value(spec.get('FontDescriptor', {}))
            firstchar = int_value(spec.get('FirstChar', 0))
            # LastChar is still passed through int_value(); the result is unused.
            int_value(spec.get('LastChar', 255))
            raw_widths = list_value(spec.get('Widths', [0]*256))
            widths = {}
            for (offset, width) in enumerate(raw_widths):
                widths[offset+firstchar] = width
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

    def __repr__(self):
        return '<PDFType1Font: basefont=%r>' % self.basefont
# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
    """Same as PDFType1Font apart from its repr()."""

    def __repr__(self):
        basefont = self.basefont
        return '<PDFTrueTypeFont: basefont=%r>' % basefont
# PDFType3Font
class PDFType3Font(PDFSimpleFont):
    """Simple font whose widths and descriptor come from the font spec
    dictionary."""

    def __init__(self, spec):
        firstchar = int_value(spec.get('FirstChar', 0))
        # LastChar is still passed through int_value(); the result is unused.
        int_value(spec.get('LastChar', 0))
        raw_widths = list_value(spec.get('Widths', [0]*256))
        widths = {}
        for (offset, width) in enumerate(raw_widths):
            widths[offset+firstchar] = width
        if 'FontDescriptor' not in spec:
            # Build a stand-in descriptor from entries of the spec itself.
            descriptor = {'FontName':spec.get('Name'),
                          'Ascent':0, 'Descent':0,
                          'FontBBox':spec['FontBBox']}
        else:
            descriptor = dict_value(spec['FontDescriptor'])
        PDFSimpleFont.__init__(self, descriptor, widths, spec)

    def __repr__(self):
        return '<PDFType3Font>'
# PDFCIDFont
## TrueTypeFont
##
class TrueTypeFont(object):
    """Reader for the table directory and ``cmap`` table of a TrueType font.

    ``fp`` is a seekable binary file object positioned at the start of
    the font data.  It is kept and read again by create_cmap().
    """

    class CMapNotFound(Exception):
        """Raised by create_cmap() when the font has no 'cmap' table."""
        pass

    def __init__(self, name, fp):
        self.name = name
        self.fp = fp
        self.tables = {}
        fp.read(4)  # leading 4-byte font type field; not used
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for i in xrange(ntables):
            # Named ``tag`` so the ``name`` parameter is not shadowed.
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        """Build a character-code to glyph-id mapping from the cmap table.

        Subtable formats 0, 2 and 4 are read; others are ignored.
        Returns the result of ``CMap(...).update(char2gid, gid2char)``.
        Raises TrueTypeFont.CMapNotFound if there is no 'cmap' table.
        """
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for i in xrange(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in xrange(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # The range offset counts from its own 2-byte field,
                    # which is the last field just read.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in xrange(entcount):
                        # unpack() always returns a tuple; take the integer.
                        # The old code kept the tuple, so "if gid" was
                        # always true and "gid += delta" raised TypeError.
                        gid = unpack('>H', fp.read(2))[0]
                        if gid:
                            # Delta is applied modulo 65536, as in format 4.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)  # padding between the end-code and start-code arrays
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (seg,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # idRangeOffset counts from the location of its own
                        # array entry (pos + 2*seg), not from the array start.
                        fp.seek(pos + 2*seg + idr)
                        for c in xrange(sc, ec+1):
                            gid = unpack('>H', fp.read(2))[0]
                            if gid:
                                # Glyph 0 means "missing" and gets no delta.
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in xrange(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.iteritems() )
        cmapname = 'Adobe-Identity-UCS-%s' % self.name
        return CMap(cmapname).update(char2gid, gid2char)
class PDFCIDFont(PDFFont):
    """Multi-byte font built from a PDF font spec dictionary.

    The constructor resolves the encoding CMap, an optional code-to-Unicode
    CMap (``ucs2_cmap``) and the width tables, then initialises PDFFont.
    """

    def __init__(self, spec):
        # A missing BaseFont is an error only when STRICT is set.
        try:
            self.basefont = literal_name(spec['BaseFont'])
        except KeyError:
            if STRICT:
                raise PDFFontError('BaseFont is missing')
            self.basefont = 'unknown'
        self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
        # "<Registry>-<Ordering>", with 'unknown' for any missing part.
        self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                    self.cidsysteminfo.get('Ordering', 'unknown'))
        try:
            name = literal_name(spec['Encoding'])
        except KeyError:
            if STRICT:
                raise PDFFontError('Encoding is unspecified')
            name = 'unknown'
        # CMap used by decode(); a lookup failure is reported as PDFFontError.
        try:
            self.cmap = CMapDB.get_cmap(name, strict=STRICT)
        except CMapDB.CMapNotFound, e:
            raise PDFFontError(e)
        try:
            descriptor = dict_value(spec['FontDescriptor'])
        except KeyError:
            if STRICT:
                raise PDFFontError('FontDescriptor is missing')
            descriptor = {}
        # Parse the FontFile2 stream as TrueType when the descriptor has one.
        ttf = None
        if 'FontFile2' in descriptor:
            self.fontfile = stream_value(descriptor.get('FontFile2'))
            ttf = TrueTypeFont(self.basefont,
                               StringIO(self.fontfile.get_data()))
        # Source of the Unicode mapping, in order of preference:
        #  1. the ToUnicode stream of the spec;
        #  2. for 'Adobe-Identity', a cmap built from the TrueType font
        #     (left as None if there is no font or no cmap table);
        #  3. otherwise the '<cidcoding>-UCS2' CMap from CMapDB.
        self.ucs2_cmap = None
        if 'ToUnicode' in spec:
            strm = stream_value(spec['ToUnicode'])
            self.ucs2_cmap = CMap()
            CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
        elif self.cidcoding == 'Adobe-Identity':
            if ttf:
                try:
                    self.ucs2_cmap = ttf.create_cmap()
                except TrueTypeFont.CMapNotFound:
                    pass
        else:
            try:
                self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
                                                 strict=STRICT)
            except CMapDB.CMapNotFound, e:
                raise PDFFontError(e)

        def get_width(seq):
            # Expand a flat width array into {cid: value}.  Two layouts are
            # recognised:  first [v0 v1 ...]  assigns v0, v1, ... to
            # consecutive cids starting at ``first``;  first last v
            # assigns ``v`` to every cid in first..last inclusive.
            dic = {}
            char1 = char2 = None
            for v in seq:
                if char1 == None:
                    char1 = v
                elif char2 == None and isinstance(v, int):
                    char2 = v
                else:
                    if char2 == None:
                        for (i,w) in enumerate(v):
                            dic[char1+i] = w
                    else:
                        for i in xrange(char1, char2+1):
                            dic[i] = v
                    char1 = char2 = None
            return dic

        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            # writing mode: vertical
            dic = get_width(list_value(spec.get('W2', [])))
            # NOTE(review): every value in ``dic`` is unpacked as a (d, w)
            # pair below -- confirm get_width() yields pairs for W2 input.
            widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
            self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
            (d,w) = spec.get('DW2', [880, -1000])
            default_width = w
            self.default_disp = d
        else:
            # writing mode: horizontal
            widths = get_width(list_value(spec.get('W', [])))
            self.disps = {}
            default_width = spec.get('DW', 1000)
            self.default_disp = 0
        PDFFont.__init__(self, descriptor, widths, default_width)
        return

    def __repr__(self):
        return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

    def is_vertical(self):
        # Value reported by the encoding CMap at construction time.
        return self.vertical

    def is_multibyte(self):
        return True

    def decode(self, bytes):
        # Decoding is delegated to the encoding CMap.
        return self.cmap.decode(bytes)

    def char_disp(self, cid):
        # Per-cid displacement, or the default when none is recorded.
        return self.disps.get(cid, self.default_disp)

    def to_unicode(self, cid):
        # Raises PDFUnicodeNotDefined when there is no Unicode CMap or the
        # CMap has no code for ``cid``; otherwise the code is decoded as
        # big-endian 16-bit units.
        if not self.ucs2_cmap:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        code = self.ucs2_cmap.tocode(cid)
        if not code:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
        chars = unpack('>%dH' % (len(code)/2), code)
        return ''.join( unichr(c) for c in chars )
## Resource Manager ## Resource Manager
@ -388,7 +39,7 @@ class PDFResourceManager(object):
''' '''
ResourceManager facilitates reuse of shared resources ResourceManager facilitates reuse of shared resources
such as fonts, images and cmaps so that large objects are not such as fonts and images so that large objects are not
allocated multiple times. allocated multiple times.
''' '''
debug = 0 debug = 0
@ -399,24 +50,21 @@ class PDFResourceManager(object):
def get_procset(self, procs): def get_procset(self, procs):
for proc in procs: for proc in procs:
if proc == LITERAL_PDF: if proc is LITERAL_PDF:
pass pass
elif proc == LITERAL_TEXT: elif proc is LITERAL_TEXT:
pass pass
else: else:
#raise PDFResourceError('ProcSet %r is not supported.' % proc) #raise PDFResourceError('ProcSet %r is not supported.' % proc)
pass pass
return return
def get_cmap(self, name):
return CMapDB.get_cmap(name, strict=STRICT)
def get_font(self, objid, spec): def get_font(self, objid, spec):
if objid and objid in self.fonts: if objid and objid in self.fonts:
font = self.fonts[objid] font = self.fonts[objid]
else: else:
if STRICT: if STRICT:
if spec['Type'] != LITERAL_FONT: if spec['Type'] is not LITERAL_FONT:
raise PDFFontError('Type is not /Font') raise PDFFontError('Type is not /Font')
# Create a Font object. # Create a Font object.
if 'Subtype' in spec: if 'Subtype' in spec:
@ -455,49 +103,6 @@ class PDFResourceManager(object):
return font return font
## PDFDevice
##
class PDFDevice(object):
    """Device interface: holds a resource object and the current matrix.

    All notification hooks do nothing and return None.  The two
    ``render_*`` methods raise NotImplementedError.
    """

    debug = 0

    def __init__(self, rsrc):
        # The matrix stays None until set_ctm() is called.
        self.ctm = None
        self.rsrc = rsrc

    def __repr__(self):
        return '<PDFDevice>'

    def close(self):
        """No-op hook."""
        pass

    def set_ctm(self, ctm):
        """Remember ``ctm`` as the current matrix."""
        self.ctm = ctm

    def begin_tag(self, tag, props=None):
        """No-op hook."""
        pass

    def end_tag(self):
        """No-op hook."""
        pass

    def do_tag(self, tag, props=None):
        """No-op hook."""
        pass

    def begin_page(self, page):
        """No-op hook."""
        pass

    def end_page(self, page):
        """No-op hook."""
        pass

    def begin_figure(self, name, bbox):
        """No-op hook."""
        pass

    def end_figure(self, name):
        """No-op hook."""
        pass

    def render_string(self, textstate, textmatrix, seq):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError

    def render_image(self, stream, size, matrix):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError
## PDFContentParser ## PDFContentParser
## ##
class PDFContentParser(PSStackParser): class PDFContentParser(PSStackParser):
@ -565,11 +170,14 @@ class PDFContentParser(PSStackParser):
self.add_results(*self.popall()) self.add_results(*self.popall())
return return
KEYWORD_BI = PSKeywordTable.intern('BI')
KEYWORD_ID = PSKeywordTable.intern('ID')
KEYWORD_EI = PSKeywordTable.intern('EI')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token == KEYWORD_BI: if token is self.KEYWORD_BI:
# inline image within a content stream # inline image within a content stream
self.start_type(pos, 'inline') self.start_type(pos, 'inline')
elif token == KEYWORD_ID: elif token is self.KEYWORD_ID:
try: try:
(_, objs) = self.end_type('inline') (_, objs) = self.end_type('inline')
if len(objs) % 2 != 0: if len(objs) % 2 != 0:
@ -578,7 +186,7 @@ class PDFContentParser(PSStackParser):
(pos, data) = self.get_inline_data(pos+len('ID ')) (pos, data) = self.get_inline_data(pos+len('ID '))
obj = PDFStream(d, data) obj = PDFStream(d, data)
self.push((pos, obj)) self.push((pos, obj))
self.push((pos, KEYWORD_EI)) self.push((pos, self.KEYWORD_EI))
except PSTypeError: except PSTypeError:
if STRICT: raise if STRICT: raise
else: else:
@ -975,7 +583,7 @@ class PDFPageInterpreter(object):
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj print >>stderr, 'Processing xobj: %r' % xobj
subtype = xobj.dic.get('Subtype') subtype = xobj.dic.get('Subtype')
if subtype == LITERAL_FORM and 'BBox' in xobj.dic: if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
interpreter = self.dup() interpreter = self.dup()
(x0,y0,x1,y1) = list_value(xobj.dic['BBox']) (x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm) ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
@ -985,7 +593,7 @@ class PDFPageInterpreter(object):
self.device.begin_figure(xobjid, bbox) self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm) interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
(x0,y0) = apply_matrix(self.ctm, (0,0)) (x0,y0) = apply_matrix(self.ctm, (0,0))
(x1,y1) = apply_matrix(self.ctm, (1,1)) (x1,y1) = apply_matrix(self.ctm, (1,1))
self.device.begin_figure(xobjid, (x0,y0,x1,y1)) self.device.begin_figure(xobjid, (x0,y0,x1,y1))

View File

@ -7,26 +7,22 @@
import sys, re import sys, re
import md5, struct import md5, struct
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack from pdflib.utils import choplist, nunpack
from arcfour import Arcfour from pdflib.arcfour import Arcfour
from lzw import LZWDecoder from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ STRICT
literal_name, keyword_name, \ from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
PSStackParser, STRICT PDFStream, PDFObjRef, resolve1, decipher_all, \
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
## PDF Exceptions ## Exceptions
## ##
class PDFException(PSException): pass
class PDFSyntaxError(PDFException): pass class PDFSyntaxError(PDFException): pass
class PDFNoValidXRef(PDFSyntaxError): pass class PDFNoValidXRef(PDFSyntaxError): pass
class PDFEncryptionError(PDFException): pass class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFNotImplementedError(PSException): pass
# some predefined literals and keywords. # some predefined literals and keywords.
LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
@ -34,258 +30,10 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGE = PSLiteralTable.intern('Page')
LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
PSLiteralTable.intern('Fl'))
LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
PSLiteralTable.intern('LZW'))
LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
PSLiteralTable.intern('A85'))
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
class PDFObject(PSObject):
    """Base class of PDFObjRef and PDFStream; adds nothing to PSObject."""
## PDFObjRef
##
class PDFObjRef(PDFObject):
    """Indirect reference to the object ``objid`` of document ``doc``."""

    def __init__(self, doc, objid, _):
        # Object id 0 is rejected only when STRICT is set.
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.objid = objid
        self.doc = doc
        #self.genno = genno # Never used.

    def __repr__(self):
        return '<PDFObjRef:%d>' % (self.objid)

    def resolve(self):
        """Fetch the referenced object through ``doc.getobj``."""
        target = self.doc.getobj(self.objid)
        return target
# resolve
def resolve1(x):
    '''
    Follow indirect references until a non-reference object is reached.
    If the result is an array or dictionary, it may still contain
    indirect objects inside.
    '''
    obj = x
    while True:
        if not isinstance(obj, PDFObjRef):
            return obj
        obj = obj.resolve()
def resolve_all(x):
    '''
    Recursively resolve X and everything nested inside it, so that no
    indirect reference remains.  Lists are rebuilt; dictionaries are
    updated in place.  This procedure might be slow.
    '''
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve()
    if isinstance(obj, list):
        return [ resolve_all(item) for item in obj ]
    if isinstance(obj, dict):
        # Only existing keys are reassigned, so the size never changes
        # while iterating.
        for (key,value) in obj.iteritems():
            obj[key] = resolve_all(value)
    return obj
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Every string found in X is replaced by decipher(objid, genno, s).
    Lists are rebuilt, dictionaries are updated in place, and any other
    value is returned untouched.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, dict):
        # Only existing keys are reassigned, so the key set is stable.
        for key in list(x):
            x[key] = decipher_all(decipher, objid, genno, x[key])
        return x
    if isinstance(x, list):
        return [ decipher_all(decipher, objid, genno, item) for item in x ]
    return x
# Type checking
def int_value(x):
    '''
    Resolve X and return it if it is an int.
    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if not isinstance(x, int):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Integer required: %r' % (x,))
        return 0
    return x
def float_value(x):
    '''
    Resolve X and return it if it is a float.
    Otherwise raise PDFTypeError when STRICT is set, or return 0.0.
    '''
    x = resolve1(x)
    if not isinstance(x, float):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Float required: %r' % (x,))
        return 0.0
    return x
def num_value(x):
    '''
    Resolve X and return it if it is an int or a float.
    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if not isinstance(x, (int, float)):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Int or Float required: %r' % (x,))
        return 0
    return x
def str_value(x):
    '''
    Resolve X and return it if it is a str.
    Otherwise raise PDFTypeError when STRICT is set, or return ''.
    '''
    x = resolve1(x)
    if not isinstance(x, str):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('String required: %r' % (x,))
        return ''
    return x
def list_value(x):
    '''
    Resolve X and return it if it is a list or a tuple.
    Otherwise raise PDFTypeError when STRICT is set, or return [].
    '''
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
def dict_value(x):
    '''
    Resolve X and return it if it is a dict.
    Otherwise raise PDFTypeError when STRICT is set, or return {}.
    '''
    x = resolve1(x)
    if not isinstance(x, dict):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Dict required: %r' % (x,))
        return {}
    return x
def stream_value(x):
    '''
    Resolve X and return it if it is a PDFStream.
    Otherwise raise PDFTypeError when STRICT is set, or return an
    empty PDFStream.
    '''
    x = resolve1(x)
    if not isinstance(x, PDFStream):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('PDFStream required: %r' % (x,))
        return PDFStream({}, '')
    return x
## PDFStream type
##
class PDFStream(PDFObject):
    '''
    A stream object: a dictionary plus a lazily decoded data payload.

    dic      -- the stream dictionary ('Filter' and 'DecodeParms' are
                consulted when decoding).
    rawdata  -- the undecoded data; replaced by None once decode() ran.
    decipher -- optional callable (objid, genno, data) -> data that is
                applied before any filter.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        '''Record the object id and generation number (passed to decipher).'''
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # decode() sets rawdata to None, so the raw length is only
        # available before decoding; afterwards report the decoded length.
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)

    def decode(self):
        '''
        Decipher and decode rawdata into self.data, then drop rawdata.
        Must be called at most once (get_data() takes care of that).

        Raises PDFEncryptionError for the /Crypt filter,
        PDFNotImplementedError for an unknown filter or a predictor
        other than 12, and PDFValueError when predictor 12 has no Columns.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' not in self.dic:
            # No filter: predictors are not applied either.
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f in LITERALS_FLATE_DECODE:
                import zlib
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
            elif f in LITERALS_LZW_DECODE:
                try:
                    from cStringIO import StringIO
                except ImportError:
                    from StringIO import StringIO
                data = ''.join(LZWDecoder(StringIO(data)).run())
            elif f in LITERALS_ASCII85_DECODE:
                import ascii85
                data = ascii85.ascii85decode(data)
            elif f == LITERAL_CRYPT:
                raise PDFEncryptionError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
        # apply predictors
        # NOTE(review): DecodeParms is used as-is; presumably it is always
        # a direct dictionary here -- confirm against the parser.
        params = self.dic.get('DecodeParms', {})
        if 'Predictor' in params:
            pred = int_value(params['Predictor'])
            if pred:
                if pred != 12:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                if 'Columns' not in params:
                    raise PDFValueError('Columns undefined for predictor=12')
                columns = int_value(params['Columns'])
                # Each row is one tag byte followed by `columns` data bytes.
                # Rows are collected and joined once instead of growing a
                # string with += on every row.
                rows = []
                prev = '\x00' * columns
                for i in xrange(0, len(data), columns+1):
                    tag = data[i]
                    row = data[i+1:i+1+columns]
                    if tag == '\x02':
                        # Bytes are stored as differences from the row above.
                        row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev,row) )
                    rows.append(row)
                    prev = row
                data = ''.join(rows)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the undecoded data (None once the stream was decoded).'''
        return self.rawdata
## PDFPage
##
class PDFPage(object):
    '''
    One page of a document, built from its attribute dictionary.

    doc    -- the owning document.
    pageid -- id of the page object.
    attrs  -- the page attributes; 'Resources' and 'MediaBox' must be
              present (KeyError otherwise).  'CropBox' falls back to
              the media box and 'Contents' is always stored as a list.
    '''

    def __init__(self, doc, pageid, attrs):
        self.doc = doc
        self.pageid = pageid
        attrs = dict_value(attrs)
        self.attrs = attrs
        self.lastmod = attrs.get('LastModified')
        self.resources = resolve1(attrs['Resources'])
        self.mediabox = resolve1(attrs['MediaBox'])
        cropbox = self.mediabox
        if 'CropBox' in attrs:
            cropbox = resolve1(attrs['CropBox'])
        self.cropbox = cropbox
        self.rotate = attrs.get('Rotate', 0)
        self.annots = attrs.get('Annots')
        self.beads = attrs.get('B')
        contents = []
        if 'Contents' in attrs:
            contents = resolve1(attrs['Contents'])
        if not isinstance(contents, list):
            # A single content stream is wrapped into a one-element list.
            contents = [ contents ]
        self.contents = contents

    def __repr__(self):
        shown = (self.resources, self.mediabox)
        return '<PDFPage: Resources=%r, MediaBox=%r>' % shown
## XRefs ## XRefs
##
## PDFXRef ## PDFXRef
## ##
@ -296,7 +44,7 @@ class PDFXRef(object):
return return
def objids(self): def objids(self):
return self.offsets.keys() return self.offsets.iterkeys()
def load(self, parser): def load(self, parser):
while 1: while 1:
@ -330,10 +78,11 @@ class PDFXRef(object):
self.load_trailer(parser) self.load_trailer(parser)
return return
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
def load_trailer(self, parser): def load_trailer(self, parser):
try: try:
(_,kwd) = parser.nexttoken() (_,kwd) = parser.nexttoken()
assert kwd == KEYWORD_TRAILER assert kwd is self.KEYWORD_TRAILER
(_,dic) = parser.nextobject() (_,dic) = parser.nextobject()
except PSEOF: except PSEOF:
x = parser.pop(1) x = parser.pop(1)
@ -350,7 +99,7 @@ class PDFXRef(object):
raise raise
if use != 'n': if use != 'n':
if STRICT: if STRICT:
raise PDFValueError('Unused objid=%r' % objid) raise PDFSyntaxError('Unused objid=%r' % objid)
return (None, pos) return (None, pos)
@ -367,14 +116,14 @@ class PDFXRefStream(object):
return return
def objids(self): def objids(self):
return range(self.objid0, self.objid1+1) return xrange(self.objid0, self.objid1)
def load(self, parser): def load(self, parser):
(_,objid) = parser.nexttoken() # ignored (_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken() (_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject() (_,stream) = parser.nextobject()
if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF: if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.') raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size'] size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size)) (start, nobjs) = stream.dic.get('Index', (0,size))
@ -402,6 +151,37 @@ class PDFXRefStream(object):
return (objid, index) return (objid, index)
## PDFPage
##
class PDFPage(object):
    '''
    One page of a document, built from its attribute dictionary.

    doc    -- the owning document.
    pageid -- id of the page object.
    attrs  -- the page attributes; 'Resources' and 'MediaBox' must be
              present (KeyError otherwise).  'CropBox' falls back to
              the media box and 'Contents' is always stored as a list.
    '''

    def __init__(self, doc, pageid, attrs):
        self.doc = doc
        self.pageid = pageid
        attrs = dict_value(attrs)
        self.attrs = attrs
        self.lastmod = resolve1(attrs.get('LastModified'))
        self.resources = resolve1(attrs['Resources'])
        self.mediabox = resolve1(attrs['MediaBox'])
        cropbox = self.mediabox
        if 'CropBox' in attrs:
            cropbox = resolve1(attrs['CropBox'])
        self.cropbox = cropbox
        self.rotate = attrs.get('Rotate', 0)
        self.annots = attrs.get('Annots')
        self.beads = attrs.get('B')
        contents = []
        if 'Contents' in attrs:
            contents = resolve1(attrs['Contents'])
        if not isinstance(contents, list):
            # A single content stream is wrapped into a one-element list.
            contents = [ contents ]
        self.contents = contents

    def __repr__(self):
        shown = (self.resources, self.mediabox)
        return '<PDFPage: Resources=%r, MediaBox=%r>' % shown
## PDFDocument ## PDFDocument
## ##
## A PDFDocument object represents a PDF document. ## A PDFDocument object represents a PDF document.
@ -463,15 +243,16 @@ class PDFDocument(object):
def set_root(self, root): def set_root(self, root):
self.root = root self.root = root
self.catalog = dict_value(self.root) self.catalog = dict_value(self.root)
if self.catalog.get('Type') != LITERAL_CATALOG: if self.catalog.get('Type') is not LITERAL_CATALOG:
if STRICT: if STRICT:
raise PDFValueError('Catalog not found!') raise PDFSyntaxError('Catalog not found!')
return return
# initialize(password='') # initialize(password='')
# Perform the initialization with a given password. # Perform the initialization with a given password.
# This step is mandatory even if there's no password associated # This step is mandatory even if there's no password associated
# with the document. # with the document.
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
def initialize(self, password=''): def initialize(self, password=''):
if not self.encryption: if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True self.is_printable = self.is_modifiable = self.is_extractable = True
@ -494,7 +275,7 @@ class PDFDocument(object):
self.is_modifiable = bool(P & 8) self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16) self.is_extractable = bool(P & 16)
# Algorithm 3.2 # Algorithm 3.2
password = (password+PASSWORD_PADDING)[:32] # 1 password = (password+self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2 hash = md5.md5(password) # 2
hash.update(O) # 3 hash.update(O) # 3
hash.update(struct.pack('<l', P)) # 4 hash.update(struct.pack('<l', P)) # 4
@ -512,7 +293,7 @@ class PDFDocument(object):
u1 = Arcfour(key).process(password) u1 = Arcfour(key).process(password)
elif R == 3: elif R == 3:
# Algorithm 3.5 # Algorithm 3.5
hash = md5.md5(PASSWORD_PADDING) # 2 hash = md5.md5(self.PASSWORD_PADDING) # 2
hash.update(docid[0]) # 3 hash.update(docid[0]) # 3
x = Arcfour(key).process(hash.digest()[:16]) # 4 x = Arcfour(key).process(hash.digest()[:16]) # 4
for i in xrange(1,19+1): for i in xrange(1,19+1):
@ -536,6 +317,7 @@ class PDFDocument(object):
key = hash.digest()[:min(len(key),16)] key = hash.digest()[:min(len(key),16)]
return Arcfour(key).process(data) return Arcfour(key).process(data)
KEYWORD_OBJ = PSKeywordTable.intern('obj')
def getobj(self, objid): def getobj(self, objid):
if not self.ready: if not self.ready:
raise PDFException('PDFDocument not initialized') raise PDFException('PDFDocument not initialized')
@ -554,11 +336,11 @@ class PDFDocument(object):
pass pass
else: else:
if STRICT: if STRICT:
raise PDFValueError('Cannot locate objid=%r' % objid) raise PDFSyntaxError('Cannot locate objid=%r' % objid)
return None return None
if strmid: if strmid:
stream = stream_value(self.getobj(strmid)) stream = stream_value(self.getobj(strmid))
if stream.dic['Type'] != LITERAL_OBJSTM: if stream.dic['Type'] is not LITERAL_OBJSTM:
if STRICT: if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream) raise PDFSyntaxError('Not a stream object: %r' % stream)
try: try:
@ -589,7 +371,7 @@ class PDFDocument(object):
(_,genno) = self.parser.nexttoken() # genno (_,genno) = self.parser.nexttoken() # genno
#assert objid1 == objid, (objid, objid1) #assert objid1 == objid, (objid, objid1)
(_,kwd) = self.parser.nexttoken() (_,kwd) = self.parser.nexttoken()
if kwd != KEYWORD_OBJ: if kwd is not self.KEYWORD_OBJ:
raise PDFSyntaxError('Invalid object spec: offset=%r' % index) raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
(_,obj) = self.parser.nextobject() (_,obj) = self.parser.nextobject()
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
@ -611,13 +393,13 @@ class PDFDocument(object):
for (k,v) in parent.iteritems(): for (k,v) in parent.iteritems():
if k in self.INHERITABLE_ATTRS and k not in tree: if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree: if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Pages: Kids=%r' % tree['Kids'] print >>stderr, 'Pages: Kids=%r' % tree['Kids']
for c in tree['Kids']: for c in tree['Kids']:
for x in search(c, tree): for x in search(c, tree):
yield x yield x
elif tree.get('Type') == LITERAL_PAGE: elif tree.get('Type') is LITERAL_PAGE:
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Page: %r' % tree print >>stderr, 'Page: %r' % tree
yield (obj.objid, tree) yield (obj.objid, tree)
@ -683,15 +465,20 @@ class PDFParser(PSStackParser):
def __repr__(self): def __repr__(self):
return '<PDFParser>' return '<PDFParser>'
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
def do_keyword(self, pos, token): def do_keyword(self, pos, token):
if token in (KEYWORD_XREF, KEYWORD_STARTXREF): if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
self.add_results(*self.pop(1)) self.add_results(*self.pop(1))
return return
if token == KEYWORD_ENDOBJ: if token is self.KEYWORD_ENDOBJ:
self.add_results(*self.pop(4)) self.add_results(*self.pop(4))
return return
if token == KEYWORD_R: if token is self.KEYWORD_R:
# reference to indirect object # reference to indirect object
try: try:
((_,objid), (_,genno)) = self.pop(2) ((_,objid), (_,genno)) = self.pop(2)
@ -702,7 +489,7 @@ class PDFParser(PSStackParser):
pass pass
return return
if token == KEYWORD_STREAM: if token is self.KEYWORD_STREAM:
# stream object # stream object
((_,dic),) = self.pop(1) ((_,dic),) = self.pop(1)
dic = dict_value(dic) dic = dict_value(dic)
@ -710,7 +497,7 @@ class PDFParser(PSStackParser):
objlen = int_value(dic['Length']) objlen = int_value(dic['Length'])
except KeyError: except KeyError:
if STRICT: if STRICT:
raise PDFValueError('/Length is undefined: %r' % dic) raise PDFSyntaxError('/Length is undefined: %r' % dic)
objlen = 0 objlen = 0
self.seek(pos) self.seek(pos)
try: try:
@ -785,7 +572,7 @@ class PDFParser(PSStackParser):
xref = PDFXRefStream() xref = PDFXRefStream()
xref.load(self) xref.load(self)
else: else:
if token != KEYWORD_XREF: if token is not self.KEYWORD_XREF:
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
(pos, token)) (pos, token))
self.nextline() self.nextline()
@ -835,6 +622,7 @@ class PDFParser(PSStackParser):
yield xref yield xref
return return
## PDFObjStrmParser ## PDFObjStrmParser
## ##
class PDFObjStrmParser(PDFParser): class PDFObjStrmParser(PDFParser):

222
pdflib/pdftypes.py Normal file
View File

@ -0,0 +1,222 @@
#!/usr/bin/env python
import sys, zlib
stderr = sys.stderr
from pdflib.lzw import LZWDecoder
from pdflib.psparser import PSException, PSObject, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, STRICT
# Literal names of the stream filters handled by PDFStream.decode().
# Each LITERALS_* tuple lists the spellings accepted for one filter.
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERALS_FLATE_DECODE = (
    PSLiteralTable.intern('FlateDecode'),
    PSLiteralTable.intern('Fl'),
)
LITERALS_LZW_DECODE = (
    PSLiteralTable.intern('LZWDecode'),
    PSLiteralTable.intern('LZW'),
)
LITERALS_ASCII85_DECODE = (
    PSLiteralTable.intern('ASCII85Decode'),
    PSLiteralTable.intern('A85'),
)
## PDF Objects
##
class PDFObject(PSObject):
    '''Common base class of the PDF object types (PDFObjRef, PDFStream).'''


class PDFException(PSException):
    '''Base class of the PDF-specific errors.'''


class PDFTypeError(PDFException):
    '''Raised when a value does not have the required type.'''


class PDFValueError(PDFException):
    '''Raised when a value is not acceptable.'''


# NOTE(review): unlike the errors above this one derives from PSException,
# not PDFException, so "except PDFException" does not catch it -- confirm
# whether that is intended.
class PDFNotImplementedError(PSException):
    '''Raised for a feature that is not supported.'''
## PDFObjRef
##
class PDFObjRef(PDFObject):
    '''
    An indirect reference to an object held by a document.

    doc   -- object whose getobj(objid) returns the referenced object.
    objid -- id of the referenced object.
    The third constructor argument (the generation number) is accepted
    but never stored.
    '''

    def __init__(self, doc, objid, _):
        # An object id of 0 is rejected in strict mode only; otherwise
        # the reference is created anyway.
        if objid == 0 and STRICT:
            raise PDFValueError('PDF object id cannot be 0.')
        self.objid = objid
        self.doc = doc

    def __repr__(self):
        return '<PDFObjRef:%d>' % self.objid

    def resolve(self):
        '''Return the object this reference points to.'''
        target = self.doc.getobj(self.objid)
        return target
# resolve
def resolve1(x):
    '''
    Follow indirect references until a direct object is reached.
    Only the outermost object is resolved: an array or dictionary
    may still contain indirect objects inside.
    '''
    obj = x
    while 1:
        if not isinstance(obj, PDFObjRef):
            return obj
        obj = obj.resolve()
def resolve_all(x):
    '''
    Resolve X and, recursively, everything nested inside it, so that
    no indirect reference remains within the returned object.
    Lists are rebuilt; dictionaries are updated in place.
    This procedure might be slow.
    '''
    obj = x
    while isinstance(obj, PDFObjRef):
        obj = obj.resolve()
    if isinstance(obj, dict):
        # Only existing keys are reassigned, so the key set is stable.
        for key in list(obj):
            obj[key] = resolve_all(obj[key])
        return obj
    if isinstance(obj, list):
        return [ resolve_all(item) for item in obj ]
    return obj
def decipher_all(decipher, objid, genno, x):
    '''
    Recursively decipher X.

    Every string found in X is replaced by decipher(objid, genno, s).
    Lists are rebuilt, dictionaries are updated in place, and any other
    value is returned untouched.
    '''
    if isinstance(x, str):
        return decipher(objid, genno, x)
    if isinstance(x, dict):
        # Only existing keys are reassigned, so the key set is stable.
        for key in list(x):
            x[key] = decipher_all(decipher, objid, genno, x[key])
        return x
    if isinstance(x, list):
        return [ decipher_all(decipher, objid, genno, item) for item in x ]
    return x
# Type checking
def int_value(x):
    '''
    Resolve X and return it if it is an int.
    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if not isinstance(x, int):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Integer required: %r' % (x,))
        return 0
    return x
def float_value(x):
    '''
    Resolve X and return it if it is a float.
    Otherwise raise PDFTypeError when STRICT is set, or return 0.0.
    '''
    x = resolve1(x)
    if not isinstance(x, float):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Float required: %r' % (x,))
        return 0.0
    return x
def num_value(x):
    '''
    Resolve X and return it if it is an int or a float.
    Otherwise raise PDFTypeError when STRICT is set, or return 0.
    '''
    x = resolve1(x)
    if not isinstance(x, (int, float)):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Int or Float required: %r' % (x,))
        return 0
    return x
def str_value(x):
    '''
    Resolve X and return it if it is a str.
    Otherwise raise PDFTypeError when STRICT is set, or return ''.
    '''
    x = resolve1(x)
    if not isinstance(x, str):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('String required: %r' % (x,))
        return ''
    return x
def list_value(x):
    '''
    Resolve X and return it if it is a list or a tuple.
    Otherwise raise PDFTypeError when STRICT is set, or return [].
    '''
    value = resolve1(x)
    if isinstance(value, (list, tuple)):
        return value
    if STRICT:
        raise PDFTypeError('List required: %r' % value)
    return []
def dict_value(x):
    '''
    Resolve X and return it if it is a dict.
    Otherwise raise PDFTypeError when STRICT is set, or return {}.
    '''
    x = resolve1(x)
    if not isinstance(x, dict):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('Dict required: %r' % (x,))
        return {}
    return x
def stream_value(x):
    '''
    Resolve X and return it if it is a PDFStream.
    Otherwise raise PDFTypeError when STRICT is set, or return an
    empty PDFStream.
    '''
    x = resolve1(x)
    if not isinstance(x, PDFStream):
        if STRICT:
            # Wrap the value in a 1-tuple: a bare tuple on the right of %
            # would be unpacked as the format arguments and raise
            # TypeError (or be misreported) instead of PDFTypeError.
            raise PDFTypeError('PDFStream required: %r' % (x,))
        return PDFStream({}, '')
    return x
## PDFStream type
##
class PDFStream(PDFObject):
    '''
    A stream object: a dictionary plus a lazily decoded data payload.

    dic      -- the stream dictionary ('Filter' and 'DecodeParms' are
                consulted when decoding).
    rawdata  -- the undecoded data; replaced by None once decode() ran.
    decipher -- optional callable (objid, genno, data) -> data that is
                applied before any filter.
    '''

    def __init__(self, dic, rawdata, decipher=None):
        self.dic = dic
        self.rawdata = rawdata
        self.decipher = decipher
        self.data = None
        self.objid = None
        self.genno = None
        return

    def set_objid(self, objid, genno):
        '''Record the object id and generation number (passed to decipher).'''
        self.objid = objid
        self.genno = genno
        return

    def __repr__(self):
        # decode() sets rawdata to None, so the raw length is only
        # available before decoding; afterwards report the decoded length.
        if self.rawdata is not None:
            return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
        return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)

    def decode(self):
        '''
        Decipher and decode rawdata into self.data, then drop rawdata.
        Must be called at most once (get_data() takes care of that).

        Raises PDFNotImplementedError for the /Crypt filter, an unknown
        filter or a predictor other than 12, and PDFValueError when
        predictor 12 has no Columns.
        '''
        assert self.data is None and self.rawdata is not None
        data = self.rawdata
        if self.decipher:
            # Handle encryption
            data = self.decipher(self.objid, self.genno, data)
        if 'Filter' not in self.dic:
            # No filter: predictors are not applied either.
            self.data = data
            self.rawdata = None
            return
        filters = self.dic['Filter']
        if not isinstance(filters, list):
            filters = [ filters ]
        for f in filters:
            if f in LITERALS_FLATE_DECODE:
                # will get errors if the document is encrypted.
                data = zlib.decompress(data)
            elif f in LITERALS_LZW_DECODE:
                try:
                    from cStringIO import StringIO
                except ImportError:
                    from StringIO import StringIO
                data = ''.join(LZWDecoder(StringIO(data)).run())
            elif f in LITERALS_ASCII85_DECODE:
                import ascii85
                data = ascii85.ascii85decode(data)
            elif f == LITERAL_CRYPT:
                # PDFEncryptionError is neither defined nor imported in
                # this module, so raising it would surface as a NameError;
                # report the unsupported filter with an error defined here.
                raise PDFNotImplementedError('/Crypt filter is unsupported')
            else:
                raise PDFNotImplementedError('Unsupported filter: %r' % f)
        # apply predictors
        # NOTE(review): DecodeParms is used as-is; presumably it is always
        # a direct dictionary here -- confirm against the parser.
        params = self.dic.get('DecodeParms', {})
        if 'Predictor' in params:
            pred = int_value(params['Predictor'])
            if pred:
                if pred != 12:
                    raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
                if 'Columns' not in params:
                    raise PDFValueError('Columns undefined for predictor=12')
                columns = int_value(params['Columns'])
                # Each row is one tag byte followed by `columns` data bytes.
                # Rows are collected and joined once instead of growing a
                # string with += on every row.
                rows = []
                prev = '\x00' * columns
                for i in xrange(0, len(data), columns+1):
                    tag = data[i]
                    row = data[i+1:i+1+columns]
                    if tag == '\x02':
                        # Bytes are stored as differences from the row above.
                        row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev,row) )
                    rows.append(row)
                    prev = row
                data = ''.join(rows)
        self.data = data
        self.rawdata = None
        return

    def get_data(self):
        '''Return the decoded data, decoding it on first use.'''
        if self.data is None:
            self.decode()
        return self.data

    def get_rawdata(self):
        '''Return the undecoded data (None once the stream was decoded).'''
        return self.rawdata

View File

@ -1,7 +1,8 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys, re import sys, re
stderr = sys.stderr stderr = sys.stderr
from utils import choplist
from pdflib.utils import choplist
STRICT = 0 STRICT = 0

View File

@ -4,6 +4,8 @@ from struct import unpack
## Matrix operations ## Matrix operations
## ##
MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
'''Multiplies two matrices.''' '''Multiplies two matrices.'''
return (a0*a1+c0*b1, b0*a1+d0*b1, return (a0*a1+c0*b1, b0*a1+d0*b1,