add non-strict mode.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@16 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
80d17eb79b
commit
94859ea428
87
pdfinterp.py
87
pdfinterp.py
|
@ -7,7 +7,7 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||||
PSStackParser, PSLiteral, PSKeyword, \
|
PSStackParser, PSLiteral, PSKeyword, STRICT, \
|
||||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
PSLiteralTable, PSKeywordTable, literal_name, keyword_name
|
||||||
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
|
||||||
int_value, float_value, num_value, \
|
int_value, float_value, num_value, \
|
||||||
|
@ -84,14 +84,14 @@ class PDFFont:
|
||||||
def __init__(self, descriptor, widths, default_width=None):
|
def __init__(self, descriptor, widths, default_width=None):
|
||||||
self.descriptor = descriptor
|
self.descriptor = descriptor
|
||||||
self.widths = widths
|
self.widths = widths
|
||||||
self.fontname = descriptor['FontName']
|
self.fontname = descriptor.get('FontName', 'unknown')
|
||||||
if isinstance(self.fontname, PSLiteral):
|
if isinstance(self.fontname, PSLiteral):
|
||||||
self.fontname = literal_name(self.fontname)
|
self.fontname = literal_name(self.fontname)
|
||||||
self.ascent = descriptor['Ascent']
|
self.ascent = num_value(descriptor.get('Ascent', 0))
|
||||||
self.descent = descriptor['Descent']
|
self.descent = num_value(descriptor.get('Descent', 0))
|
||||||
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
self.default_width = default_width or descriptor.get('MissingWidth', 0)
|
||||||
self.leading = descriptor.get('Leading', 0)
|
self.leading = num_value(descriptor.get('Leading', 0))
|
||||||
self.bbox = list_value(descriptor['FontBBox'])
|
self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -155,20 +155,20 @@ class PDFSimpleFont(PDFFont):
|
||||||
class PDFType1Font(PDFSimpleFont):
|
class PDFType1Font(PDFSimpleFont):
|
||||||
|
|
||||||
def __init__(self, spec):
|
def __init__(self, spec):
|
||||||
if 'BaseFont' not in spec:
|
try:
|
||||||
raise PDFFontError('BaseFont is missing')
|
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('BaseFont is missing')
|
||||||
|
self.basefont = 'unknown'
|
||||||
try:
|
try:
|
||||||
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
(descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
|
||||||
except KeyError:
|
except KeyError:
|
||||||
try:
|
descriptor = dict_value(spec.get('FontDescriptor', {}))
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
firstchar = int_value(spec['FirstChar'])
|
lastchar = int_value(spec.get('LastChar', 255))
|
||||||
lastchar = int_value(spec['LastChar'])
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = dict( (i+firstchar,w) for (i,w)
|
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
|
||||||
in enumerate(list_value(spec['Widths'])) )
|
|
||||||
except KeyError, k:
|
|
||||||
raise PDFFontError('%s is missing' % k)
|
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -179,13 +179,10 @@ class PDFTrueTypeFont(PDFType1Font):
|
||||||
# PDFType3Font
|
# PDFType3Font
|
||||||
class PDFType3Font(PDFSimpleFont):
|
class PDFType3Font(PDFSimpleFont):
|
||||||
def __init__(self, spec):
|
def __init__(self, spec):
|
||||||
try:
|
firstchar = int_value(spec.get('FirstChar', 0))
|
||||||
firstchar = int_value(spec['FirstChar'])
|
lastchar = int_value(spec.get('LastChar', 0))
|
||||||
lastchar = int_value(spec['LastChar'])
|
widths = list_value(spec.get('Widths', [0]*256))
|
||||||
widths = dict( (i+firstchar,w) for (i,w)
|
widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
|
||||||
in enumerate(list_value(spec['Widths'])) )
|
|
||||||
except KeyError, k:
|
|
||||||
raise PDFFontError('%s is missing' % k)
|
|
||||||
if 'FontDescriptor' in spec:
|
if 'FontDescriptor' in spec:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
else:
|
else:
|
||||||
|
@ -215,7 +212,8 @@ class TrueTypeFont:
|
||||||
return
|
return
|
||||||
|
|
||||||
def create_cmap(self):
|
def create_cmap(self):
|
||||||
if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
|
if 'cmap' not in self.tables:
|
||||||
|
raise TrueTypeFont.CMapNotFound
|
||||||
(base_offset, length) = self.tables['cmap']
|
(base_offset, length) = self.tables['cmap']
|
||||||
fp = self.fp
|
fp = self.fp
|
||||||
fp.seek(base_offset)
|
fp.seek(base_offset)
|
||||||
|
@ -274,15 +272,15 @@ class TrueTypeFont:
|
||||||
class PDFCIDFont(PDFFont):
|
class PDFCIDFont(PDFFont):
|
||||||
|
|
||||||
def __init__(self, spec):
|
def __init__(self, spec):
|
||||||
if 'BaseFont' not in spec:
|
|
||||||
raise PDFFontError('BaseFont is missing')
|
|
||||||
try:
|
try:
|
||||||
self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
|
|
||||||
self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
|
|
||||||
self.cidsysteminfo['Ordering'])
|
|
||||||
except KeyError:
|
|
||||||
raise PDFFontError('CIDSystemInfo not properly defined.')
|
|
||||||
self.basefont = literal_name(spec['BaseFont'])
|
self.basefont = literal_name(spec['BaseFont'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('BaseFont is missing')
|
||||||
|
self.basefont = 'unknown'
|
||||||
|
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
|
||||||
|
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
|
||||||
|
self.cidsysteminfo.get('Ordering', 'unknown'))
|
||||||
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
|
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
ttf = None
|
ttf = None
|
||||||
|
@ -391,11 +389,16 @@ class PDFResourceManager:
|
||||||
if objid and objid in self.fonts:
|
if objid and objid in self.fonts:
|
||||||
font = self.fonts[objid]
|
font = self.fonts[objid]
|
||||||
else:
|
else:
|
||||||
assert spec['Type'] == LITERAL_FONT
|
if STRICT:
|
||||||
|
if spec['Type'] != LITERAL_FONT:
|
||||||
|
raise PDFFontError('Type is not /Font')
|
||||||
# Create a Font object.
|
# Create a Font object.
|
||||||
if 'Subtype' not in spec:
|
if 'Subtype' in spec:
|
||||||
raise PDFFontError('Font Subtype is not specified.')
|
|
||||||
subtype = literal_name(spec['Subtype'])
|
subtype = literal_name(spec['Subtype'])
|
||||||
|
else:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFFontError('Font Subtype is not specified.')
|
||||||
|
subtype = 'Type1'
|
||||||
if subtype in ('Type1', 'MMType1'):
|
if subtype in ('Type1', 'MMType1'):
|
||||||
# Type1 Font
|
# Type1 Font
|
||||||
font = PDFType1Font(spec)
|
font = PDFType1Font(spec)
|
||||||
|
@ -411,14 +414,16 @@ class PDFResourceManager:
|
||||||
elif subtype == 'Type0':
|
elif subtype == 'Type0':
|
||||||
# Type0 Font
|
# Type0 Font
|
||||||
dfonts = list_value(spec['DescendantFonts'])
|
dfonts = list_value(spec['DescendantFonts'])
|
||||||
assert len(dfonts) == 1
|
assert dfonts
|
||||||
subspec = dict_value(dfonts[0]).copy()
|
subspec = dict_value(dfonts[0]).copy()
|
||||||
for k in ('Encoding', 'ToUnicode'):
|
for k in ('Encoding', 'ToUnicode'):
|
||||||
if k in spec:
|
if k in spec:
|
||||||
subspec[k] = resolve1(spec[k])
|
subspec[k] = resolve1(spec[k])
|
||||||
font = self.get_font(None, subspec)
|
font = self.get_font(None, subspec)
|
||||||
else:
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PDFFontError('Invalid Font: %r' % spec)
|
raise PDFFontError('Invalid Font: %r' % spec)
|
||||||
|
font = PDFType1Font(spec) # this is so wrong!
|
||||||
if objid:
|
if objid:
|
||||||
self.fonts[objid] = font
|
self.fonts[objid] = font
|
||||||
return font
|
return font
|
||||||
|
@ -480,14 +485,17 @@ class PDFContentParser(PSStackParser):
|
||||||
objs = self.partobj
|
objs = self.partobj
|
||||||
(type0, self.partobj) = self.context.pop()
|
(type0, self.partobj) = self.context.pop()
|
||||||
if len(objs) % 2 != 0:
|
if len(objs) % 2 != 0:
|
||||||
|
if STRICT:
|
||||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
||||||
dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||||
pos += len('ID ')
|
pos += len('ID ')
|
||||||
self.fp.seek(pos)
|
self.fp.seek(pos)
|
||||||
data = self.fp.read(8192)
|
|
||||||
# XXX how do we know the real length other than scanning?
|
# XXX how do we know the real length other than scanning?
|
||||||
|
data = ''
|
||||||
|
while 1:
|
||||||
|
data += self.fp.read(4096)
|
||||||
m = self.EOIPAT.search(data)
|
m = self.EOIPAT.search(data)
|
||||||
assert m
|
if m: break
|
||||||
objlen = m.start(0)
|
objlen = m.start(0)
|
||||||
obj = PDFStream(dic, data[:objlen])
|
obj = PDFStream(dic, data[:objlen])
|
||||||
self.push(obj)
|
self.push(obj)
|
||||||
|
@ -731,7 +739,9 @@ class PDFPageInterpreter:
|
||||||
try:
|
try:
|
||||||
self.textstate.font = self.fontmap[literal_name(fontid)]
|
self.textstate.font = self.fontmap[literal_name(fontid)]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
raise PDFInterpreterError('Undefined font id: %r' % fontid)
|
raise PDFInterpreterError('Undefined font id: %r' % fontid)
|
||||||
|
return
|
||||||
self.textstate.fontsize = fontsize
|
self.textstate.fontsize = fontsize
|
||||||
return
|
return
|
||||||
# setrendering
|
# setrendering
|
||||||
|
@ -816,7 +826,9 @@ class PDFPageInterpreter:
|
||||||
try:
|
try:
|
||||||
xobj = stream_value(self.xobjmap[xobjid])
|
xobj = stream_value(self.xobjmap[xobjid])
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||||
|
return
|
||||||
if xobj.dic['Subtype'] == LITERAL_FORM:
|
if xobj.dic['Subtype'] == LITERAL_FORM:
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing xobj: %r' % xobj
|
print >>stderr, 'Processing xobj: %r' % xobj
|
||||||
|
@ -897,6 +909,7 @@ class PDFPageInterpreter:
|
||||||
print >>stderr, 'exec: %s' % (obj.name)
|
print >>stderr, 'exec: %s' % (obj.name)
|
||||||
func()
|
func()
|
||||||
else:
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PDFInterpreterError('unknown operator: %r' % obj.name)
|
raise PDFInterpreterError('unknown operator: %r' % obj.name)
|
||||||
else:
|
else:
|
||||||
self.push(obj)
|
self.push(obj)
|
||||||
|
|
76
pdfparser.py
76
pdfparser.py
|
@ -24,7 +24,7 @@ from utils import choplist, nunpack
|
||||||
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
from psparser import PSException, PSSyntaxError, PSTypeError, \
|
||||||
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
|
||||||
literal_name, keyword_name, \
|
literal_name, keyword_name, \
|
||||||
PSStackParser
|
PSStackParser, STRICT
|
||||||
|
|
||||||
|
|
||||||
## PDF Exceptions
|
## PDF Exceptions
|
||||||
|
@ -52,6 +52,7 @@ class PDFObjRef:
|
||||||
|
|
||||||
def __init__(self, doc, objid, genno):
|
def __init__(self, doc, objid, genno):
|
||||||
if objid == 0:
|
if objid == 0:
|
||||||
|
if STRICT:
|
||||||
raise PDFValueError('objid cannot be 0.')
|
raise PDFValueError('objid cannot be 0.')
|
||||||
self.doc = doc
|
self.doc = doc
|
||||||
self.objid = objid
|
self.objid = objid
|
||||||
|
@ -94,43 +95,57 @@ def resolveall(x):
|
||||||
def int_value(x):
|
def int_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, int):
|
if not isinstance(x, int):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('integer required: %r' % x)
|
raise PDFTypeError('integer required: %r' % x)
|
||||||
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def float_value(x):
|
def float_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, float):
|
if not isinstance(x, float):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('float required: %r' % x)
|
raise PDFTypeError('float required: %r' % x)
|
||||||
|
return 0.0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def num_value(x):
|
def num_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not (isinstance(x, int) or isinstance(x, float)):
|
if not (isinstance(x, int) or isinstance(x, float)):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('int or float required: %r' % x)
|
raise PDFTypeError('int or float required: %r' % x)
|
||||||
|
return 0
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def str_value(x):
|
def str_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, str):
|
if not isinstance(x, str):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('string required: %r' % x)
|
raise PDFTypeError('string required: %r' % x)
|
||||||
|
return ''
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def list_value(x):
|
def list_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not (isinstance(x, list) or isinstance(x, tuple)):
|
if not (isinstance(x, list) or isinstance(x, tuple)):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('list required: %r' % x)
|
raise PDFTypeError('list required: %r' % x)
|
||||||
|
return []
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def dict_value(x):
|
def dict_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, dict):
|
if not isinstance(x, dict):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('dict required: %r' % x)
|
raise PDFTypeError('dict required: %r' % x)
|
||||||
|
return {}
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def stream_value(x):
|
def stream_value(x):
|
||||||
x = resolve1(x)
|
x = resolve1(x)
|
||||||
if not isinstance(x, PDFStream):
|
if not isinstance(x, PDFStream):
|
||||||
|
if STRICT:
|
||||||
raise PDFTypeError('stream required: %r' % x)
|
raise PDFTypeError('stream required: %r' % x)
|
||||||
|
return PDFStream({}, '')
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
|
@ -186,6 +201,7 @@ class PDFStream:
|
||||||
ent0 = ent1
|
ent0 = ent1
|
||||||
data = buf
|
data = buf
|
||||||
else:
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PDFValueError('Invalid filter spec: %r' % f)
|
raise PDFValueError('Invalid filter spec: %r' % f)
|
||||||
self.data = data
|
self.data = data
|
||||||
self.rawdata = None
|
self.rawdata = None
|
||||||
|
@ -235,11 +251,14 @@ class PDFXRef:
|
||||||
while 1:
|
while 1:
|
||||||
(_, line) = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
if not line:
|
if not line:
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
raise PDFSyntaxError('premature eof: %r' % parser)
|
||||||
|
break
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
f = line.split(' ')
|
f = line.split(' ')
|
||||||
if len(f) != 2:
|
if len(f) != 2:
|
||||||
if line != 'trailer':
|
if line != 'trailer':
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
||||||
break
|
break
|
||||||
(start, nobjs) = map(long, f)
|
(start, nobjs) = map(long, f)
|
||||||
|
@ -250,7 +269,9 @@ class PDFXRef:
|
||||||
(_, line) = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
f = line.strip().split(' ')
|
f = line.strip().split(' ')
|
||||||
if len(f) != 3:
|
if len(f) != 3:
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
|
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
|
||||||
|
continue
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
self.offsets.append((int(genno), long(pos), use))
|
self.offsets.append((int(genno), long(pos), use))
|
||||||
# read trailer
|
# read trailer
|
||||||
|
@ -259,9 +280,10 @@ class PDFXRef:
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
if objid < self.objid0 or self.objid1 <= objid:
|
if objid < self.objid0 or self.objid1 <= objid:
|
||||||
raise IndexError
|
raise IndexError(objid)
|
||||||
(genno, pos, use) = self.offsets[objid-self.objid0]
|
(genno, pos, use) = self.offsets[objid-self.objid0]
|
||||||
if use != 'n':
|
if use != 'n':
|
||||||
|
if STRICT:
|
||||||
raise PDFValueError('unused objid=%r' % objid)
|
raise PDFValueError('unused objid=%r' % objid)
|
||||||
return (None, pos)
|
return (None, pos)
|
||||||
|
|
||||||
|
@ -272,6 +294,7 @@ class PDFXRefStream:
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self, parser):
|
||||||
(objid, genno, _, stream) = list_value(parser.parse())
|
(objid, genno, _, stream) = list_value(parser.parse())
|
||||||
|
if STRICT:
|
||||||
assert stream.dic['Type'] == LITERAL_XREF
|
assert stream.dic['Type'] == LITERAL_XREF
|
||||||
size = stream.dic['Size']
|
size = stream.dic['Size']
|
||||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||||
|
@ -285,7 +308,7 @@ class PDFXRefStream:
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
if objid < self.objid0 or self.objid1 <= objid:
|
if objid < self.objid0 or self.objid1 <= objid:
|
||||||
raise IndexError
|
raise IndexError(objid)
|
||||||
i = self.entlen * (objid-self.objid0)
|
i = self.entlen * (objid-self.objid0)
|
||||||
ent = self.data[i:i+self.entlen]
|
ent = self.data[i:i+self.entlen]
|
||||||
f1 = nunpack(ent[:self.fl1], 1)
|
f1 = nunpack(ent[:self.fl1], 1)
|
||||||
|
@ -334,7 +357,7 @@ class PDFDocument:
|
||||||
return
|
return
|
||||||
|
|
||||||
def getobj(self, objid):
|
def getobj(self, objid):
|
||||||
assert self.xrefs
|
#assert self.xrefs
|
||||||
if objid in self.objs:
|
if objid in self.objs:
|
||||||
obj = self.objs[objid]
|
obj = self.objs[objid]
|
||||||
else:
|
else:
|
||||||
|
@ -345,13 +368,20 @@ class PDFDocument:
|
||||||
except IndexError:
|
except IndexError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PDFValueError('Cannot locate objid=%r' % objid)
|
raise PDFValueError('Cannot locate objid=%r' % objid)
|
||||||
|
return None
|
||||||
if strmid:
|
if strmid:
|
||||||
stream = stream_value(self.getobj(strmid))
|
stream = stream_value(self.getobj(strmid))
|
||||||
if stream.dic['Type'] != LITERAL_OBJSTM:
|
if stream.dic['Type'] != LITERAL_OBJSTM:
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
raise PDFSyntaxError('Not a stream object: %r' % stream)
|
||||||
if 'N' not in stream.dic:
|
try:
|
||||||
|
n = stream.dic['N']
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('N is not defined: %r' % stream)
|
raise PDFSyntaxError('N is not defined: %r' % stream)
|
||||||
|
n = 0
|
||||||
if strmid in self.parsed_objs:
|
if strmid in self.parsed_objs:
|
||||||
objs = self.parsed_objs[stream]
|
objs = self.parsed_objs[stream]
|
||||||
else:
|
else:
|
||||||
|
@ -363,8 +393,10 @@ class PDFDocument:
|
||||||
else:
|
else:
|
||||||
prevpos = self.parser.seek(index)
|
prevpos = self.parser.seek(index)
|
||||||
seq = list_value(self.parser.parse())
|
seq = list_value(self.parser.parse())
|
||||||
if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ):
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
raise PDFSyntaxError('invalid stream spec: %r' % seq)
|
||||||
|
return None
|
||||||
obj = seq[3]
|
obj = seq[3]
|
||||||
self.parser.seek(prevpos)
|
self.parser.seek(prevpos)
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
|
@ -373,7 +405,7 @@ class PDFDocument:
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
def get_pages(self, debug=0):
|
def get_pages(self, debug=0):
|
||||||
assert self.xrefs
|
#assert self.xrefs
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
tree = dict_value(obj).copy()
|
tree = dict_value(obj).copy()
|
||||||
for (k,v) in parent.iteritems():
|
for (k,v) in parent.iteritems():
|
||||||
|
@ -397,6 +429,7 @@ class PDFDocument:
|
||||||
self.root = root
|
self.root = root
|
||||||
self.catalog = dict_value(self.root)
|
self.catalog = dict_value(self.root)
|
||||||
if self.catalog['Type'] != LITERAL_CATALOG:
|
if self.catalog['Type'] != LITERAL_CATALOG:
|
||||||
|
if STRICT:
|
||||||
raise PDFValueError('Catalog not found!')
|
raise PDFValueError('Catalog not found!')
|
||||||
self.outline = self.catalog.get('Outline')
|
self.outline = self.catalog.get('Outline')
|
||||||
return
|
return
|
||||||
|
@ -437,24 +470,24 @@ class PDFParser(PSStackParser):
|
||||||
# stream object
|
# stream object
|
||||||
(dic,) = self.pop(1)
|
(dic,) = self.pop(1)
|
||||||
dic = dict_value(dic)
|
dic = dict_value(dic)
|
||||||
if 'Length' not in dic:
|
try:
|
||||||
raise PDFValueError('/Length is undefined: %r' % dic)
|
|
||||||
objlen = int_value(dic['Length'])
|
objlen = int_value(dic['Length'])
|
||||||
|
except KeyError:
|
||||||
|
if STRICT:
|
||||||
|
raise PDFValueError('/Length is undefined: %r' % dic)
|
||||||
|
objlen = 0
|
||||||
self.seek(pos)
|
self.seek(pos)
|
||||||
(_, line) = self.nextline() # 'stream'
|
(_, line) = self.nextline() # 'stream'
|
||||||
self.fp.seek(pos+len(line))
|
pos += len(line)
|
||||||
|
self.fp.seek(pos)
|
||||||
data = self.fp.read(objlen)
|
data = self.fp.read(objlen)
|
||||||
self.seek(pos+len(line)+objlen)
|
self.seek(pos+objlen)
|
||||||
while 1:
|
while 1:
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not line:
|
if not line or line.startswith('endstream'):
|
||||||
raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
|
|
||||||
(linepos, line))
|
|
||||||
if line.strip():
|
|
||||||
if not line.startswith('endstream'):
|
|
||||||
raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
|
|
||||||
(linepos, line))
|
|
||||||
break
|
break
|
||||||
|
objlen += len(line)
|
||||||
|
data += line
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
|
||||||
(pos, objlen, dic, data[:10])
|
(pos, objlen, dic, data[:10])
|
||||||
|
@ -477,7 +510,9 @@ class PDFParser(PSStackParser):
|
||||||
if line:
|
if line:
|
||||||
prev = line
|
prev = line
|
||||||
else:
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('startxref not found!')
|
raise PDFSyntaxError('startxref not found!')
|
||||||
|
prev = 0
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'xref found: pos=%r' % prev
|
print >>stderr, 'xref found: pos=%r' % prev
|
||||||
self.seek(long(prev))
|
self.seek(long(prev))
|
||||||
|
@ -495,10 +530,11 @@ class PDFParser(PSStackParser):
|
||||||
# XRefStream: PDF-1.5
|
# XRefStream: PDF-1.5
|
||||||
self.seek(linepos)
|
self.seek(linepos)
|
||||||
xref = PDFXRefStream(self)
|
xref = PDFXRefStream(self)
|
||||||
elif line.strip() != 'xref':
|
else:
|
||||||
|
if line.strip() != 'xref':
|
||||||
|
if STRICT:
|
||||||
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
|
||||||
(linepos, line))
|
(linepos, line))
|
||||||
else:
|
|
||||||
xref = PDFXRef(self)
|
xref = PDFXRef(self)
|
||||||
yield xref
|
yield xref
|
||||||
trailer = xref.trailer
|
trailer = xref.trailer
|
||||||
|
|
42
psparser.py
42
psparser.py
|
@ -3,6 +3,8 @@ import sys, re
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist
|
from utils import choplist
|
||||||
|
|
||||||
|
STRICT = 0
|
||||||
|
|
||||||
|
|
||||||
## PS Exceptions
|
## PS Exceptions
|
||||||
##
|
##
|
||||||
|
@ -73,12 +75,18 @@ PSKeywordTable = PSSymbolTable(PSKeyword)
|
||||||
|
|
||||||
def literal_name(x):
|
def literal_name(x):
|
||||||
if not isinstance(x, PSLiteral):
|
if not isinstance(x, PSLiteral):
|
||||||
|
if STRICT:
|
||||||
raise PSTypeError('literal required: %r' % x)
|
raise PSTypeError('literal required: %r' % x)
|
||||||
|
else:
|
||||||
|
return str(x)
|
||||||
return x.name
|
return x.name
|
||||||
|
|
||||||
def keyword_name(x):
|
def keyword_name(x):
|
||||||
if not isinstance(x, PSKeyword):
|
if not isinstance(x, PSKeyword):
|
||||||
|
if STRICT:
|
||||||
raise PSTypeError('keyword required: %r' % x)
|
raise PSTypeError('keyword required: %r' % x)
|
||||||
|
else:
|
||||||
|
return str(x)
|
||||||
return x.name
|
return x.name
|
||||||
|
|
||||||
|
|
||||||
|
@ -237,23 +245,30 @@ class PSBaseParser:
|
||||||
s += s1[-1:]
|
s += s1[-1:]
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not line:
|
if not line:
|
||||||
|
if STRICT:
|
||||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||||
(linepos, line))
|
(linepos, line))
|
||||||
|
break
|
||||||
charpos = 0
|
charpos = 0
|
||||||
elif charpos == len(line):
|
elif charpos == len(line):
|
||||||
s += s1
|
s += s1
|
||||||
(linepos, line) = self.nextline()
|
(linepos, line) = self.nextline()
|
||||||
if not line:
|
if not line:
|
||||||
|
if STRICT:
|
||||||
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
|
||||||
(linepos, line))
|
(linepos, line))
|
||||||
|
break
|
||||||
charpos = 0
|
charpos = 0
|
||||||
else:
|
else:
|
||||||
s += s1
|
s += s1
|
||||||
break
|
break
|
||||||
if line[charpos] != ')':
|
if line[charpos] == ')':
|
||||||
|
charpos += 1
|
||||||
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||||
(linepos, line))
|
(linepos, line))
|
||||||
charpos += 1
|
pass
|
||||||
def convesc(m):
|
def convesc(m):
|
||||||
x = m.group(0)
|
x = m.group(0)
|
||||||
if x[1:].isdigit():
|
if x[1:].isdigit():
|
||||||
|
@ -271,10 +286,12 @@ class PSBaseParser:
|
||||||
# hex string object
|
# hex string object
|
||||||
ms = self.STRING_HEX.match(line, charpos)
|
ms = self.STRING_HEX.match(line, charpos)
|
||||||
charpos = ms.end(0)
|
charpos = ms.end(0)
|
||||||
if line[charpos] != '>':
|
if line[charpos] == '>':
|
||||||
|
charpos += 1
|
||||||
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
|
||||||
(linepos, line))
|
(linepos, line))
|
||||||
charpos += 1
|
|
||||||
def convhex(m1):
|
def convhex(m1):
|
||||||
return chr(int(m1.group(0), 16))
|
return chr(int(m1.group(0), 16))
|
||||||
s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
|
s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
|
||||||
|
@ -341,6 +358,7 @@ class PSStackParser(PSBaseParser):
|
||||||
Pop N objects from the stack.
|
Pop N objects from the stack.
|
||||||
'''
|
'''
|
||||||
if len(self.partobj) < n:
|
if len(self.partobj) < n:
|
||||||
|
if STRICT:
|
||||||
raise PSSyntaxError('stack too short < %d' % n)
|
raise PSSyntaxError('stack too short < %d' % n)
|
||||||
r = self.partobj[-n:]
|
r = self.partobj[-n:]
|
||||||
self.partobj = self.partobj[:-n]
|
self.partobj = self.partobj[:-n]
|
||||||
|
@ -366,10 +384,16 @@ class PSStackParser(PSBaseParser):
|
||||||
return
|
return
|
||||||
|
|
||||||
def endobj(type1):
|
def endobj(type1):
|
||||||
assert self.context
|
if not self.context:
|
||||||
|
if STRICT:
|
||||||
|
raise PSTypeError('stack empty.')
|
||||||
obj = self.partobj
|
obj = self.partobj
|
||||||
(type0, self.partobj) = self.context.pop()
|
(type0, partobj) = self.context[-1]
|
||||||
if type0 != type1:
|
if type0 == type1:
|
||||||
|
self.partobj = partobj
|
||||||
|
self.context.pop()
|
||||||
|
else:
|
||||||
|
if STRICT:
|
||||||
raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
|
raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
|
||||||
(type0, self.partobj, type1, obj))
|
(type0, self.partobj, type1, obj))
|
||||||
return obj
|
return obj
|
||||||
|
@ -407,6 +431,7 @@ class PSStackParser(PSBaseParser):
|
||||||
# end dictionary
|
# end dictionary
|
||||||
objs = endobj('d')
|
objs = endobj('d')
|
||||||
if len(objs) % 2 != 0:
|
if len(objs) % 2 != 0:
|
||||||
|
if STRICT:
|
||||||
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
raise PSTypeError('invalid dictionary construct: %r' % objs)
|
||||||
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
|
||||||
if 2 <= self.debug:
|
if 2 <= self.debug:
|
||||||
|
@ -415,4 +440,5 @@ class PSStackParser(PSBaseParser):
|
||||||
elif self.do_token(pos, t):
|
elif self.do_token(pos, t):
|
||||||
break
|
break
|
||||||
|
|
||||||
return endobj('o')
|
objs = endobj('o')
|
||||||
|
return objs
|
||||||
|
|
Loading…
Reference in New Issue