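Add non-strict mode: errors on malformed input are raised only when the STRICT flag is set; otherwise fallback values are used.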

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@16 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-01-20 04:44:16 +00:00 · 2008-01-20 04:44:16 +00:00 · 94859ea428
parent 80d17eb79b
commit 94859ea428
3 changed files with 184 additions and 109 deletions
--- a/pdfinterp.py
+++ b/pdfinterp.py
@ -7,7 +7,7 @@ try:
 except ImportError:
  from StringIO import StringIO
 from psparser import PSException, PSSyntaxError, PSTypeError, \
-     PSStackParser, PSLiteral, PSKeyword, \
+     PSStackParser, PSLiteral, PSKeyword, STRICT, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
 from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
     int_value, float_value, num_value, \
@ -84,14 +84,14 @@ class PDFFont:
  def __init__(self, descriptor, widths, default_width=None):
    self.descriptor = descriptor
    self.widths = widths
-    self.fontname = descriptor['FontName']
+    self.fontname = descriptor.get('FontName', 'unknown')
    if isinstance(self.fontname, PSLiteral):
      self.fontname = literal_name(self.fontname)
-    self.ascent = descriptor['Ascent']
+    self.ascent = num_value(descriptor.get('Ascent', 0))
-    self.descent = descriptor['Descent']
+    self.descent = num_value(descriptor.get('Descent', 0))
    self.default_width = default_width or descriptor.get('MissingWidth', 0)
-    self.leading = descriptor.get('Leading', 0)
+    self.leading = num_value(descriptor.get('Leading', 0))
-    self.bbox = list_value(descriptor['FontBBox'])
+    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
    return
  def __repr__(self):
@ -155,20 +155,20 @@ class PDFSimpleFont(PDFFont):
 class PDFType1Font(PDFSimpleFont):
  def __init__(self, spec):
-    if 'BaseFont' not in spec:
+    try:
      raise PDFFontError('BaseFont is missing')
      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
      if STRICT:
        raise PDFFontError('BaseFont is missing')
      self.basefont = 'unknown'
    try:
      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
    except KeyError:
-      try:
+      descriptor = dict_value(spec.get('FontDescriptor', {}))
-        descriptor = dict_value(spec['FontDescriptor'])
+      firstchar = int_value(spec.get('FirstChar', 0))
-        firstchar = int_value(spec['FirstChar'])
+      lastchar = int_value(spec.get('LastChar', 255))
-        lastchar = int_value(spec['LastChar'])
+      widths = list_value(spec.get('Widths', [0]*256))
-        widths = dict( (i+firstchar,w) for (i,w)
+      widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
                       in enumerate(list_value(spec['Widths'])) )
      except KeyError, k:
        raise PDFFontError('%s is missing' % k)
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
@ -179,13 +179,10 @@ class PDFTrueTypeFont(PDFType1Font):
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
  def __init__(self, spec):
-    try:
+    firstchar = int_value(spec.get('FirstChar', 0))
-      firstchar = int_value(spec['FirstChar'])
+    lastchar = int_value(spec.get('LastChar', 0))
-      lastchar = int_value(spec['LastChar'])
+    widths = list_value(spec.get('Widths', [0]*256))
-      widths = dict( (i+firstchar,w) for (i,w)
+    widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
                     in enumerate(list_value(spec['Widths'])) )
    except KeyError, k:
      raise PDFFontError('%s is missing' % k)
    if 'FontDescriptor' in spec:
      descriptor = dict_value(spec['FontDescriptor'])
    else:
@ -215,7 +212,8 @@ class TrueTypeFont:
    return
  def create_cmap(self):
-    if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
+    if 'cmap' not in self.tables:
      raise TrueTypeFont.CMapNotFound
    (base_offset, length) = self.tables['cmap']
    fp = self.fp
    fp.seek(base_offset)
@ -274,15 +272,15 @@ class TrueTypeFont:
 class PDFCIDFont(PDFFont):
  def __init__(self, spec):
    if 'BaseFont' not in spec:
      raise PDFFontError('BaseFont is missing')
    try:
      self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
      self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
                                  self.cidsysteminfo['Ordering'])
    except KeyError:
      raise PDFFontError('CIDSystemInfo not properly defined.')
      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
      if STRICT:
        raise PDFFontError('BaseFont is missing')
      self.basefont = 'unknown'
    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
    self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                self.cidsysteminfo.get('Ordering', 'unknown'))
    self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
    descriptor = dict_value(spec['FontDescriptor'])
    ttf = None
@ -391,11 +389,16 @@ class PDFResourceManager:
    if objid and objid in self.fonts:
      font = self.fonts[objid]
    else:
-      assert spec['Type'] == LITERAL_FONT
+      if STRICT:
        if spec['Type'] != LITERAL_FONT:
          raise PDFFontError('Type is not /Font')
      # Create a Font object.
-      if 'Subtype' not in spec:
+      if 'Subtype' in spec:
        raise PDFFontError('Font Subtype is not specified.')
        subtype = literal_name(spec['Subtype'])
      else:
        if STRICT:
          raise PDFFontError('Font Subtype is not specified.')
        subtype = 'Type1'
      if subtype in ('Type1', 'MMType1'):
        # Type1 Font
        font = PDFType1Font(spec)
@ -411,14 +414,16 @@ class PDFResourceManager:
      elif subtype == 'Type0':
        # Type0 Font
        dfonts = list_value(spec['DescendantFonts'])
-        assert len(dfonts) == 1
+        assert dfonts
        subspec = dict_value(dfonts[0]).copy()
        for k in ('Encoding', 'ToUnicode'):
          if k in spec:
            subspec[k] = resolve1(spec[k])
        font = self.get_font(None, subspec)
      else:
        if STRICT:
          raise PDFFontError('Invalid Font: %r' % spec)
        font = PDFType1Font(spec) # this is so wrong!
      if objid:
        self.fonts[objid] = font
    return font
@ -480,14 +485,17 @@ class PDFContentParser(PSStackParser):
      objs = self.partobj
      (type0, self.partobj) = self.context.pop()
      if len(objs) % 2 != 0:
        if STRICT:
          raise PSTypeError('invalid dictionary construct: %r' % objs)
      dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
      pos += len('ID ')
      self.fp.seek(pos)
      data = self.fp.read(8192) 
      # XXX how do we know the real length other than scanning?
      data = ''
      while 1:
        data += self.fp.read(4096)
        m = self.EOIPAT.search(data)
-      assert m
+        if m: break
      objlen = m.start(0)
      obj = PDFStream(dic, data[:objlen])
      self.push(obj)
@ -731,7 +739,9 @@ class PDFPageInterpreter:
    try:
      self.textstate.font = self.fontmap[literal_name(fontid)]
    except KeyError:
      if STRICT:
        raise PDFInterpreterError('Undefined font id: %r' % fontid)
      return
    self.textstate.fontsize = fontsize
    return
  # setrendering
@ -816,7 +826,9 @@ class PDFPageInterpreter:
    try:
      xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
      if STRICT:
        raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
      return
    if xobj.dic['Subtype'] == LITERAL_FORM:
      if 1 <= self.debug:
        print >>stderr, 'Processing xobj: %r' % xobj
@ -897,6 +909,7 @@ class PDFPageInterpreter:
              print >>stderr, 'exec: %s' % (obj.name)
            func()
        else:
          if STRICT:
            raise PDFInterpreterError('unknown operator: %r' % obj.name)
      else:
        self.push(obj)
--- a/pdfparser.py
+++ b/pdfparser.py
@ -24,7 +24,7 @@ from utils import choplist, nunpack
 from psparser import PSException, PSSyntaxError, PSTypeError, \
     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
     literal_name, keyword_name, \
-     PSStackParser
+     PSStackParser, STRICT
 ##  PDF Exceptions
@ -52,6 +52,7 @@ class PDFObjRef:
  def __init__(self, doc, objid, genno):
    if objid == 0:
      if STRICT:
        raise PDFValueError('objid cannot be 0.')
    self.doc = doc
    self.objid = objid
@ -94,43 +95,57 @@ def resolveall(x):
 def int_value(x):
  x = resolve1(x)
  if not isinstance(x, int):
    if STRICT:
      raise PDFTypeError('integer required: %r' % x)
    return 0
  return x
 def float_value(x):
  x = resolve1(x)
  if not isinstance(x, float):
    if STRICT:
      raise PDFTypeError('float required: %r' % x)
    return 0.0
  return x
 def num_value(x):
  x = resolve1(x)
  if not (isinstance(x, int) or isinstance(x, float)):
    if STRICT:
      raise PDFTypeError('int or float required: %r' % x)
    return 0
  return x
 def str_value(x):
  x = resolve1(x)
  if not isinstance(x, str):
    if STRICT:
      raise PDFTypeError('string required: %r' % x)
    return ''
  return x
 def list_value(x):
  x = resolve1(x)
  if not (isinstance(x, list) or isinstance(x, tuple)):
    if STRICT:
      raise PDFTypeError('list required: %r' % x)
    return []
  return x
 def dict_value(x):
  x = resolve1(x)
  if not isinstance(x, dict):
    if STRICT:
      raise PDFTypeError('dict required: %r' % x)
    return {}
  return x
 def stream_value(x):
  x = resolve1(x)
  if not isinstance(x, PDFStream):
    if STRICT:
      raise PDFTypeError('stream required: %r' % x)
    return PDFStream({}, '')
  return x
@ -186,6 +201,7 @@ class PDFStream:
              ent0 = ent1
            data = buf
      else:
        if STRICT:
          raise PDFValueError('Invalid filter spec: %r' % f)
    self.data = data
    self.rawdata = None
@ -235,11 +251,14 @@ class PDFXRef:
    while 1:
      (_, line) = parser.nextline()
      if not line:
        if STRICT:
          raise PDFSyntaxError('premature eof: %r' % parser)
        break
      line = line.strip()
      f = line.split(' ')
      if len(f) != 2:
        if line != 'trailer':
          if STRICT:
            raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
        break
      (start, nobjs) = map(long, f)
@ -250,7 +269,9 @@ class PDFXRef:
        (_, line) = parser.nextline()
        f = line.strip().split(' ')
        if len(f) != 3:
          if STRICT:
            raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
          continue
        (pos, genno, use) = f
        self.offsets.append((int(genno), long(pos), use))
    # read trailer
@ -259,9 +280,10 @@ class PDFXRef:
  def getpos(self, objid):
    if objid < self.objid0 or self.objid1 <= objid:
-      raise IndexError
+      raise IndexError(objid)
    (genno, pos, use) = self.offsets[objid-self.objid0]
    if use != 'n':
      if STRICT:
        raise PDFValueError('unused objid=%r' % objid)
    return (None, pos)
@ -272,6 +294,7 @@ class PDFXRefStream:
  def __init__(self, parser):
    (objid, genno, _, stream) = list_value(parser.parse())
    if STRICT:
      assert stream.dic['Type'] == LITERAL_XREF
    size = stream.dic['Size']
    (start, nobjs) = stream.dic.get('Index', (0,size))
@ -285,7 +308,7 @@ class PDFXRefStream:
  def getpos(self, objid):
    if objid < self.objid0 or self.objid1 <= objid:
-      raise IndexError
+      raise IndexError(objid)
    i = self.entlen * (objid-self.objid0)
    ent = self.data[i:i+self.entlen]
    f1 = nunpack(ent[:self.fl1], 1)
@ -334,7 +357,7 @@ class PDFDocument:
    return
  def getobj(self, objid):
-    assert self.xrefs
+    #assert self.xrefs
    if objid in self.objs:
      obj = self.objs[objid]
    else:
@ -345,13 +368,20 @@ class PDFDocument:
        except IndexError:
          pass
      else:
        if STRICT:
          raise PDFValueError('Cannot locate objid=%r' % objid)
        return None
      if strmid:
        stream = stream_value(self.getobj(strmid))
        if stream.dic['Type'] != LITERAL_OBJSTM:
          if STRICT:
            raise PDFSyntaxError('Not a stream object: %r' % stream)
-        if 'N' not in stream.dic:
+        try:
          n = stream.dic['N']
        except KeyError:
          if STRICT:
            raise PDFSyntaxError('N is not defined: %r' % stream)
          n = 0
        if strmid in self.parsed_objs:
          objs = self.parsed_objs[stream]
        else:
@ -363,8 +393,10 @@ class PDFDocument:
      else:
        prevpos = self.parser.seek(index)
        seq = list_value(self.parser.parse())
-        if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
+        if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ):
          if STRICT:
            raise PDFSyntaxError('invalid stream spec: %r' % seq)
          return None
        obj = seq[3]
        self.parser.seek(prevpos)
      if 2 <= self.debug:
@ -373,7 +405,7 @@ class PDFDocument:
    return obj
  def get_pages(self, debug=0):
-    assert self.xrefs
+    #assert self.xrefs
    def search(obj, parent):
      tree = dict_value(obj).copy()
      for (k,v) in parent.iteritems():
@ -397,6 +429,7 @@ class PDFDocument:
    self.root = root
    self.catalog = dict_value(self.root)
    if self.catalog['Type'] != LITERAL_CATALOG:
      if STRICT:
        raise PDFValueError('Catalog not found!')
    self.outline = self.catalog.get('Outline')
    return
@ -437,24 +470,24 @@ class PDFParser(PSStackParser):
      # stream object
      (dic,) = self.pop(1)
      dic = dict_value(dic)
-      if 'Length' not in dic:
+      try:
        raise PDFValueError('/Length is undefined: %r' % dic)
        objlen = int_value(dic['Length'])
      except KeyError:
        if STRICT:
          raise PDFValueError('/Length is undefined: %r' % dic)
        objlen = 0
      self.seek(pos)
      (_, line) = self.nextline()  # 'stream'
-      self.fp.seek(pos+len(line))
+      pos += len(line)
      self.fp.seek(pos)
      data = self.fp.read(objlen)
-      self.seek(pos+len(line)+objlen)
+      self.seek(pos+objlen)
      while 1:
        (linepos, line) = self.nextline()
-        if not line:
+        if not line or line.startswith('endstream'):
          raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
                               (linepos, line))
        if line.strip():
          if not line.startswith('endstream'):
            raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
                                 (linepos, line))
          break
        objlen += len(line)
        data += line
      if 1 <= self.debug:
        print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
              (pos, objlen, dic, data[:10])
@ -477,7 +510,9 @@ class PDFParser(PSStackParser):
      if line:
        prev = line
    else:
      if STRICT:
        raise PDFSyntaxError('startxref not found!')
      prev = 0
    if 1 <= self.debug:
      print >>stderr, 'xref found: pos=%r' % prev
    self.seek(long(prev))
@ -495,10 +530,11 @@ class PDFParser(PSStackParser):
        # XRefStream: PDF-1.5
        self.seek(linepos)
        xref = PDFXRefStream(self)
-      elif line.strip() != 'xref':
+      else:
        if line.strip() != 'xref':
          if STRICT:
            raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
                                 (linepos, line))
      else:
        xref = PDFXRef(self)
      yield xref
      trailer = xref.trailer
--- a/psparser.py
+++ b/psparser.py
@ -3,6 +3,8 @@ import sys, re
 stderr = sys.stderr
 from utils import choplist
 STRICT = 0
 ##  PS Exceptions
 ##
@ -73,12 +75,18 @@ PSKeywordTable = PSSymbolTable(PSKeyword)
 def literal_name(x):
  if not isinstance(x, PSLiteral):
    if STRICT:
      raise PSTypeError('literal required: %r' % x)
    else:
      return str(x)
  return x.name
 def keyword_name(x):
  if not isinstance(x, PSKeyword):
    if STRICT:
      raise PSTypeError('keyword required: %r' % x)
    else:
      return str(x)
  return x.name
@ -237,23 +245,30 @@ class PSBaseParser:
              s += s1[-1:]
              (linepos, line) = self.nextline()
              if not line:
                if STRICT:
                  raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                      (linepos, line))
                break
              charpos = 0
            elif charpos == len(line):
              s += s1
              (linepos, line) = self.nextline()
              if not line:
                if STRICT:
                  raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                                      (linepos, line))
                break
              charpos = 0
            else:
              s += s1
              break
-          if line[charpos] != ')':
+          if line[charpos] == ')':
            charpos += 1
          else:
            if STRICT:
              raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                  (linepos, line))
-          charpos += 1
+            pass
          def convesc(m):
            x = m.group(0)
            if x[1:].isdigit():
@ -271,10 +286,12 @@ class PSBaseParser:
          # hex string object
          ms = self.STRING_HEX.match(line, charpos)
          charpos = ms.end(0)
-          if line[charpos] != '>':
+          if line[charpos] == '>':
            charpos += 1
          else:
            if STRICT:
              raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                                  (linepos, line))
          charpos += 1
          def convhex(m1):
            return chr(int(m1.group(0), 16))
          s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
@ -341,6 +358,7 @@ class PSStackParser(PSBaseParser):
    Pop N objects from the stack.
    '''
    if len(self.partobj) < n:
      if STRICT:
        raise PSSyntaxError('stack too short < %d' % n)
    r = self.partobj[-n:]
    self.partobj = self.partobj[:-n]
@ -366,10 +384,16 @@ class PSStackParser(PSBaseParser):
      return
    def endobj(type1):
-      assert self.context
+      if not self.context:
        if STRICT:
          raise PSTypeError('stack empty.')
      obj = self.partobj
-      (type0, self.partobj) = self.context.pop()
+      (type0, partobj) = self.context[-1]
-      if type0 != type1:
+      if type0 == type1:
        self.partobj = partobj
        self.context.pop()
      else:
        if STRICT:
          raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
                            (type0, self.partobj, type1, obj))
      return obj
@ -407,6 +431,7 @@ class PSStackParser(PSBaseParser):
          # end dictionary
          objs = endobj('d')
          if len(objs) % 2 != 0:
            if STRICT:
              raise PSTypeError('invalid dictionary construct: %r' % objs)
          d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
          if 2 <= self.debug:
@ -415,4 +440,5 @@ class PSStackParser(PSBaseParser):
        elif self.do_token(pos, t):
          break
-    return endobj('o')
+    objs = endobj('o')
    return objs