add non-strict mode.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@16 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-01-20 04:44:16 +00:00 · 2008-01-20 04:44:16 +00:00 · 94859ea428
parent 80d17eb79b
commit 94859ea428
3 changed files with 184 additions and 109 deletions
--- a/pdfinterp.py
+++ b/pdfinterp.py
@@ -7,7 +7,7 @@ try:
 except ImportError:
  from StringIO import StringIO
 from psparser import PSException, PSSyntaxError, PSTypeError, \
-     PSStackParser, PSLiteral, PSKeyword, \
+     PSStackParser, PSLiteral, PSKeyword, STRICT, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
 from pdfparser import PDFException, PDFStream, PDFObjRef, resolve1, \
     int_value, float_value, num_value, \
@@ -84,14 +84,14 @@ class PDFFont:
  def __init__(self, descriptor, widths, default_width=None):
    self.descriptor = descriptor
    self.widths = widths
-    self.fontname = descriptor['FontName']
+    self.fontname = descriptor.get('FontName', 'unknown')
    if isinstance(self.fontname, PSLiteral):
      self.fontname = literal_name(self.fontname)
-    self.ascent = descriptor['Ascent']
-    self.descent = descriptor['Descent']
+    self.ascent = num_value(descriptor.get('Ascent', 0))
+    self.descent = num_value(descriptor.get('Descent', 0))
    self.default_width = default_width or descriptor.get('MissingWidth', 0)
-    self.leading = descriptor.get('Leading', 0)
-    self.bbox = list_value(descriptor['FontBBox'])
+    self.leading = num_value(descriptor.get('Leading', 0))
+    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
    return

  def __repr__(self):
@@ -155,20 +155,20 @@ class PDFSimpleFont(PDFFont):
 class PDFType1Font(PDFSimpleFont):
  
  def __init__(self, spec):
-    if 'BaseFont' not in spec:
-      raise PDFFontError('BaseFont is missing')
-    self.basefont = literal_name(spec['BaseFont'])
+    try:
+      self.basefont = literal_name(spec['BaseFont'])
+    except KeyError:
+      if STRICT:
+        raise PDFFontError('BaseFont is missing')
+      self.basefont = 'unknown'
    try:
      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
    except KeyError:
-      try:
-        descriptor = dict_value(spec['FontDescriptor'])
-        firstchar = int_value(spec['FirstChar'])
-        lastchar = int_value(spec['LastChar'])
-        widths = dict( (i+firstchar,w) for (i,w)
-                       in enumerate(list_value(spec['Widths'])) )
-      except KeyError, k:
-        raise PDFFontError('%s is missing' % k)
+      descriptor = dict_value(spec.get('FontDescriptor', {}))
+      firstchar = int_value(spec.get('FirstChar', 0))
+      lastchar = int_value(spec.get('LastChar', 255))
+      widths = list_value(spec.get('Widths', [0]*256))
+      widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return

@@ -179,13 +179,10 @@ class PDFTrueTypeFont(PDFType1Font):
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
  def __init__(self, spec):
-    try:
-      firstchar = int_value(spec['FirstChar'])
-      lastchar = int_value(spec['LastChar'])
-      widths = dict( (i+firstchar,w) for (i,w)
-                     in enumerate(list_value(spec['Widths'])) )
-    except KeyError, k:
-      raise PDFFontError('%s is missing' % k)
+    firstchar = int_value(spec.get('FirstChar', 0))
+    lastchar = int_value(spec.get('LastChar', 0))
+    widths = list_value(spec.get('Widths', [0]*256))
+    widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
    if 'FontDescriptor' in spec:
      descriptor = dict_value(spec['FontDescriptor'])
    else:
@@ -215,7 +212,8 @@ class TrueTypeFont:
    return

  def create_cmap(self):
-    if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
+    if 'cmap' not in self.tables:
+      raise TrueTypeFont.CMapNotFound
    (base_offset, length) = self.tables['cmap']
    fp = self.fp
    fp.seek(base_offset)
@@ -274,15 +272,15 @@ class TrueTypeFont:
 class PDFCIDFont(PDFFont):
  
  def __init__(self, spec):
-    if 'BaseFont' not in spec:
-      raise PDFFontError('BaseFont is missing')
    try:
-      self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
-      self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
-                                  self.cidsysteminfo['Ordering'])
+      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
-      raise PDFFontError('CIDSystemInfo not properly defined.')
-    self.basefont = literal_name(spec['BaseFont'])
+      if STRICT:
+        raise PDFFontError('BaseFont is missing')
+      self.basefont = 'unknown'
+    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
+    self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
+                                self.cidsysteminfo.get('Ordering', 'unknown'))
    self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
    descriptor = dict_value(spec['FontDescriptor'])
    ttf = None
@@ -391,11 +389,16 @@ class PDFResourceManager:
    if objid and objid in self.fonts:
      font = self.fonts[objid]
    else:
-      assert spec['Type'] == LITERAL_FONT
+      if STRICT:
+        if spec['Type'] != LITERAL_FONT:
+          raise PDFFontError('Type is not /Font')
      # Create a Font object.
-      if 'Subtype' not in spec:
-        raise PDFFontError('Font Subtype is not specified.')
-      subtype = literal_name(spec['Subtype'])
+      if 'Subtype' in spec:
+        subtype = literal_name(spec['Subtype'])
+      else:
+        if STRICT:
+          raise PDFFontError('Font Subtype is not specified.')
+        subtype = 'Type1'
      if subtype in ('Type1', 'MMType1'):
        # Type1 Font
        font = PDFType1Font(spec)
@@ -411,14 +414,16 @@ class PDFResourceManager:
      elif subtype == 'Type0':
        # Type0 Font
        dfonts = list_value(spec['DescendantFonts'])
-        assert len(dfonts) == 1
+        assert dfonts
        subspec = dict_value(dfonts[0]).copy()
        for k in ('Encoding', 'ToUnicode'):
          if k in spec:
            subspec[k] = resolve1(spec[k])
        font = self.get_font(None, subspec)
      else:
-        raise PDFFontError('Invalid Font: %r' % spec)
+        if STRICT:
+          raise PDFFontError('Invalid Font: %r' % spec)
+        font = PDFType1Font(spec) # this is so wrong!
      if objid:
        self.fonts[objid] = font
    return font
@@ -480,14 +485,17 @@ class PDFContentParser(PSStackParser):
      objs = self.partobj
      (type0, self.partobj) = self.context.pop()
      if len(objs) % 2 != 0:
-        raise PSTypeError('invalid dictionary construct: %r' % objs)
+        if STRICT:
+          raise PSTypeError('invalid dictionary construct: %r' % objs)
      dic = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
      pos += len('ID ')
      self.fp.seek(pos)
-      data = self.fp.read(8192) 
      # XXX how do we know the real length other than scanning?
-      m = self.EOIPAT.search(data)
-      assert m
+      data = ''
+      while 1:
+        data += self.fp.read(4096)
+        m = self.EOIPAT.search(data)
+        if m: break
      objlen = m.start(0)
      obj = PDFStream(dic, data[:objlen])
      self.push(obj)
@@ -731,7 +739,9 @@ class PDFPageInterpreter:
    try:
      self.textstate.font = self.fontmap[literal_name(fontid)]
    except KeyError:
-      raise PDFInterpreterError('Undefined font id: %r' % fontid)
+      if STRICT:
+        raise PDFInterpreterError('Undefined font id: %r' % fontid)
+      return
    self.textstate.fontsize = fontsize
    return
  # setrendering
@@ -816,7 +826,9 @@ class PDFPageInterpreter:
    try:
      xobj = stream_value(self.xobjmap[xobjid])
    except KeyError:
-      raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
+      if STRICT:
+        raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
+      return
    if xobj.dic['Subtype'] == LITERAL_FORM:
      if 1 <= self.debug:
        print >>stderr, 'Processing xobj: %r' % xobj
@@ -897,7 +909,8 @@ class PDFPageInterpreter:
              print >>stderr, 'exec: %s' % (obj.name)
            func()
        else:
-          raise PDFInterpreterError('unknown operator: %r' % obj.name)
+          if STRICT:
+            raise PDFInterpreterError('unknown operator: %r' % obj.name)
      else:
        self.push(obj)
    return
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -24,7 +24,7 @@ from utils import choplist, nunpack
 from psparser import PSException, PSSyntaxError, PSTypeError, \
     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
     literal_name, keyword_name, \
-     PSStackParser
+     PSStackParser, STRICT


 ##  PDF Exceptions
@@ -52,7 +52,8 @@ class PDFObjRef:
  
  def __init__(self, doc, objid, genno):
    if objid == 0:
-      raise PDFValueError('objid cannot be 0.')
+      if STRICT:
+        raise PDFValueError('objid cannot be 0.')
    self.doc = doc
    self.objid = objid
    #self.genno = genno  # Never used.
@@ -94,43 +95,57 @@ def resolveall(x):
 def int_value(x):
  x = resolve1(x)
  if not isinstance(x, int):
-    raise PDFTypeError('integer required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('integer required: %r' % x)
+    return 0
  return x

 def float_value(x):
  x = resolve1(x)
  if not isinstance(x, float):
-    raise PDFTypeError('float required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('float required: %r' % x)
+    return 0.0
  return x

 def num_value(x):
  x = resolve1(x)
  if not (isinstance(x, int) or isinstance(x, float)):
-    raise PDFTypeError('int or float required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('int or float required: %r' % x)
+    return 0
  return x

 def str_value(x):
  x = resolve1(x)
  if not isinstance(x, str):
-    raise PDFTypeError('string required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('string required: %r' % x)
+    return ''
  return x

 def list_value(x):
  x = resolve1(x)
  if not (isinstance(x, list) or isinstance(x, tuple)):
-    raise PDFTypeError('list required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('list required: %r' % x)
+    return []
  return x

 def dict_value(x):
  x = resolve1(x)
  if not isinstance(x, dict):
-    raise PDFTypeError('dict required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('dict required: %r' % x)
+    return {}
  return x

 def stream_value(x):
  x = resolve1(x)
  if not isinstance(x, PDFStream):
-    raise PDFTypeError('stream required: %r' % x)
+    if STRICT:
+      raise PDFTypeError('stream required: %r' % x)
+    return PDFStream({}, '')
  return x


@@ -186,7 +201,8 @@ class PDFStream:
              ent0 = ent1
            data = buf
      else:
-        raise PDFValueError('Invalid filter spec: %r' % f)
+        if STRICT:
+          raise PDFValueError('Invalid filter spec: %r' % f)
    self.data = data
    self.rawdata = None
    return
@@ -235,12 +251,15 @@ class PDFXRef:
    while 1:
      (_, line) = parser.nextline()
      if not line:
-        raise PDFSyntaxError('premature eof: %r' % parser)
+        if STRICT:
+          raise PDFSyntaxError('premature eof: %r' % parser)
+        break
      line = line.strip()
      f = line.split(' ')
      if len(f) != 2:
        if line != 'trailer':
-          raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
+          if STRICT:
+            raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
        break
      (start, nobjs) = map(long, f)
      self.objid0 = start
@@ -250,7 +269,9 @@ class PDFXRef:
        (_, line) = parser.nextline()
        f = line.strip().split(' ')
        if len(f) != 3:
-          raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
+          if STRICT:
+            raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
+          continue
        (pos, genno, use) = f
        self.offsets.append((int(genno), long(pos), use))
    # read trailer
@@ -259,10 +280,11 @@ class PDFXRef:

  def getpos(self, objid):
    if objid < self.objid0 or self.objid1 <= objid:
-      raise IndexError
+      raise IndexError(objid)
    (genno, pos, use) = self.offsets[objid-self.objid0]
    if use != 'n':
-      raise PDFValueError('unused objid=%r' % objid)
+      if STRICT:
+        raise PDFValueError('unused objid=%r' % objid)
    return (None, pos)


@@ -272,7 +294,8 @@ class PDFXRefStream:

  def __init__(self, parser):
    (objid, genno, _, stream) = list_value(parser.parse())
-    assert stream.dic['Type'] == LITERAL_XREF
+    if STRICT:
+      assert stream.dic['Type'] == LITERAL_XREF
    size = stream.dic['Size']
    (start, nobjs) = stream.dic.get('Index', (0,size))
    self.objid0 = start
@@ -285,7 +308,7 @@ class PDFXRefStream:

  def getpos(self, objid):
    if objid < self.objid0 or self.objid1 <= objid:
-      raise IndexError
+      raise IndexError(objid)
    i = self.entlen * (objid-self.objid0)
    ent = self.data[i:i+self.entlen]
    f1 = nunpack(ent[:self.fl1], 1)
@@ -334,7 +357,7 @@ class PDFDocument:
    return

  def getobj(self, objid):
-    assert self.xrefs
+    #assert self.xrefs
    if objid in self.objs:
      obj = self.objs[objid]
    else:
@@ -345,13 +368,20 @@ class PDFDocument:
        except IndexError:
          pass
      else:
-        raise PDFValueError('Cannot locate objid=%r' % objid)
+        if STRICT:
+          raise PDFValueError('Cannot locate objid=%r' % objid)
+        return None
      if strmid:
        stream = stream_value(self.getobj(strmid))
        if stream.dic['Type'] != LITERAL_OBJSTM:
-          raise PDFSyntaxError('Not a stream object: %r' % stream)
-        if 'N' not in stream.dic:
-          raise PDFSyntaxError('N is not defined: %r' % stream)
+          if STRICT:
+            raise PDFSyntaxError('Not a stream object: %r' % stream)
+        try:
+          n = stream.dic['N']
+        except KeyError:
+          if STRICT:
+            raise PDFSyntaxError('N is not defined: %r' % stream)
+          n = 0
        if strmid in self.parsed_objs:
          objs = self.parsed_objs[stream]
        else:
@@ -363,8 +393,10 @@ class PDFDocument:
      else:
        prevpos = self.parser.seek(index)
        seq = list_value(self.parser.parse())
-        if not (len(seq) == 4 and seq[0] == objid and seq[2] == KEYWORD_OBJ):
-          raise PDFSyntaxError('invalid stream spec: %r' % seq)
+        if not (4 <= len(seq) and seq[0] == objid and seq[2] == KEYWORD_OBJ):
+          if STRICT:
+            raise PDFSyntaxError('invalid stream spec: %r' % seq)
+          return None
        obj = seq[3]
        self.parser.seek(prevpos)
      if 2 <= self.debug:
@@ -373,7 +405,7 @@ class PDFDocument:
    return obj
  
  def get_pages(self, debug=0):
-    assert self.xrefs
+    #assert self.xrefs
    def search(obj, parent):
      tree = dict_value(obj).copy()
      for (k,v) in parent.iteritems():
@@ -397,7 +429,8 @@ class PDFDocument:
    self.root = root
    self.catalog = dict_value(self.root)
    if self.catalog['Type'] != LITERAL_CATALOG:
-      raise PDFValueError('Catalog not found!')
+      if STRICT:
+        raise PDFValueError('Catalog not found!')
    self.outline = self.catalog.get('Outline')
    return
  
@@ -437,24 +470,24 @@ class PDFParser(PSStackParser):
      # stream object
      (dic,) = self.pop(1)
      dic = dict_value(dic)
-      if 'Length' not in dic:
-        raise PDFValueError('/Length is undefined: %r' % dic)
-      objlen = int_value(dic['Length'])
+      try:
+        objlen = int_value(dic['Length'])
+      except KeyError:
+        if STRICT:
+          raise PDFValueError('/Length is undefined: %r' % dic)
+        objlen = 0
      self.seek(pos)
      (_, line) = self.nextline()  # 'stream'
-      self.fp.seek(pos+len(line))
+      pos += len(line)
+      self.fp.seek(pos)
      data = self.fp.read(objlen)
-      self.seek(pos+len(line)+objlen)
+      self.seek(pos+objlen)
      while 1:
        (linepos, line) = self.nextline()
-        if not line:
-          raise PDFSyntaxError('premature eof, need endstream: linepos=%d, line=%r' %
-                               (linepos, line))
-        if line.strip():
-          if not line.startswith('endstream'):
-            raise PDFSyntaxError('need endstream: linepos=%d, line=%r' %
-                                 (linepos, line))
+        if not line or line.startswith('endstream'):
          break
+        objlen += len(line)
+        data += line
      if 1 <= self.debug:
        print >>stderr, 'Stream: pos=%d, objlen=%d, dic=%r, data=%r...' % \
              (pos, objlen, dic, data[:10])
@@ -477,7 +510,9 @@ class PDFParser(PSStackParser):
      if line:
        prev = line
    else:
-      raise PDFSyntaxError('startxref not found!')
+      if STRICT:
+        raise PDFSyntaxError('startxref not found!')
+      prev = 0
    if 1 <= self.debug:
      print >>stderr, 'xref found: pos=%r' % prev
    self.seek(long(prev))
@@ -495,10 +530,11 @@ class PDFParser(PSStackParser):
        # XRefStream: PDF-1.5
        self.seek(linepos)
        xref = PDFXRefStream(self)
-      elif line.strip() != 'xref':
-        raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
-                             (linepos, line))
      else:
+        if line.strip() != 'xref':
+          if STRICT:
+            raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
+                                 (linepos, line))
        xref = PDFXRef(self)
      yield xref
      trailer = xref.trailer
--- a/psparser.py
+++ b/psparser.py
@@ -3,6 +3,8 @@ import sys, re
 stderr = sys.stderr
 from utils import choplist

+STRICT = 0
+

 ##  PS Exceptions
 ##
@@ -73,12 +75,18 @@ PSKeywordTable = PSSymbolTable(PSKeyword)

 def literal_name(x):
  if not isinstance(x, PSLiteral):
-    raise PSTypeError('literal required: %r' % x)
+    if STRICT:
+      raise PSTypeError('literal required: %r' % x)
+    else:
+      return str(x)
  return x.name

 def keyword_name(x):
  if not isinstance(x, PSKeyword):
-    raise PSTypeError('keyword required: %r' % x)
+    if STRICT:
+      raise PSTypeError('keyword required: %r' % x)
+    else:
+      return str(x)
  return x.name


@@ -237,23 +245,30 @@ class PSBaseParser:
              s += s1[-1:]
              (linepos, line) = self.nextline()
              if not line:
-                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
-                                    (linepos, line))
+                if STRICT:
+                  raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+                                      (linepos, line))
+                break
              charpos = 0
            elif charpos == len(line):
              s += s1
              (linepos, line) = self.nextline()
              if not line:
-                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
-                                    (linepos, line))
+                if STRICT:
+                  raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+                                      (linepos, line))
+                break
              charpos = 0
            else:
              s += s1
              break
-          if line[charpos] != ')':
-            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
-                                (linepos, line))
-          charpos += 1
+          if line[charpos] == ')':
+            charpos += 1
+          else:
+            if STRICT:
+              raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+                                  (linepos, line))
+            pass
          def convesc(m):
            x = m.group(0)
            if x[1:].isdigit():
@@ -271,10 +286,12 @@ class PSBaseParser:
          # hex string object
          ms = self.STRING_HEX.match(line, charpos)
          charpos = ms.end(0)
-          if line[charpos] != '>':
-            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
-                                (linepos, line))
-          charpos += 1
+          if line[charpos] == '>':
+            charpos += 1
+          else:
+            if STRICT:
+              raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+                                  (linepos, line))
          def convhex(m1):
            return chr(int(m1.group(0), 16))
          s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
@@ -341,7 +358,8 @@ class PSStackParser(PSBaseParser):
    Pop N objects from the stack.
    '''
    if len(self.partobj) < n:
-      raise PSSyntaxError('stack too short < %d' % n)
+      if STRICT:
+        raise PSSyntaxError('stack too short < %d' % n)
    r = self.partobj[-n:]
    self.partobj = self.partobj[:-n]
    return r
@@ -366,12 +384,18 @@ class PSStackParser(PSBaseParser):
      return

    def endobj(type1):
-      assert self.context
+      if not self.context:
+        if STRICT:
+          raise PSTypeError('stack empty.')
      obj = self.partobj
-      (type0, self.partobj) = self.context.pop()
-      if type0 != type1:
-        raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
-                          (type0, self.partobj, type1, obj))
+      (type0, partobj) = self.context[-1]
+      if type0 == type1:
+        self.partobj = partobj
+        self.context.pop()
+      else:
+        if STRICT:
+          raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
+                            (type0, self.partobj, type1, obj))
      return obj

    startobj('o')
@@ -407,7 +431,8 @@ class PSStackParser(PSBaseParser):
          # end dictionary
          objs = endobj('d')
          if len(objs) % 2 != 0:
-            raise PSTypeError('invalid dictionary construct: %r' % objs)
+            if STRICT:
+              raise PSTypeError('invalid dictionary construct: %r' % objs)
          d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
          if 2 <= self.debug:
            print >>stderr, 'end dict: %r' % d
@@ -415,4 +440,5 @@ class PSStackParser(PSBaseParser):
        elif self.do_token(pos, t):
          break

-    return endobj('o')
+    objs = endobj('o')
+    return objs