# pdfminer.six/psparser.py

#!/usr/bin/env python
import sys, re
stderr = sys.stderr
from utils import choplist


##  PS Exceptions
##
class PSException(Exception):
  '''Base class of every exception defined in this module.'''

class PSSyntaxError(PSException):
  '''Raised when the input cannot be tokenized or structured.'''

class PSTypeError(PSException):
  '''Raised when an object of an unexpected type is encountered.'''

class PSValueError(PSException):
  '''Value-related error; not raised anywhere in this part of the file.'''


##  PostScript Types
##
class PSLiteral:
  '''
  A PostScript literal name (e.g. "/Name").

  Caution: instances are not meant to be created directly;
  obtain them through PSLiteralTable.intern() instead.
  '''

  def __init__(self, name):
    # the name is stored as given; __repr__ adds the leading slash.
    self.name = name

  def __repr__(self):
    return '/%s' % self.name

class PSKeyword:
  '''
  A PostScript keyword (e.g. "showpage").

  Caution: instances are not meant to be created directly;
  obtain them through PSKeywordTable.intern() instead.
  '''

  def __init__(self, name):
    # the keyword text, which is also what __repr__ returns.
    self.name = name

  def __repr__(self):
    return self.name

class PSSymbolTable:
  '''
  Table of unique symbol objects (PSLiteral or PSKeyword),
  indexed by their name.
  '''

  def __init__(self, classe):
    # classe: callable used to build the symbol for a new name.
    self.classe = classe
    # name -> symbol object created for that name.
    self.dic = {}

  def intern(self, name):
    '''
    Returns the symbol registered under name, creating and
    registering it on first use. Repeated calls with the same
    name return the very same object.
    '''
    try:
      return self.dic[name]
    except KeyError:
      symbol = self.classe(name)
      self.dic[name] = symbol
      return symbol

# Module-wide symbol tables. intern(name) on a table returns the same
# PSLiteral / PSKeyword object every time it is given the same name.
PSLiteralTable = PSSymbolTable(PSLiteral)
PSKeywordTable = PSSymbolTable(PSKeyword)


def literal_name(x):
  '''
  Returns the name of the PSLiteral x.
  Raises PSTypeError if x is not a PSLiteral.
  '''
  if not isinstance(x, PSLiteral):
    # wrap x in a 1-tuple: formatting a bare tuple with a single %r
    # would raise TypeError instead of the intended PSTypeError.
    raise PSTypeError('literal required: %r' % (x,))
  return x.name

def keyword_name(x):
  '''
  Returns the name of the PSKeyword x.
  Raises PSTypeError if x is not a PSKeyword.
  '''
  if not isinstance(x, PSKeyword):
    # wrap x in a 1-tuple: formatting a bare tuple with a single %r
    # would raise TypeError instead of the intended PSTypeError.
    raise PSTypeError('keyword required: %r' % (x,))
  return x.name


##  PSBaseParser
##
class PSBaseParser:

  '''
  PostScript parser that performs only basic tokenization.

  Input is read from fp one line at a time (a line ends with CR, LF
  or CR LF) and parse() yields (pos, token) pairs.
  '''

  def __init__(self, fp, debug=0):
    '''
    fp: file-like object supporting read(), seek() and tell().
    debug: when 2 or greater, a trace is written to stderr.
    '''
    self.fp = fp
    self.debug = debug
    # size of the chunks read from fp.
    self.bufsize = 4096
    self.seek(0)
    return

  def __repr__(self):
    return '<PSBaseParser: %r>' % (self.fp,)

  def seek(self, pos):
    '''
    Seeks to the given pos and discards any buffered data.
    '''
    if 2 <= self.debug:
      stderr.write('seek: %s\n' % (pos,))
    self.fp.seek(pos)
    # position just past the last line returned by nextline().
    self.linepos = pos
    # chunk most recently read from fp, and the read index into it.
    self.linebuf = None
    self.curpos = 0
    # line currently being tokenized by parse().
    self.line = ''
    return

  EOLCHAR = re.compile(r'[\r\n]')
  def nextline(self):
    '''
    Fetches the next line, which ends with CR, LF or CR LF.
    The end-of-line characters are kept in the returned string.
    Returns an empty string at the end of the input.
    '''
    line = ''
    eol = None
    while 1:
      if not self.linebuf or len(self.linebuf) <= self.curpos:
        # fetch next chunk.
        self.linebuf = self.fp.read(self.bufsize)
        if not self.linebuf:
          # at EOF.
          break
        self.curpos = 0
      if eol:
        # an EOL char was already consumed; peek at the following char
        # (possibly in a freshly read chunk) to complete a CR LF pair.
        c = self.linebuf[self.curpos]
        # handle '\r\n'
        if (eol == '\r' and c == '\n'):
          line += c
          self.curpos += 1
        break
      m = self.EOLCHAR.search(self.linebuf, self.curpos)
      if m:
        i = m.end(0)
        line += self.linebuf[self.curpos:i]
        eol = self.linebuf[i-1]
        self.curpos = i
      else:
        # no EOL in the rest of this chunk: keep it and fetch further
        line += self.linebuf[self.curpos:]
        self.linebuf = None
    self.linepos += len(line)
    return line

  def revreadlines(self):
    '''
    Fetches lines backward, starting from the end of the file.
    Used to locate trailers.

    Each yielded string starts with the end-of-line character that
    precedes the line. The very first line of the file, which has no
    preceding end-of-line character, is not yielded.
    '''
    self.fp.seek(0, 2)
    pos = self.fp.tell()
    # text that follows the chunk being scanned and belongs to a line
    # whose beginning has not been found yet.
    buf = ''
    while 0 < pos:
      prevpos = pos
      pos = max(0, pos-self.bufsize)
      self.fp.seek(pos)
      # read only up to the part that was already scanned; reading a
      # full bufsize here would return the same data twice whenever
      # fewer than bufsize characters remain before prevpos.
      s = self.fp.read(prevpos-pos)
      if not s: break
      while 1:
        n = max(s.rfind('\r'), s.rfind('\n'))
        if n == -1:
          buf = s + buf
          break
        # buf holds later text, so it goes after this chunk's tail.
        yield s[n:]+buf
        s = s[:n]
        buf = ''
    return

  SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
  TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
  LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
  # digits with an optional fraction, or a fraction alone (".5").
  NUMBER = re.compile(r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)$')
  STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
  # an octal escape, a backslash before CR LF, or a backslash before
  # any single character (including a lone CR or LF).
  STRING_NORM_SUB = re.compile(r'\\(?:[0-7]{1,3}|\r\n|[\s\S])')
  STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
  STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
  # escape letters and the control characters they stand for.
  ESC_STRING = {'b': '\b', 't': '\t', 'n': '\n', 'f': '\f', 'r': '\r'}

  def _convesc(self, m):
    '''
    Returns the replacement for one backslash sequence of a string.
    '''
    x = m.group(0)[1:]
    if x in ('\r\n', '\r', '\n'):
      # backslash + end-of-line is a line continuation: drop both.
      return ''
    if x[0] in '01234567':
      # octal character code; bits above the low eight are ignored.
      return chr(int(x, 8) & 255)
    # known escape letter, otherwise the character itself
    # (this covers the escaped parentheses and backslash).
    return self.ESC_STRING.get(x, x)

  def _convhex(self, m):
    '''
    Returns the character for one or two hexadecimal digits.
    '''
    return chr(int(m.group(0), 16))

  def _scan_string(self, regex, charpos, closer):
    '''
    Collects the string body matched by regex, starting at
    self.line[charpos] and continuing over the following lines,
    up to the closing delimiter closer.

    Returns (text, charpos) where charpos is the index just past
    closer in the (possibly new) self.line.
    Raises PSSyntaxError when the input ends inside the string or
    the body is not followed by closer.
    '''
    chunks = []
    while 1:
      ms = regex.match(self.line, charpos)
      if ms:
        chunks.append(ms.group(0))
        charpos = ms.end(0)
      if charpos < len(self.line): break
      # the whole rest of the line was consumed: go on with the next.
      self.line = self.nextline()
      if not self.line:
        raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
                            (self.linepos, self.line))
      charpos = 0
    if self.line[charpos] != closer:
      raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
                          (self.linepos, self.line))
    return (''.join(chunks), charpos+1)

  def parse(self):
    '''
    Yields (pos, token) pairs of basic tokens: keywords, literals,
    strings, numbers, booleans and parentheses. pos is the position
    of the first character of the token. Comments are skipped.
    Nested objects (i.e. arrays and dictionaries) are not handled.

    Parsing stops at the end of the input or at a line that starts
    with "%%EOF". A normal string ends at the first unescaped ")";
    balanced nested parentheses are not recognized.
    Raises PSSyntaxError on an unterminated string.
    '''
    while 1:
      # do not strip line! we need to distinguish last '\n' or '\r'
      linepos0 = self.linepos
      self.line = self.nextline()
      if not self.line: break
      if 2 <= self.debug:
        stderr.write('line: (%d) %r\n' % (self.linepos, self.line))
      # do this before removing comment
      if self.line.startswith('%%EOF'): break
      charpos = 0

      # tokenize
      while 1:
        m = self.TOKEN.search(self.line, charpos)
        if not m: break
        t = m.group(0)
        pos = linepos0 + m.start(0)
        charpos = m.end(0)

        if t == '%':
          # skip comment
          if 2 <= self.debug:
            stderr.write('comment: %r\n' % self.line[charpos:])
          break

        elif t == '/':
          # literal object; a slash alone denotes the empty name.
          mn = self.LITERAL.match(self.line, charpos)
          if mn:
            name = mn.group(0)
            charpos = mn.end(0)
          else:
            name = ''
          lit = PSLiteralTable.intern(name)
          if 2 <= self.debug:
            stderr.write('name: %r\n' % lit)
          yield (pos, lit)

        elif t == '(':
          # normal string object
          (s, charpos) = self._scan_string(self.STRING_NORM, charpos, ')')
          # the string may have moved us to a later line: recompute
          # the start of the current line for the following tokens.
          linepos0 = self.linepos - len(self.line)
          s = self.STRING_NORM_SUB.sub(self._convesc, s)
          if 2 <= self.debug:
            stderr.write('str: %r\n' % s)
          yield (pos, s)

        elif t == '<':
          # hex string object
          (s, charpos) = self._scan_string(self.STRING_HEX, charpos, '>')
          linepos0 = self.linepos - len(self.line)
          # whitespace between the digits carries no meaning.
          s = self.STRING_HEX_SUB.sub(self._convhex, ''.join(s.split()))
          if 2 <= self.debug:
            stderr.write('str: %r\n' % s)
          yield (pos, s)

        elif self.NUMBER.match(t):
          # number
          if '.' in t:
            n = float(t)
          else:
            n = int(t)
          if 2 <= self.debug:
            stderr.write('number: %r\n' % n)
          yield (pos, n)

        elif t in ('true','false'):
          # boolean
          if 2 <= self.debug:
            stderr.write('boolean: %r\n' % t)
          yield (pos, (t == 'true'))

        else:
          # other token
          if 2 <= self.debug:
            stderr.write('keyword: %r\n' % t)
          yield (pos, PSKeywordTable.intern(t))

    return


##  PSStackParser
##
class PSStackParser(PSBaseParser):

  '''
  PostScript parser that recognizes compound objects
  such as arrays and dictionaries.
  '''

  def __init__(self, fp, debug=0):
    PSBaseParser.__init__(self, fp, debug=debug)
    # stack of (type, partobj) pairs saved for the enclosing objects.
    self.context = []
    # objects collected so far for the innermost open object;
    # None until parse() is called.
    self.partobj = None
    return

  def do_token(self, pos, token):
    '''
    Handles special tokens.
    Returns true if the token denotes the end of an object.
    This implementation accepts no special token and always
    returns False; it is called by parse() for every keyword
    other than the array, dictionary and brace delimiters.
    '''
    return False

  def push(self, obj):
    '''
    Push an object to the stack.
    '''
    self.partobj.append(obj)
    return

  def pop(self, n):
    '''
    Pop N objects from the stack and return them as a list,
    in the order they were pushed.
    Raises PSSyntaxError if fewer than N objects are available.
    '''
    if len(self.partobj) < n:
      raise PSSyntaxError('stack too short < %d' % n)
    # split at an absolute index: with negative slicing, n == 0 would
    # return (and discard) the whole stack, since x[-0:] is x[0:].
    i = len(self.partobj) - n
    r = self.partobj[i:]
    self.partobj = self.partobj[:i]
    return r

  def popall(self):
    '''
    Discards all the objects on the stack.
    '''
    self.partobj = []
    return

  def parse(self):
    '''
    Consumes tokens until the input ends or do_token() returns
    true, and returns the list of objects collected: keywords,
    literals, strings, numbers, arrays and dictionaries. Arrays
    and dictionaries are represented as Python lists and
    dictionaries; dictionary keys are the names of the literals.

    Raises PSTypeError on mismatched delimiters, on a dictionary
    with an odd number of elements or with a key that is not a
    literal.
    '''

    def startobj(kind):
      # save the objects collected so far and start a fresh list.
      self.context.append((kind, self.partobj))
      self.partobj = []
      return

    def endobj(type1):
      # close the innermost object and restore its parent's list.
      assert self.context
      obj = self.partobj
      (type0, self.partobj) = self.context.pop()
      if type0 != type1:
        raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
                          (type0, self.partobj, type1, obj))
      return obj

    startobj('o')

    for (pos,t) in PSBaseParser.parse(self):
      if not isinstance(t, PSKeyword):
        # numbers, booleans, strings and literals are pushed as they
        # are. Testing for the keyword type instead of enumerating the
        # value types also covers Python 2 long integers.
        self.push(t)
        continue
      c = keyword_name(t)
      if c == '{' or c == '}':
        self.push(t)
      elif c == '[':
        # begin array
        if 2 <= self.debug:
          stderr.write('start array\n')
        startobj('a')
      elif c == ']':
        # end array
        a = endobj('a')
        if 2 <= self.debug:
          stderr.write('end array: %r\n' % (a,))
        self.push(a)
      elif c == '<<':
        # begin dictionary
        if 2 <= self.debug:
          stderr.write('start dict\n')
        startobj('d')
      elif c == '>>':
        # end dictionary
        objs = endobj('d')
        if len(objs) % 2 != 0:
          raise PSTypeError('invalid dictionary construct: %r' % objs)
        d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
        if 2 <= self.debug:
          stderr.write('end dict: %r\n' % (d,))
        self.push(d)
      elif self.do_token(pos, t):
        break

    return endobj('o')