From 6d93b4a7f7f51e00b89af6136dce240a9b1f4e59 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Mon, 31 Dec 2007 03:41:45 +0000
Subject: [PATCH] split files.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@4 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 cmap.py      |  383 +++++++++++
 pdf2txt.py   |  111 +++
 pdfinterp.py |  827 +++++++++++++++++++++++
 pdfparser.py | 1834 +++-----------------------------------------------
 psparser.py  |  396 +++++++++++
 utils.py     |   29 +
 6 files changed, 1825 insertions(+), 1755 deletions(-)
 create mode 100644 cmap.py
 create mode 100755 pdf2txt.py
 create mode 100644 pdfinterp.py
 create mode 100644 psparser.py
 create mode 100644 utils.py

diff --git a/cmap.py b/cmap.py
new file mode 100644
index 0000000..d08a299
--- /dev/null
+++ b/cmap.py
@@ -0,0 +1,383 @@
+#!/usr/bin/env python
+import sys
+stderr = sys.stderr
+from struct import pack, unpack
+from utils import choplist, nunpack
+from psparser import PSException, PSSyntaxError, PSTypeError, \
+     PSLiteral, PSKeyword, literal_name, keyword_name, \
+     PSStackParser
+try:
+  import cdb
+except ImportError:
+  import pycdb as cdb
+
+
+##  CMap
+##
+class CMap:  # In-memory character map: code string <-> CID tables plus CMap attributes.
+  
+  def __init__(self, debug=0):
+    self.debug = debug  # a truthy value makes decode() trace to stderr
+    self.code2cid = {}  # code string -> integer CID
+    self.cid2code = {}  # integer CID -> code string
+    self.attrs = {}  # CMap attributes, e.g. 'CMapName' and 'WMode'
+    return
+
+  def __repr__(self):
+    return '<CMap: %s>' % self.attrs.get('CMapName')
+
+  def update(self, code2cid=None, cid2code=None):  # merges the given mappings; returns self for chaining
+    if code2cid:
+      self.code2cid.update(code2cid)
+    if cid2code:
+      self.cid2code.update(cid2code)
+    return self
+    
+  def copycmap(self, cmap):  # copies both tables (but not attrs) from another cmap; returns self
+    self.code2cid.update(cmap.getall_code2cid())
+    self.cid2code.update(cmap.getall_cid2code())
+    return self
+
+  def register_code2cid(self, code, cid):
+    assert isinstance(code, str)
+    assert isinstance(cid, int)
+    self.code2cid[code] = cid
+    return self
+
+  def register_cid2code(self, cid, code):
+    from glyphlist import charname2unicode
+    assert isinstance(cid, int)
+    if isinstance(code, PSLiteral):  # a glyph name: store its unicode value as 2 big-endian bytes
+      code = pack('>H', charname2unicode[code.name])
+    self.cid2code[cid] = code
+    return self
+
+  def decode(self, bytes):  # generator of CIDs: each byte is tried alone, else paired with the next byte
+    if self.debug:
+      print >>stderr, 'decode: %r, %r' % (self, bytes)
+    x = ''
+    for c in bytes:
+      if x:
+        if x+c in self.code2cid:
+          yield self.code2cid[x+c]
+        x = ''  # an unmapped 2-byte sequence is dropped silently
+      elif c in self.code2cid:
+        yield self.code2cid[c]
+      else:
+        x = c  # keep as the first byte of a possible 2-byte code
+    return
+  
+  def is_vertical(self):
+    return self.attrs.get('WMode', 0) in (1, '1')  # CMapParser stores the parsed value (integer 1), so accept int and str
+
+  def tocid(self, code):  # returns None when the code is unknown
+    return self.code2cid.get(code)
+  def tocode(self, cid):  # returns None when the cid is unknown
+    return self.cid2code.get(cid)
+
+  def getall_attrs(self):
+    return self.attrs.iteritems()
+  def getall_code2cid(self):
+    return self.code2cid.iteritems()
+  def getall_cid2code(self):
+    return self.cid2code.iteritems()
+
+  
+##  CDBCMap
+##
+class CDBCMap(CMap):  # CMap backed by a cdb file; keys are 'c'+code, 'i'+packed cid and '/'+attribute name.
+  
+  def __init__(self, cdbname, debug=0):
+    CMap.__init__(self, debug=debug)
+    self.cdbname = cdbname
+    self.db = cdb.init(cdbname)
+    return
+
+  def __repr__(self):
+    return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
+
+  def tocid(self, code):
+    k = 'c'+code
+    if not self.db.has_key(k):
+      return None
+    return unpack('>L', self.db[k])[0]  # unpack() yields a 1-tuple; return the integer CID like getall()/decode() do
+  def tocode(self, cid):
+    k = 'i'+pack('>L', cid)
+    if not self.db.has_key(k):
+      return None
+    return self.db[k]
+  
+  def is_vertical(self):
+    return (self.db.has_key('/WMode') and
+            self.db['/WMode'] == '1')
+
+  def getall(self, c):  # yields (key without its 1-char prefix, unpacked value) for keys starting with c
+    while 1:
+      x = self.db.each()
+      if not x: break
+      (k,v) = x
+      if k.startswith(c):
+        yield (k[1:], unpack('>L', v)[0])
+    return
+
+  def getall_attrs(self):
+    while 1:
+      x = self.db.each()
+      if not x: break
+      (k,v) = x
+      if k.startswith('/'):
+        yield (k[1:], eval(v)[0])  # NOTE(review): eval() of file contents -- only safe for trusted cdb files
+    return
+  
+  def getall_cid2code(self):
+    return self.getall('i')
+  def getall_code2cid(self):
+    return self.getall('c')
+
+  def decode(self, bytes):  # like CMap.decode, but misses fall back to the cdb and are cached in code2cid
+    if self.debug:
+      print >>stderr, 'decode: %r, %r' % (self, bytes)
+    x = ''
+    for c in bytes:
+      if x:
+        if x+c in self.code2cid:
+          yield self.code2cid[x+c]
+        elif self.db.has_key('c'+x+c):
+          (dest,) = unpack('>L', self.db['c'+x+c])
+          self.code2cid[x+c] = dest
+          yield dest
+        x = ''
+      elif c in self.code2cid:
+        yield self.code2cid[c]
+      elif self.db.has_key('c'+c):
+        (dest,) = unpack('>L', self.db['c'+c])
+        self.code2cid[c] = dest
+        yield dest
+      else:
+        x = c
+    return
+
+
+##  CMapDB
+##
+class CMapDB:  # Process-wide loader and cache of CMap objects, configured through initialize().
+
+  CMAP_ALIAS = {
+    }
+  
+  debug = 0
+  dirname = None  # directory holding plain CMap files
+  cdbdirname = None  # directory holding '<name>.cmap.cdb' files
+  cmapdb = {}  # cache: cmap name -> loaded cmap object
+
+  @classmethod
+  def initialize(klass, dirname, cdbdirname=None, debug=0):
+    klass.dirname = dirname
+    klass.cdbdirname = cdbdirname or dirname
+    klass.debug = debug
+    return
+
+  @classmethod
+  def get_cmap(klass, cmapname):  # returns the cached or newly loaded cmap; raises KeyError if no file exists for it
+    import os.path
+    cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
+    if cmapname in klass.cmapdb:
+      cmap = klass.cmapdb[cmapname]
+    else:
+      fname = os.path.join(klass.dirname, cmapname)
+      cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
+      if os.path.exists(cdbname):  # the cdb form takes precedence over the plain file
+        if 1 <= klass.debug: print >>stderr, 'Opening: CDBCMap %r...' % cdbname
+        cmap = CDBCMap(cdbname)
+      elif os.path.exists(fname):
+        if 1 <= klass.debug: print >>stderr, 'Reading: CMap %r...' % fname
+        cmap = CMap()
+        fp = file(fname)
+        CMapParser(cmap, fp).parse()
+        fp.close()
+      else:  # previously fell through to an UnboundLocalError on `cmap`
+        raise KeyError('CMap not found: %r' % cmapname)
+      klass.cmapdb[cmapname] = cmap
+    return cmap
+
+
+##  CMapParser
+##
+class CMapParser(PSStackParser):  # Fills a CMap object from the operators of a CMap program.
+
+  def __init__(self, cmap, fp, debug=0):
+    PSStackParser.__init__(self, fp, debug=debug)
+    self.cmap = cmap
+    self.in_cmap = False  # True only between 'begincmap' and 'endcmap'
+    return
+
+  def do_token(self, _, token):  # handles one keyword; its operands are read from self.partobj
+    name = token.name
+    if name == 'begincmap':
+      self.in_cmap = True
+      self.popall()
+      return
+    elif name == 'endcmap':
+      self.in_cmap = False
+      return
+    if not self.in_cmap: return
+    #
+    if name == 'def':
+      try:
+        (k,v) = self.pop(2)
+        self.cmap.attrs[literal_name(k)] = v
+      except PSSyntaxError:
+        pass
+      return
+    
+    if name == 'usecmap':
+      try:
+        (cmapname,) = self.pop(1)
+        self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
+      except PSSyntaxError:
+        pass
+      return
+      
+    if name == 'begincodespacerange':
+      self.popall()
+      return
+    if name == 'endcodespacerange':
+      if 1 <= self.debug:
+        print >>stderr, 'codespace: %r' % self.partobj
+      self.popall()
+      return
+    
+    if name == 'begincidrange':
+      self.popall()
+      return
+    if name == 'endcidrange':
+      for (s,e,cid) in choplist(3, self.partobj):
+        assert isinstance(s, str)
+        assert isinstance(e, str)
+        assert isinstance(cid, int)
+        assert len(s) == len(e)
+        sprefix = s[:-4]  # only the last (up to) 4 bytes vary across the range
+        eprefix = e[:-4]
+        assert sprefix == eprefix
+        svar = s[-4:]
+        evar = e[-4:]
+        s1 = nunpack(svar)
+        e1 = nunpack(evar)
+        vlen = len(svar)
+        assert s1 <= e1
+        for i in xrange(e1-s1+1):
+          x = sprefix+pack('>L',s1+i)[-vlen:]
+          self.cmap.register_code2cid(x, cid+i)
+      self.popall()
+      return
+    
+    if name == 'begincidchar':
+      self.popall()
+      return
+    if name == 'endcidchar':
+      for (code,cid) in choplist(2, self.partobj):  # each entry is '<code> cid': a code string, then an integer CID
+        assert isinstance(code, str)
+        assert isinstance(cid, int)
+        self.cmap.register_code2cid(code, cid)
+      self.popall()
+      return
+        
+    if name == 'beginbfrange':
+      self.popall()
+      return
+    if name == 'endbfrange':
+      for (s,e,code) in choplist(3, self.partobj):
+        assert isinstance(s, str)
+        assert isinstance(e, str)
+        assert len(s) == len(e)
+        s1 = nunpack(s)
+        e1 = nunpack(e)
+        assert s1 <= e1
+        if isinstance(code, list):  # one explicit destination per source code
+          for i in xrange(e1-s1+1):
+            self.cmap.register_cid2code(s1+i, code[i])
+        else:  # a single start value, incremented in its last (up to) 4 bytes
+          var = code[-4:]
+          base = nunpack(var)
+          prefix = code[:-4]
+          vlen = len(var)
+          for i in xrange(e1-s1+1):
+            x = prefix+pack('>L',base+i)[-vlen:]
+            self.cmap.register_cid2code(s1+i, x)
+      self.popall()
+      return
+        
+    if name == 'beginbfchar':
+      self.popall()
+      return
+    if name == 'endbfchar':
+      for (cid,code) in choplist(2, self.partobj):
+        assert isinstance(cid, str)
+        assert isinstance(code, str)
+        self.cmap.register_cid2code(nunpack(cid), code)
+      self.popall()
+      return
+        
+    if name == 'beginnotdefrange':
+      self.popall()
+      return
+    if name == 'endnotdefrange':
+      if 1 <= self.debug:
+        print >>stderr, 'notdefrange: %r' % self.partobj
+      self.popall()
+      return
+    
+    return
+
+
+##  FontMetricsDB
+##
+class FontMetricsDB:  # Lookup of bundled font metrics by font name.
+  from fontmetrics import FONT_METRICS
+  
+  @classmethod
+  def get_metrics(klass, fontname):
+    return klass.FONT_METRICS[fontname]  # KeyError for an unknown name; PDFType1Font unpacks the value as (descriptor, widths)
+
+
+##  EncodingDB
+##
+class EncodingDB:  # Character-code -> unicode tables for the named encodings below.
+      
+  from glyphlist import charname2unicode
+  from latin_enc import ENCODING
+  
+  std2unicode = {}
+  mac2unicode = {}
+  win2unicode = {}
+  pdf2unicode = {}
+  for (name,std,mac,win,pdf) in ENCODING:  # runs once, while the class body is executed
+    c = unichr(charname2unicode[name])
+    if std: std2unicode[std] = c  # falsy codes are skipped
+    if mac: mac2unicode[mac] = c
+    if win: win2unicode[win] = c
+    if pdf: pdf2unicode[pdf] = c
+  
+  encodings = {
+    'StandardEncoding': std2unicode,
+    'MacRomanEncoding': mac2unicode,
+    'WinAnsiEncoding': win2unicode,
+    'PDFDocEncoding': pdf2unicode,
+    }
+  
+  @classmethod
+  def get_encoding(klass, name, diff=None):  # unknown names fall back to std2unicode; diff is a list of ints and PSLiterals
+    cid2unicode = klass.encodings.get(name, klass.std2unicode)
+    if diff:
+      cid2unicode = cid2unicode.copy()  # never mutate the shared class-level table
+      cid = 0
+      for x in diff:
+        if isinstance(x, int):  # an int sets the code assigned to the next glyph name
+          cid = x
+        elif isinstance(x, PSLiteral):
+          try:
+            cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name])
+          except KeyError:  # unknown glyph name: keep the base mapping, still advance the code
+            pass
+          cid += 1
+    return cid2unicode
diff --git a/pdf2txt.py b/pdf2txt.py
new file mode 100755
index 0000000..c52daad
--- /dev/null
+++ b/pdf2txt.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import sys
+stdout = sys.stdout
+stderr = sys.stderr
+from pdfparser import PDFDocument, PDFParser
+from pdfinterp import PDFDevice, PDFResourceManager, \
+     PDFPageInterpreter, PDFUnicodeNotDefined, \
+     mult_matrix, apply_matrix
+from cmap import CMapDB
+
+
+##  TextConverter
+##
+class TextConverter(PDFDevice):  # Device that writes each rendered string as a tagged text line to outfp.
+
+  def __init__(self, outfp, rsrc, codec):
+    PDFDevice.__init__(self, rsrc)
+    self.outfp = outfp
+    self.codec = codec  # output encoding used in render_string()
+    return
+
+  def close(self):
+    self.outfp.write('\n')
+    return
+  
+  def begin_block(self, name):
+    self.outfp.write('<block name="%s">\n' % name)
+    return
+  def end_block(self):
+    self.outfp.write('</block>\n')
+    return
+
+  def render_string(self, textstate, textmatrix, size, seq):  # seq mixes numbers (adjustments) and encoded strings
+    font = textstate.font
+    spwidth = int(-font.char_width(32) * 0.6) # space width
+    buf = ''
+    for x in seq:
+      if isinstance(x, int) or isinstance(x, float):
+        if not font.is_vertical() and x <= spwidth:  # a large enough negative adjustment is emitted as a space
+          buf += ' '
+      else:
+        chars = font.decode(x)
+        for cid in chars:
+          try:
+            char = font.to_unicode(cid)
+          except PDFUnicodeNotDefined, e:  # no unicode mapping: emit a '[coding:cid]' placeholder instead
+            (cidcoding, cid) = e.args
+            char = u'[%s:%d]' % (cidcoding, cid)
+          buf += char
+    (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
+    skewed = (b != 0 or c != 0)
+    if font.is_vertical():
+      size = -size
+      tag = 'vtext'
+    else:
+      tag = 'htext'
+    if skewed:
+      tag += ' skewed'
+    s = buf.encode(self.codec, 'xmlcharrefreplace')  # unencodable characters become &#NNN; references
+    (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))  # translation zeroed: only scale/rotation applied
+    def f(x): return '%.03f' % x
+    self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
+                     (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
+    return
+
+
+# pdf2txt: converts the selected pages of one PDF file (all pages if `pages` is empty) through a TextConverter.
+def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
+  device = TextConverter(outfp, rsrc, codec)
+  doc = PDFDocument(debug=debug)
+  fp = file(fname, 'rb')  # PDF is binary data; text mode would corrupt it on some platforms
+  try:
+    parser = PDFParser(doc, fp, debug=debug)
+    interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
+    for (i,page) in enumerate(doc.get_pages(debug=debug)):  # i is the 0-based page index
+      if not pages or i in pages: interpreter.process_page(page)
+  finally:
+    fp.close()  # release the file even when parsing or interpretation fails
+  device.close()
+
+
+# main
+def main(argv):  # command-line entry point; returns 100 after printing usage, otherwise None
+  import getopt
+  def usage():
+    print 'usage: %s [-d] [-v] [-c codec] [-p pages] [-o output] file ...' % argv[0]
+    return 100
+  try:
+    (opts, args) = getopt.getopt(argv[1:], 'dvp:c:o:')  # 'o:' was missing, so the -o branch below was unreachable
+  except getopt.GetoptError:
+    return usage()
+  if not args: return usage()
+  debug = 0
+  cmapdir = 'CMap'
+  cdbcmapdir = 'CDBCMap'
+  codec = 'ascii'
+  pages = set()  # 0-based page indexes; empty means every page
+  outfp = stdout
+  for (k, v) in opts:
+    if k == '-d': debug += 1
+    elif k == '-p': pages.add(int(v))
+    elif k == '-o': outfp = file(v, 'wb')
+    elif k == '-c': codec = v
+  #
+  CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
+  rsrc = PDFResourceManager(debug=debug)
+  for fname in args:
+    pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug)
+  return
+
+if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/pdfinterp.py b/pdfinterp.py
new file mode 100644
index 0000000..d24f848
--- /dev/null
+++ b/pdfinterp.py
@@ -0,0 +1,827 @@
+#!/usr/bin/env python
+import sys
+stderr = sys.stderr
+from struct import pack, unpack
+try:
+  from cStringIO import StringIO
+except ImportError:
+  from StringIO import StringIO
+from psparser import PSException, PSSyntaxError, PSTypeError, \
+     PSStackParser, PSLiteral, PSKeyword, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
+from pdfparser import resolve1, int_value, float_value, num_value, \
+     str_value, list_value, dict_value, stream_value, PDFException
+from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+
+
+##  Exceptions
+##
+class PDFResourceError(PDFException): pass  # resource-related failure
+class PDFInterpreterError(PDFException): pass  # raised by the page interpreter, e.g. for an undefined font id
+class PDFFontError(PDFException): pass  # a font dictionary is missing required entries or is invalid
+class PDFUnicodeNotDefined(PDFFontError): pass  # raised with args (cidcoding, cid) when a cid has no unicode mapping
+
+
+##  Constants
+##
+LITERAL_PDF = PSLiteralTable.intern('PDF')
+LITERAL_TEXT = PSLiteralTable.intern('Text')
+LITERAL_FONT = PSLiteralTable.intern('Font')
+LITERAL_FORM = PSLiteralTable.intern('Form')
+LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
+LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
+LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
+LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
+LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
+LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')
+MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)  # (a,b,c,d,e,f) in the layout used by mult_matrix/apply_matrix
+CS_COMPONENTS = {  # color space literal -> component count, used by cs_params()
+  PSLiteralTable.intern('CalRGB'): 3,
+  PSLiteralTable.intern('CalGray'): 1,
+  PSLiteralTable.intern('Lab'): 3,
+  PSLiteralTable.intern('DeviceRGB'): 3,
+  PSLiteralTable.intern('DeviceCMYK'): 4,
+  PSLiteralTable.intern('DeviceGray'): 1,
+  PSLiteralTable.intern('Separation'): 1,
+  PSLiteralTable.intern('Indexed'): 1,
+  PSLiteralTable.intern('Pattern'): 1,
+  }
+
+
+##  Matrix operations
+##
+def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
+  '''Multiplies two (a,b,c,d,e,f) matrices; the result applies the first argument, then the second.'''
+  return (a0*a1+c0*b1,    b0*a1+d0*b1,
+          a0*c1+c0*d1,    b0*c1+d0*d1,
+          a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
+
+def apply_matrix((a,b,c,d,e,f), (x,y)):
+  '''Applies an (a,b,c,d,e,f) matrix to the coordinate pair (x,y) and returns the transformed pair.'''
+  return (a*x+c*y+e, b*x+d*y+f)
+
+def cs_params(cs):  # returns the number of color components for color space spec `cs` (cs[0] is its type literal)
+  t = cs[0]
+  if t == LITERAL_ICC_BASED:
+    return stream_value(cs[1]).dic['N']  # component count is read from the stream dictionary's 'N' entry
+  elif t == LITERAL_DEVICE_N:
+    return len(list_value(cs[1]))  # one component per entry of the list in cs[1]
+  else:
+    return CS_COMPONENTS[t]  # KeyError for a type not listed in CS_COMPONENTS
+
+
+##  Fonts
+##
+
+# PDFFont
+class PDFFont:  # Base font: descriptor-derived attributes plus a per-cid width table.
+  
+  def __init__(self, fontid, descriptor, widths, default_width=None):
+    self.fontid = fontid
+    self.descriptor = descriptor
+    self.widths = widths  # mapping cid -> width, read by char_width()
+    self.fontname = descriptor['FontName']
+    if isinstance(self.fontname, PSLiteral):
+      self.fontname = literal_name(self.fontname)
+    self.ascent = descriptor['Ascent']
+    self.descent = descriptor['Descent']
+    self.default_width = default_width or descriptor.get('MissingWidth', 0)  # a falsy default_width (None or 0) falls back to MissingWidth
+    self.leading = descriptor.get('Leading', 0)
+    self.bbox = descriptor['FontBBox']
+    return
+
+  def __repr__(self):
+    return '<PDFFont: fontid=%r>' % (self.fontid,)
+
+  def is_vertical(self):
+    return False
+  
+  def decode(self, bytes):  # one cid per byte; PDFCIDFont overrides this with a CMap lookup
+    return map(ord, bytes)
+
+  def char_width(self, cid):
+    return self.widths.get(cid, self.default_width)
+
+  def char_disp(self, cid):
+    return 0
+  
+  def string_width(self, s):  # sum of char_width() over the decoded cids of s
+    return sum( self.char_width(cid) for cid in self.decode(s) )
+  
+
+# PDFSimpleFont
+class PDFSimpleFont(PDFFont):  # Font whose unicode mapping comes from an encoding table or a ToUnicode CMap.
+  
+  def __init__(self, fontid, descriptor, widths, spec):
+    # Font encoding is specified either by a name of
+    # built-in encoding or a dictionary that describes
+    # the differences.
+    if 'Encoding' in spec:
+      encoding = resolve1(spec['Encoding'])
+    else:
+      encoding = LITERAL_STANDARD_ENCODING
+    if isinstance(encoding, dict):
+      name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
+      diff = encoding.get('Differences', None)
+      self.encoding = EncodingDB.get_encoding(name, diff)
+    else:
+      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
+    self.ucs2_cmap = None  # set only when the font spec carries a ToUnicode stream
+    if 'ToUnicode' in spec:
+      strm = stream_value(spec['ToUnicode'])
+      self.ucs2_cmap = CMap()
+      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
+    PDFFont.__init__(self, fontid, descriptor, widths)
+    return
+
+  def to_unicode(self, cid):  # returns a unicode string; raises PDFUnicodeNotDefined(None, cid) when unmapped
+    if not self.ucs2_cmap:
+      try:
+        return self.encoding[cid]
+      except KeyError:
+        raise PDFUnicodeNotDefined(None, cid)
+    code = self.ucs2_cmap.tocode(cid)  # when a ToUnicode CMap exists, the encoding table is not consulted
+    if not code:
+      raise PDFUnicodeNotDefined(None, cid)
+    chars = unpack('>%dH' % (len(code)/2), code)  # the code is read as a sequence of big-endian 16-bit units
+    return ''.join( unichr(c) for c in chars )
+
+
+# PDFType1Font
+class PDFType1Font(PDFSimpleFont):  # Type1 font: metrics from FontMetricsDB when known, else from the font spec.
+  
+  def __init__(self, fontid, spec):
+    if 'BaseFont' not in spec:
+      raise PDFFontError('BaseFont is missing')
+    self.basefont = literal_name(spec['BaseFont'])
+    try:
+      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
+    except KeyError:  # not a bundled font: read descriptor and widths from the spec itself
+      try:
+        descriptor = dict_value(spec['FontDescriptor'])
+        firstchar = int_value(spec['FirstChar'])
+        lastchar = int_value(spec['LastChar'])
+        widths = dict( (i+firstchar,w) for (i,w)
+                       in enumerate(list_value(spec['Widths'])) )  # Widths[i] belongs to code FirstChar+i
+      except KeyError, k:
+        raise PDFFontError('%s is missing' % k)
+    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    return
+
+# PDFTrueTypeFont
+class PDFTrueTypeFont(PDFType1Font):  # handled exactly like PDFType1Font; selected for Subtype 'TrueType'
+  pass
+
+# PDFType3Font
+class PDFType3Font(PDFSimpleFont):  # Type3 font: widths always come from the spec; the descriptor is optional
+  def __init__(self, fontid, spec):
+    try:
+      firstchar = int_value(spec['FirstChar'])
+      lastchar = int_value(spec['LastChar'])
+      widths = dict( (i+firstchar,w) for (i,w)
+                     in enumerate(list_value(spec['Widths'])) )  # Widths[i] belongs to code FirstChar+i
+    except KeyError, k:
+      raise PDFFontError('%s is missing' % k)
+    if 'FontDescriptor' in spec:
+      descriptor = dict_value(spec['FontDescriptor'])
+    else:  # synthesize the minimal descriptor that PDFFont.__init__ reads
+      descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
+                    'FontBBox':spec['FontBBox']}
+    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    return
+
+# PDFCIDFont
+
+##  TrueTypeFont
+##
+class TrueTypeFont:  # Minimal TrueType reader: table directory plus 'cmap' subtables of format 0, 2 and 4.
+
+  class CMapNotFound(Exception): pass
+  
+  def __init__(self, name, fp):
+    self.name = name
+    self.fp = fp
+    self.tables = {}  # table tag -> (offset, length)
+    fonttype = fp.read(4)
+    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+    for i in xrange(ntables):
+      (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
+      self.tables[name] = (offset, length)
+    return
+
+  def create_cmap(self):  # returns a CMap built from the 'cmap' table; raises CMapNotFound when the table is absent
+    if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
+    (base_offset, length) = self.tables['cmap']
+    fp = self.fp
+    fp.seek(base_offset)
+    (version, nsubtables) = unpack('>HH', fp.read(4))
+    subtables = []
+    for i in xrange(nsubtables):
+      subtables.append(unpack('>HHL', fp.read(8)))
+    char2gid = {}  # character code -> glyph id, merged over all supported subtables
+    # Only supports subtable type 0, 2 and 4.
+    for (_1, _2, st_offset) in subtables:
+      fp.seek(base_offset+st_offset)
+      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
+      if fmttype == 0:
+        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
+      elif fmttype == 2:
+        subheaderkeys = unpack('>256H', fp.read(512))
+        firstbytes = [0]*8192
+        for (i,k) in enumerate(subheaderkeys):
+          firstbytes[k/8] = i
+        nhdrs = max(subheaderkeys)/8 + 1
+        hdrs = []
+        for i in xrange(nhdrs):
+          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
+          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
+        for (i,firstcode,entcount,delta,pos) in hdrs:
+          if not entcount: continue
+          first = firstcode + (firstbytes[i] << 8)
+          fp.seek(pos)
+          for c in xrange(entcount):
+            (gid,) = unpack('>H', fp.read(2))  # unpack() yields a 1-tuple; the raw tuple broke `gid += delta`
+            if gid:
+              gid += delta
+            char2gid[first+c] = gid
+      elif fmttype == 4:
+        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+        segcount /= 2
+        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        fp.read(2)
+        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
+        pos = fp.tell()  # file position of idRangeOffset[0]
+        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        for (seg,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
+          if idr:
+            fp.seek(pos+2*seg+idr)  # idRangeOffset[seg] is relative to its own entry, not to the start of the array
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
+          else:
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (c + idd) & 0xffff
+    gid2char = dict( (gid, pack('>H', char)) for (char,gid) in char2gid.iteritems() )
+    cmap = CMap()  # CMap's first positional parameter is `debug`, so the name must not be passed there
+    cmap.attrs['CMapName'] = 'Adobe-Identity-UCS-%s' % self.name  # the attribute CMap.__repr__ reports
+    return cmap.update(char2gid, gid2char)
+
+class PDFCIDFont(PDFFont):  # CID-keyed font: codes are decoded through a CMap, unicode comes from a second CMap.
+  
+  def __init__(self, fontid, spec):
+    if 'BaseFont' not in spec:
+      raise PDFFontError('BaseFont is missing')
+    try:
+      self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
+      self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
+                                  self.cidsysteminfo['Ordering'])
+    except KeyError:
+      raise PDFFontError('CIDSystemInfo not properly defined.')
+    self.basefont = literal_name(spec['BaseFont'])
+    self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))  # NOTE(review): a missing 'Encoding' raises a bare KeyError here
+    descriptor = dict_value(spec['FontDescriptor'])
+    ttf = None
+    if 'FontFile2' in descriptor:
+      self.fontfile = stream_value(descriptor.get('FontFile2'))
+      ttf = TrueTypeFont(self.basefont,
+                         StringIO(self.fontfile.get_data()))
+    self.ucs2_cmap = None  # cid -> unicode source; stays None when no mapping can be found
+    if 'ToUnicode' in spec:
+      strm = stream_value(spec['ToUnicode'])
+      self.ucs2_cmap = CMap()
+      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
+    elif self.cidcoding == 'Adobe-Identity':
+      if ttf:
+        try:
+          self.ucs2_cmap = ttf.create_cmap()
+        except TrueTypeFont.CMapNotFound:
+          pass
+    else:
+      self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)
+    
+    def get_width(seq):  # flattens `c [v0 v1 ...]` and `cfirst clast v` groups into a dict cid -> v
+      dic = {}
+      char1 = char2 = None
+      for v in seq:
+        if char1 == None:
+          char1 = v
+        elif char2 == None and isinstance(v, int):
+          char2 = v
+        else:
+          if char2 == None:  # list form: consecutive cids starting at char1
+            for (i,w) in enumerate(v):
+              dic[char1+i] = w
+          else:  # range form: the same value for char1..char2 inclusive
+            for i in xrange(char1, char2+1):
+              dic[i] = v
+          char1 = char2 = None
+      return dic
+    self.vertical = self.cmap.is_vertical()
+    if self.vertical:
+      # writing mode: vertical
+      dic = get_width(list_value(spec.get('W2', [])))
+      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )  # NOTE(review): assumes each W2 value is a (d,w) pair -- confirm
+      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
+      (d,w) = spec.get('DW2', [880, -1000])
+      default_width = w
+      self.default_disp = d
+    else:
+      # writing mode: horizontal
+      widths = get_width(list_value(spec.get('W', [])))
+      self.disps = {}
+      default_width = spec.get('DW', 1000)
+      self.default_disp = 0
+    PDFFont.__init__(self, fontid, descriptor, widths, default_width)
+    return
+
+  def is_vertical(self):
+    return self.vertical
+  
+  def decode(self, bytes):
+    return self.cmap.decode(bytes)
+
+  def char_disp(self, cid):
+    return self.disps.get(cid, self.default_disp)
+
+  def to_unicode(self, cid):  # raises PDFUnicodeNotDefined(cidcoding, cid) when no mapping exists
+    if not self.ucs2_cmap:
+      raise PDFUnicodeNotDefined(self.cidcoding, cid)
+    code = self.ucs2_cmap.tocode(cid)
+    if not code:
+      raise PDFUnicodeNotDefined(self.cidcoding, cid)
+    chars = unpack('>%dH' % (len(code)/2), code)  # the code is read as a sequence of big-endian 16-bit units
+    return ''.join( unichr(c) for c in chars )
+
+
+##  Resource Manager
+##
+class PDFResourceManager:
+
+  '''
+  ResourceManager facilitates reuse of shared resources
+  such as fonts, images and cmaps so that large objects are not
+  allocated multiple times.
+  '''
+  
+  def __init__(self, debug=0):
+    self.debug = debug
+    self.fonts = {}  # cache: fontid -> font object
+    return
+
+  def get_procset(self, procs):  # currently a no-op: every procset is accepted
+    for proc in procs:
+      if proc == LITERAL_PDF:
+        pass
+      elif proc == LITERAL_TEXT:
+        pass
+      else:
+        #raise PDFResourceError('ProcSet %r is not supported.' % proc)
+        pass
+    return
+  
+  def get_cmap(self, name):
+    return CMapDB.get_cmap(name)
+
+  def get_font(self, fontid, spec):  # returns the cached font for fontid, else builds one from spec's Subtype
+    if fontid in self.fonts:  # NOTE(review): keyed by fontid only; a different spec under the same id reuses the cached font
+      font = self.fonts[fontid]
+    else:
+      spec = dict_value(spec)
+      assert spec['Type'] == LITERAL_FONT
+      # Create a Font object.
+      if 'Subtype' not in spec:
+        raise PDFFontError('Font Subtype is not specified.')
+      subtype = literal_name(spec['Subtype'])
+      if subtype in ('Type1', 'MMType1'):
+        # Type1 Font
+        font = PDFType1Font(fontid, spec)
+      elif subtype == 'TrueType':
+        # TrueType Font
+        font = PDFTrueTypeFont(fontid, spec)
+      elif subtype == 'Type3':
+        # Type3 Font
+        font = PDFType3Font(fontid, spec)
+      elif subtype in ('CIDFontType0', 'CIDFontType2'):
+        # CID Font
+        font = PDFCIDFont(fontid, spec)
+      elif subtype == 'Type0':
+        # Type0 Font
+        dfonts = list_value(spec['DescendantFonts'])
+        assert len(dfonts) == 1
+        subspec = dict_value(dfonts[0]).copy()
+        for k in ('Encoding', 'ToUnicode'):  # the descendant font takes these two entries from the Type0 parent
+          if k in spec:
+            subspec[k] = resolve1(spec[k])
+        font = self.get_font(fontid, subspec)
+      else:
+        raise PDFFontError('Invalid Font: %r' % spec)
+      self.fonts[fontid] = font
+    return font
+
+
+##  PDFDevice
+##
+class PDFDevice:  # Base output device; subclasses must implement render_string().
+  
+  def __init__(self, rsrc):
+    self.rsrc = rsrc
+    self.ctm = None  # current transformation matrix, assigned through set_ctm()
+    return
+  
+  def __repr__(self):
+    return '<PDFDevice>'
+
+  def close(self):
+    return
+
+  def set_ctm(self, ctm):
+    self.ctm = ctm
+    return
+
+  def begin_block(self, name):  # no-op hook; TextConverter overrides it
+    return
+  def end_block(self):  # no-op hook; TextConverter overrides it
+    return
+  
+  def render_string(self, textstate, textmatrix, size, seq):
+    raise NotImplementedError
+
+
+##  Interpreter
+##
+class PDFPageInterpreter:
+  
+  class TextState:
+    def __init__(self):
+      self.font = None
+      self.fontsize = 0
+      self.charspace = 0
+      self.wordspace = 0
+      self.scaling = 100
+      self.leading = 0
+      self.render = 0
+      self.rise = 0
+      self.reset()
+      return
+    def __repr__(self):
+      return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
+              ' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
+              ' render=%r, rise=%r>' %
+              (self.font, self.fontsize, self.matrix,
+               self.charspace, self.wordspace, self.scaling, self.leading,
+               self.render, self.rise))
+    def reset(self):
+      self.matrix = MATRIX_IDENTITY
+      self.linematrix = (0, 0)
+      return
+
+  def __init__(self, rsrc, device, debug=0):
+    self.rsrc = rsrc
+    self.device = device
+    self.debug = debug
+    return
+
+  def initpage(self, ctm):
+    self.fontmap = {}
+    self.xobjmap = {}
+    self.csmap = {}
+    # gstack: stack for graphical states.
+    self.gstack = []
+    self.ctm = ctm
+    self.device.set_ctm(self.ctm)
+    self.textstate = PDFPageInterpreter.TextState()
+    # argstack: stack for command arguments.
+    self.argstack = []
+    # set some global states.
+    self.scs = None
+    self.ncs = None
+    return
+
+  def push(self, obj):
+    self.argstack.append(obj)
+    return
+
+  def pop(self, n):  # removes and returns the last n arguments as a list ([] when n is 0)
+    k = max(0, len(self.argstack)-n)  # explicit split index: [-0:] / [:-0] returned everything and emptied the stack
+    (x, self.argstack) = (self.argstack[k:], self.argstack[:k])
+    return x
+
+  def get_current_state(self):
+    return (self.ctm, self.textstate)
+  
+  def set_current_state(self, state):
+    (self.ctm, self.textstate) = state
+    self.device.set_ctm(self.ctm)
+    return
+
+  # gsave
+  def do_q(self):
+    self.gstack.append(self.get_current_state())
+    return
+  # grestore
+  def do_Q(self):
+    if self.gstack:
+      self.set_current_state(self.gstack.pop())
+    return
+  
+  # concat-matrix
+  def do_cm(self, a1, b1, c1, d1, e1, f1):
+    self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
+    self.device.set_ctm(self.ctm)
+    return
+  
+  # setlinewidth
+  def do_w(self, width): return
+  # setlinecap
+  def do_J(self, cap): return
+  # setlinejoin
+  def do_j(self, join): return
+  # setmiterlimit
+  def do_M(self, limit): return
+  # setdash
+  def do_d(self, dash, phase): return
+  # setintent
+  def do_ri(self, intent): return
+  # setflatness
+  def do_i(self, flatness): return
+  # savedict
+  def do_gs(self, name): return
+  
+  # moveto
+  def do_m(self, x, y): return
+  # lineto
+  def do_l(self, x, y): return
+  # curveto
+  def do_c(self, x1, y1, x2, y2, x3, y3): return
+  # urveto
+  def do_v(self, x2, y2, x3, y3): return
+  # rveto
+  def do_y(self, x1, y1, x3, y3): return
+  # closepath
+  def do_h(self): return
+  # rectangle
+  def do_re(self, x, y, w, h): return
+  
+  # stroke
+  def do_S(self): return
+  # close-and-stroke
+  def do_s(self): return
+  # fill
+  def do_f(self): return
+  # fill (obsolete)
+  do_F = do_f
+  # fill-even-odd
+  def do_f_a(self): return
+  # fill-and-stroke
+  def do_B(self): return
+  # fill-and-stroke-even-odd
+  def do_B_a(self): return
+  # close-fill-and-stroke
+  def do_b(self): return
+  # close-fill-and-stroke-even-odd
+  def do_b_a(self): return
+  # close-only
+  def do_n(self): return
+  # clip
+  def do_W(self): return
+  # clip-even-odd
+  def do_W_a(self): return
+  
+  # setcolorspace-stroking
+  def do_CS(self, name):
+    self.scs = self.csmap.get(literal_name(name), None)
+    return
+  # setcolorspace-non-stroking
+  def do_cs(self, name):
+    self.ncs = self.csmap.get(literal_name(name), None)
+    return
+  # setgray-stroking
+  def do_G(self, gray):
+    self.do_CS(LITERAL_DEVICE_GRAY)
+    return
+  # setgray-non-stroking
+  def do_g(self, gray):
+    self.do_cs(LITERAL_DEVICE_GRAY)
+    return
+  # setrgb-stroking
+  def do_RG(self, r, g, b):
+    self.do_CS(LITERAL_DEVICE_RGB)
+    return
+  # setrgb-non-stroking
+  def do_rg(self, r, g, b):
+    self.do_cs(LITERAL_DEVICE_RGB)
+    return
+  # setcmyk-stroking
+  def do_K(self, c, m, y, k):
+    self.do_CS(LITERAL_DEVICE_CMYK)
+    return
+  # setcmyk-non-stroking
+  def do_k(self, c, m, y, k):
+    self.do_cs(LITERAL_DEVICE_CMYK)
+    return
+
+  # setcolor
+  def do_SCN(self):
+    n = cs_params(self.scs)
+    self.pop(n)
+    return
+  def do_scn(self):
+    n = cs_params(self.ncs)
+    self.pop(n)
+    return
+  def do_SC(self):
+    self.do_SCN()
+    return
+  def do_sc(self):
+    self.do_scn()
+    return
+    
+  # paint-shading (operand is the shading name)
+  def do_sh(self, name): return
+  
+  # begin-text
+  def do_BT(self):
+    self.textstate.reset()
+    return
+  # end-text
+  def do_ET(self):
+    return
+
+  # begin-compat
+  def do_BX(self): return
+  # end-compat
+  def do_EX(self): return
+
+  # marked content operators
+  def do_MP(self, tag): return
+  def do_DP(self, tag, props): return
+  def do_BMC(self, tag): return
+  def do_BDC(self, tag, props): return
+  def do_EMC(self): return
+
+  # setcharspace
+  def do_Tc(self, space):
+    self.textstate.charspace = space
+    return
+  # setwordspace
+  def do_Tw(self, space):
+    self.textstate.wordspace = space
+    return
+  # textscale
+  def do_Tz(self, scale):
+    self.textstate.scaling = scale
+    return
+  # setleading
+  def do_TL(self, leading):
+    self.textstate.leading = leading
+    return
+  # selectfont
+  def do_Tf(self, fontid, fontsize):
+    try:
+      self.textstate.font = self.fontmap[literal_name(fontid)]
+    except KeyError:
+      raise PDFInterpreterError('Undefined font id: %r' % fontid)
+    self.textstate.fontsize = fontsize
+    return
+  # setrendering
+  def do_Tr(self, render):
+    self.textstate.render = render
+    return
+  # settextrise
+  def do_Ts(self, rise):
+    self.textstate.rise = rise
+    return
+
+  # text-move (Td): translate the text matrix by (tx,ty), given in text space
+  def do_Td(self, tx, ty):
+    (a,b,c,d,e,f) = self.textstate.matrix
+    self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)  # [1 0 0 1 tx ty] x matrix; equals (e+tx,f+ty) when a=d=1, b=c=0
+    self.textstate.linematrix = (0, 0)  # drop the advance accumulated by do_TJ
+    return
+  # text-move-set-leading (TD): like Td, and also sets the leading to -ty
+  def do_TD(self, tx, ty):
+    (a,b,c,d,e,f) = self.textstate.matrix
+    self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)  # [1 0 0 1 tx ty] x matrix; equals (e+tx,f+ty) when a=d=1, b=c=0
+    self.textstate.leading = -ty  # a downward move (ty < 0) yields a positive leading
+    self.textstate.linematrix = (0, 0)  # drop the advance accumulated by do_TJ
+    return
+  # textmatrix
+  def do_Tm(self, a,b,c,d,e,f):
+    self.textstate.matrix = (a,b,c,d,e,f)
+    self.textstate.linematrix = (0, 0)
+    return
+  # nextline (T*): behaves like "0 -leading Td", i.e. repeats the move a TD made
+  def do_T_a(self):
+    (a,b,c,d,e,f) = self.textstate.matrix
+    self.textstate.matrix = (a,b,c,d,e-self.textstate.leading*c,f-self.textstate.leading*d)  # leading is stored positive (do_TD keeps -ty), so subtract
+    self.textstate.linematrix = (0, 0)  # drop the advance accumulated by do_TJ
+    return
+  
+  # show-pos (TJ): render the items of seq and advance the in-line position
+  def do_TJ(self, seq):
+    textstate = self.textstate
+    font = textstate.font
+    (a,b,c,d,e,f) = textstate.matrix
+    (lx,ly) = textstate.linematrix  # advance accumulated since the matrix was last set
+    s = ''.join( x for x in seq if isinstance(x, str) )  # all string items, concatenated
+    n = sum( x for x in seq if not isinstance(x, str) )  # total of the non-string (numeric) items
+    w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +  # (width - adjustments), presumably in 1/1000 units - TODO confirm
+         len(s) * textstate.charspace +  # one charspace per byte of s
+         s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0  # one wordspace per ' '; scaling is a percentage
+    self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)  # origin shifted by the advance so far
+    if font.is_vertical():
+      ly += w  # vertical fonts advance along y
+    else:
+      lx += w  # horizontal fonts advance along x
+    textstate.linematrix = (lx,ly)
+    return
+  # show
+  def do_Tj(self, s):
+    self.do_TJ([s])
+    return
+  # quote
+  def do__q(self, s):
+    self.do_T_a()
+    self.do_TJ([s])
+    return
+  # doublequote ("): set word and char spacing, then act like the quote (') operator
+  def do__w(self, aw, ac, s):
+    self.do_Tw(aw)
+    self.do_Tc(ac)
+    self.do__q(s)  # moves to the next line (T*) before showing s
+    return
+
+  # inline image
+  def do_BI(self): # never called
+    return
+  def do_ID(self): # never called
+    return
+  def do_EI(self, obj):
+    return
+
+  # invoke an XObject (Do): only Form XObjects are rendered, any other subtype is ignored
+  def do_Do(self, xobjid):
+    xobjid = literal_name(xobjid)
+    try:
+      xobj = stream_value(self.xobjmap[xobjid])  # xobjmap holds unresolved entries; resolve on use
+    except KeyError:
+      raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
+    if xobj.dic['Subtype'] == LITERAL_FORM:  # NOTE(review): assumes Subtype is always present
+      if 1 <= self.debug:
+        print >>stderr, 'Processing xobj: %r' % xobj
+      interpreter = PDFPageInterpreter(self.rsrc, self.device)  # separate interpreter sharing rsrc and device
+      interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj],  # NOTE(review): assumes the form carries its own Resources
+                                  xobj.dic.get('Matrix', MATRIX_IDENTITY))  # form matrix becomes the initial ctm
+    return
+
+  def process_page(self, page):
+    if 1 <= self.debug:
+      print >>stderr, 'Processing page: %r' % page
+    self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
+    return
+
+  def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):  # register resources, then run every content stream
+    self.initpage(ctm)
+    self.device.begin_block(contid)
+    # Handle resource declarations; resource categories not listed below are ignored.
+    for (k,v) in dict_value(resources).iteritems():
+      if 1 <= self.debug:
+        print >>stderr, 'Resource: %r: %r' % (k,v)
+      if k == 'Font':
+        for (fontid,fontrsrc) in dict_value(v).iteritems():
+          self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)  # looked up later by do_Tf
+      elif k == 'ColorSpace':
+        for (csid,csspec) in dict_value(v).iteritems():
+          self.csmap[csid] = list_value(csspec)  # NOTE(review): list_value rejects a non-list spec - confirm bare names cannot occur
+      elif k == 'ProcSet':
+        self.rsrc.get_procset(list_value(v))
+      elif k == 'XObject':
+        for (xobjid,xobjstrm) in dict_value(v).iteritems():
+          self.xobjmap[xobjid] = xobjstrm  # kept unresolved; do_Do resolves it on use
+    for stream in list_value(contents):
+      self.execute(stream_value(stream))
+    self.device.end_block()
+    return
+  
+  def execute(self, stream):  # run one content stream: operands are pushed, operators dispatch to do_* methods
+    for obj in stream.parse_data(inline=True, debug=self.debug):
+      if isinstance(obj, PSKeyword):
+        name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')  # e.g. T* -> do_T_a, ' -> do__q, " -> do__w
+        if hasattr(self, name):
+          func = getattr(self, name)
+          nargs = func.func_code.co_argcount-1  # operand count = declared parameters minus self
+          if nargs:
+            args = self.pop(nargs)
+            if 1 <= self.debug:
+              print >>stderr, 'exec: %s %r' % (obj.name, args)
+            if len(args) == nargs:  # the operator is skipped when fewer operands came back
+              func(*args)
+          else:
+            if 1 <= self.debug:
+              print >>stderr, 'exec: %s' % (obj.name)
+            func()
+        else:
+          raise PDFInterpreterError('unknown operator: %r' % obj.name)
+      else:
+        self.push(obj)  # anything that is not a keyword is an operand
+    return
diff --git a/pdfparser.py b/pdfparser.py
index ed5d289..a12c7a1 100755
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -15,858 +15,121 @@
 #   - Encryption?
 
 import sys, re
-from struct import pack, unpack
-try:
-  from cStringIO import StringIO
-except ImportError:
-  from StringIO import StringIO
-try:
-  import cdb
-except ImportError:
-  import pycdb as cdb
 stderr = sys.stderr
+from utils import choplist, nunpack
+from psparser import PSException, PSSyntaxError, PSTypeError, \
+     PSLiteral, PSKeyword, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
+     PSStackParser
 
 
-##  Utilities
+##  PDF Exceptions
 ##
-def choplist(n, seq):
-  '''Groups every n elements of the list.'''
-  r = []
-  for x in seq:
-    r.append(x)
-    if len(r) == n:
-      yield tuple(r)
-      r = []
-  return
-
-def nunpack(s, default=0):
-  '''Unpacks up to 4 bytes.'''
-  l = len(s)
-  if not l:
-    return default
-  elif l == 1:
-    return ord(s)
-  elif l == 2:
-    return unpack('>H', s)[0]
-  elif l == 3:
-    return unpack('>L', '\x00'+s)[0]
-  elif l == 4:
-    return unpack('>L', s)[0]
-  else:
-    return TypeError('invalid length: %d' % l)
-
-def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
-  '''Multiplies two matrices.'''
-  return (a0*a1+c0*b1,    b0*a1+d0*b1,
-          a0*c1+c0*d1,    b0*c1+d0*d1,
-          a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
-
-def apply_matrix((a,b,c,d,e,f), (x,y)):
-  '''Applies a matrix to a coordination.'''
-  return (a*x+c*y+e, b*x+d*y+f)
-
-
-##  Exceptions
-##
-class PSException(Exception): pass
-class PSSyntaxError(PSException): pass
-class PSTypeError(PSException): pass
-class PSValueError(PSException): pass
 class PDFException(PSException): pass
 class PDFSyntaxError(PDFException): pass
 class PDFEncrypted(PDFException): pass
 class PDFTypeError(PDFException): pass
 class PDFValueError(PDFException): pass
-class PDFResourceError(PDFException): pass
-class PDFInterpreterError(PDFException): pass
-class PDFFontError(PDFException): pass
-class PDFUnicodeNotDefined(PDFFontError): pass
 
 
-##  PostScript Types
-##
-class PSLiteral:
-  '''
-  PS literals (e.g. "/Name").
-  Caution: Never create these objects directly.
-  Use PSLiteralTable.intern() instead.
-  '''
-  def __init__(self, name):
-    self.name = name
-    return
-  def __repr__(self):
-    return '/%s' % self.name
-
-class PSKeyword:
-  '''
-  PS keywords (e.g. "showpage").
-  Caution: Never create these objects directly.
-  Use PSKeywordTable.intern() instead.
-  '''
-  def __init__(self, name):
-    self.name = name
-    return
-  def __repr__(self):
-    return self.name
-
-class PSSymbolTable:
-  '''
-  Symbol table that stores PSLiteral or PSKeyword.
-  '''
-  def __init__(self, classe):
-    self.dic = {}
-    self.classe = classe
-    return
-  
-  def intern(self, name):
-    if name in self.dic:
-      lit = self.dic[name]
-    else:
-      lit = self.classe(name)
-      self.dic[name] = lit
-    return lit
-
-PSLiteralTable = PSSymbolTable(PSLiteral)
-PSKeywordTable = PSSymbolTable(PSKeyword)
-
 # some predefined literals and keywords.
 LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
-LITERAL_PDF = PSLiteralTable.intern('PDF')
-LITERAL_TEXT = PSLiteralTable.intern('Text')
 LITERAL_XREF = PSLiteralTable.intern('XRef')
-LITERAL_FONT = PSLiteralTable.intern('Font')
 LITERAL_PAGE = PSLiteralTable.intern('Page')
-LITERAL_FORM = PSLiteralTable.intern('Form')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
 LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
-LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
 KEYWORD_EI = PSKeywordTable.intern('EI')
 
 
-##  CMap
+##  PDFObjRef
 ##
-class CMap:
+class PDFObjRef:
   
-  def __init__(self, debug=0):
-    self.debug = 0
-    self.code2cid = {}
-    self.cid2code = {}
-    self.attrs = {}
+  def __init__(self, doc, objid, genno):
+    if objid == 0:
+      raise PDFValueError('objid cannot be 0.')
+    self.doc = doc
+    self.objid = objid
+    #self.genno = genno  # Never used.
     return
 
   def __repr__(self):
-    return '<CMap: %s>' % self.attrs.get('CMapName')
+    return '<PDFObjRef:%d>' % (self.objid)
 
-  def update(self, code2cid=None, cid2code=None):
-    if code2cid:
-      self.code2cid.update(code2cid)
-    if cid2code:
-      self.cid2code.update(cid2code)
-    return self
-    
-  def copycmap(self, cmap):
-    self.code2cid.update(cmap.getall_code2cid())
-    self.cid2code.update(cmap.getall_cid2code())
-    return self
-
-  def register_code2cid(self, code, cid):
-    assert isinstance(code, str)
-    assert isinstance(cid, int)
-    self.code2cid[code] = cid
-    return self
-
-  def register_cid2code(self, cid, code):
-    from glyphlist import charname2unicode
-    assert isinstance(cid, int)
-    if isinstance(code, PSLiteral):
-      code = pack('>H', charname2unicode[code.name])
-    self.cid2code[cid] = code
-    return self
-
-  def decode(self, bytes):
-    if self.debug:
-      print >>stderr, 'decode: %r, %r' % (self, bytes)
-    x = ''
-    for c in bytes:
-      if x:
-        if x+c in self.code2cid:
-          yield self.code2cid[x+c]
-        x = ''
-      elif c in self.code2cid:
-        yield self.code2cid[c]
-      else:
-        x = c
-    return
-  
-  def is_vertical(self):
-    return self.attrs.get('WMode', '0') == '1'
-
-  def tocid(self, code):
-    return self.code2cid.get(code)
-  def tocode(self, cid):
-    return self.cid2code.get(cid)
-
-  def getall_attrs(self):
-    return self.attrs.iteritems()
-  def getall_code2cid(self):
-    return self.code2cid.iteritems()
-  def getall_cid2code(self):
-    return self.cid2code.iteritems()
-
-  
-##  CDBCMap
-##
-class CDBCMap(CMap):
-  
-  def __init__(self, cdbname, debug=0):
-    CMap.__init__(self, debug=debug)
-    self.cdbname = cdbname
-    self.db = cdb.init(cdbname)
-    return
-
-  def __repr__(self):
-    return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
-
-  def tocid(self, code):
-    k = 'c'+code
-    if not self.db.has_key(k):
-      return None
-    return unpack('>L', self.db[k])
-  def tocode(self, cid):
-    k = 'i'+pack('>L', cid)
-    if not self.db.has_key(k):
-      return None
-    return self.db[k]
-  
-  def is_vertical(self):
-    return (self.db.has_key('/WMode') and
-            self.db['/WMode'] == '1')
-
-  def getall(self, c):
-    while 1:
-      x = self.db.each()
-      if not x: break
-      (k,v) = x
-      if k.startswith(c):
-        yield (k[1:], unpack('>L', v)[0])
-    return
-
-  def getall_attrs(self):
-    while 1:
-      x = self.db.each()
-      if not x: break
-      (k,v) = x
-      if k.startswith('/'):
-        yield (k[1:], eval(v)[0])
-    return
-  
-  def getall_cid2code(self):
-    return self.getall('i')
-  def getall_code2cid(self):
-    return self.getall('c')
-
-  def decode(self, bytes):
-    if self.debug:
-      print >>stderr, 'decode: %r, %r' % (self, bytes)
-    x = ''
-    for c in bytes:
-      if x:
-        if x+c in self.code2cid:
-          yield self.code2cid[x+c]
-        elif self.db.has_key('c'+x+c):
-          (dest,) = unpack('>L', self.db['c'+x+c])
-          self.code2cid[x+c] = dest
-          yield dest
-        x = ''
-      elif c in self.code2cid:
-        yield self.code2cid[c]
-      elif self.db.has_key('c'+c):
-        (dest,) = unpack('>L', self.db['c'+c])
-        self.code2cid[c] = dest
-        yield dest
-      else:
-        x = c
-    return
+  def resolve(self):
+    return self.doc.getobj(self.objid)
 
 
-##  CMapDB
-##
-class CMapDB:
-
-  CMAP_ALIAS = {
-    }
-  
-  debug = 0
-  dirname = None
-  cdbdirname = None
-  cmapdb = {}
-
-  @classmethod
-  def initialize(klass, dirname, cdbdirname=None, debug=0):
-    klass.dirname = dirname
-    klass.cdbdirname = cdbdirname or dirname
-    klass.debug = debug
-    return
-
-  @classmethod
-  def get_cmap(klass, cmapname):
-    import os.path
-    cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
-    if cmapname in klass.cmapdb:
-      cmap = klass.cmapdb[cmapname]
-    else:
-      fname = os.path.join(klass.dirname, cmapname)
-      cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
-      if os.path.exists(cdbname):
-        if 1 <= klass.debug:
-          print >>stderr, 'Opening: CDBCMap %r...' % cdbname
-        cmap = CDBCMap(cdbname)
-      elif os.path.exists(fname):
-        if 1 <= klass.debug:
-          print >>stderr, 'Reading: CMap %r...' % fname
-        cmap = CMap()
-        fp = file(fname)
-        CMapParser(cmap, fp).parse()
-        fp.close()
-      klass.cmapdb[cmapname] = cmap
-    return cmap
-
-
-##  FontMetricsDB
-##
-class FontMetricsDB:
-  from fontmetrics import FONT_METRICS
-  
-  @classmethod
-  def get_metrics(klass, fontname):
-    return klass.FONT_METRICS[fontname]
-
-
-##  EncodingDB
-##
-class EncodingDB:
-      
-  from glyphlist import charname2unicode
-  from latin_enc import ENCODING
-  std2unicode = {}
-  mac2unicode = {}
-  win2unicode = {}
-  pdf2unicode = {}
-  for (name,std,mac,win,pdf) in ENCODING:
-    c = unichr(charname2unicode[name])
-    if std: std2unicode[std] = c
-    if mac: mac2unicode[mac] = c
-    if win: win2unicode[win] = c
-    if pdf: pdf2unicode[pdf] = c
-  encodings = {
-    'StandardEncoding': std2unicode,
-    'MacRomanEncoding': mac2unicode,
-    'WinAnsiEncoding': win2unicode,
-    'PDFDocEncoding': pdf2unicode,
-    }
-  
-  @classmethod
-  def get_encoding(klass, name, diff=None):
-    cid2unicode = klass.encodings.get(name, klass.std2unicode)
-    if diff:
-      cid2unicode = cid2unicode.copy()
-      cid = 0
-      for x in diff:
-        if isinstance(x, int):
-          cid = x
-        elif isinstance(x, PSLiteral):
-          try:
-            cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name])
-          except KeyError:
-            pass
-          cid += 1
-    return cid2unicode
-  
-
-##  Color Spaces
-##
-LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
-LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
-LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
-LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
-LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')
-CS_COMPONENTS = {
-  PSLiteralTable.intern('CalRGB'): 3,
-  PSLiteralTable.intern('CalGray'): 1,
-  PSLiteralTable.intern('Lab'): 3,
-  PSLiteralTable.intern('DeviceRGB'): 3,
-  PSLiteralTable.intern('DeviceCMYK'): 4,
-  PSLiteralTable.intern('DeviceGray'): 1,
-  PSLiteralTable.intern('Separation'): 1,
-  PSLiteralTable.intern('Indexed'): 1,
-  PSLiteralTable.intern('Pattern'): 1,
-  }
-
-def cs_params(cs):
-  t = cs[0]
-  if t == LITERAL_ICC_BASED:
-    return stream_value(cs[1]).dic['N']
-  elif t == LITERAL_DEVICE_N:
-    return len(list_value(cs[1]))
-  else:
-    return CS_COMPONENTS[t]
-
-
-##  PSBaseParser
-##
-class PSBaseParser:
-
-  '''PostScript parser that performs only basic tokenization.'''
-
-  def __init__(self, fp, debug=0):
-    self.fp = fp
-    self.debug = debug
-    self.bufsize = 4096
-    self.seek(0)
-    return
-
-  def __repr__(self):
-    return '<PSBaseParser: %r>' % (self.fp,)
-
-  def seek(self, pos):
-    '''
-    seeks to the given pos.
-    '''
-    if 2 <= self.debug:
-      print >>stderr, 'seek:', pos
-    self.fp.seek(pos)
-    self.linepos = pos
-    self.linebuf = None
-    self.curpos = 0
-    self.line = ''
-    return
-  
-  EOLCHAR = re.compile(r'[\r\n]')
-  def nextline(self):
-    '''
-    fetches the next line that ends either with \\r or \\n.
-    '''
-    line = ''
-    eol = None
-    while 1:
-      if not self.linebuf or len(self.linebuf) <= self.curpos:
-        # fetch next chunk.
-        self.linebuf = self.fp.read(self.bufsize)
-        if not self.linebuf:
-          # at EOF.
-          break
-        self.curpos = 0
-      if eol:
-        c = self.linebuf[self.curpos]
-        # handle '\r\n'
-        if (eol == '\r' and c == '\n'):
-          line += c
-          self.curpos += 1
-        break
-      m = self.EOLCHAR.search(self.linebuf, self.curpos)
-      if m:
-        i = m.end(0)
-        line += self.linebuf[self.curpos:i]
-        eol = self.linebuf[i-1]
-        self.curpos = i
-      else:
-        # fetch further
-        line += self.linebuf[self.curpos:]
-        self.linebuf = None
-    self.linepos += len(line)
-    return line
-
-  def revreadlines(self):
-    '''
-    fetches lines backword. used to locate trailers.
-    '''
-    self.fp.seek(0, 2)
-    pos = self.fp.tell()
-    buf = ''
-    while 0 < pos:
-      pos = max(0, pos-self.bufsize)
-      self.fp.seek(pos)
-      s = self.fp.read(self.bufsize)
-      if not s: break
-      while 1:
-        n = max(s.rfind('\r'), s.rfind('\n'))
-        if n == -1:
-          buf = s + buf
-          break
-        yield buf+s[n:]
-        s = s[:n]
-        buf = ''
-    return
-
-  SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
-  TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
-  LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
-  NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
-  STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
-  STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
-  STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
-  STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
-
-  def parse(self):
-    '''
-    Yields a list of basic tokens: keywords, literals, strings, 
-    numbers and parentheses. Comments are skipped.
-    Nested objects (i.e. arrays and dictionaries) are not handled.
-    '''
-    while 1:
-      # do not strip line! we need to distinguish last '\n' or '\r'
-      linepos0 = self.linepos
-      self.line = self.nextline()
-      if not self.line: break
-      if 2 <= self.debug:
-        print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
-      # do this before removing comment
-      if self.line.startswith('%%EOF'): break
-      charpos = 0
-      
-      # tokenize
-      while 1:
-        m = self.TOKEN.search(self.line, charpos)
-        if not m: break
-        t = m.group(0)
-        pos = linepos0 + m.start(0)
-        charpos = m.end(0)
-        
-        if t == '%':
-          # skip comment
-          if 2 <= self.debug:
-            print >>stderr, 'comment: %r' % self.line[charpos:]
-          break
-        
-        elif t == '/':
-          # literal object
-          mn = self.LITERAL.match(self.line, m.start(0)+1)
-          lit = PSLiteralTable.intern(mn.group(0))
-          yield (pos, lit)
-          charpos = mn.end(0)
-          if 2 <= self.debug:
-            print >>stderr, 'name: %r' % lit
-            
-        elif t == '(':
-          # normal string object
-          s = ''
-          while 1:
-            ms = self.STRING_NORM.match(self.line, charpos)
-            if not ms: break
-            s1 = ms.group(0)
-            charpos = ms.end(0)
-            if len(s1) == 1 and s1[-1] == '\\':
-              s += s1[-1:]
-              self.line = self.nextline()
-              if not self.line:
-                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
-                                    (self.linepos, self.line))
-              charpos = 0
-            elif charpos == len(self.line):
-              s += s1
-              self.line = self.nextline()
-              if not self.line:
-                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
-                                    (self.linepos, self.line))
-              charpos = 0
-            else:
-              s += s1
-              break
-          if self.line[charpos] != ')':
-            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
-                                (self.linepos, self.line))
-          charpos += 1
-          def convesc(m):
-            x = m.group(0)
-            if x[1:].isdigit():
-              return chr(int(x[1:], 8))
-            else:
-              return x[1]
-          s = self.STRING_NORM_SUB.sub(convesc, s)
-          if 2 <= self.debug:
-            print >>stderr, 'str: %r' % s
-          yield (pos, s)
-          
-        elif t == '<':
-          # hex string object
-          ms = self.STRING_HEX.match(self.line, charpos)
-          charpos = ms.end(0)
-          if self.line[charpos] != '>':
-            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
-                                (self.linepos, self.line))
-          charpos += 1
-          def convhex(m1):
-            return chr(int(m1.group(0), 16))
-          s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
-          if 2 <= self.debug:
-            print >>stderr, 'str: %r' % s
-          yield (pos, s)
-
-        elif self.NUMBER.match(t):
-          # number
-          if '.' in t:
-            n = float(t)
-          else:
-            n = int(t)
-          if 2 <= self.debug:
-            print >>stderr, 'number: %r' % n
-          yield (pos, n)
-
-        elif t in ('true','false'):
-          # boolean
-          if 2 <= self.debug:
-            print >>stderr, 'boolean: %r' % t
-          yield (pos, (t == 'true'))
-        
-        else:
-          # other token
-          if 2 <= self.debug:
-            print >>stderr, 'keyword: %r' % t
-          yield (pos, PSKeywordTable.intern(t))
-
-    return
-
-
-##  PSStackParser
-##
-class PSStackParser(PSBaseParser):
-
+# resolve
+def resolve1(x):
   '''
-  PostScript parser that recognizes compound objects
-  such as arrays and dictionaries.
+  Resolve an object. If this is an array or dictionary,
+  it may still contain some indirect objects inside.
   '''
-  
-  def __init__(self, fp, debug=0):
-    PSBaseParser.__init__(self, fp, debug=debug)
-    self.context = []
-    self.partobj = None
-    return
+  while isinstance(x, PDFObjRef):
+    x = x.resolve()
+  return x
 
-  def do_token(self, pos, token):
-    '''
-    Handles special tokens.
-    Returns true if the token denotes the end of an object.
-    '''
-    return False
+def resolveall(x):
+  '''
+  Recursively resolve X and all the internals.
+  Make sure there is no indirect reference within the nested object.
+  This procedure might be slow. Do not use it unless
+  you really need it.
+  '''
+  while isinstance(x, PDFObjRef):
+    x = x.resolve()
+  if isinstance(x, list):
+    x = [ resolveall(v) for v in x ]
+  elif isinstance(x, dict):
+    for (k,v) in x.iteritems():
+      x[k] = resolveall(v)
+  return x
 
-  def push(self, obj):
-    '''
-    Push an object to the stack.
-    '''
-    self.partobj.append(obj)
-    return
+# Type checking
+def int_value(x):
+  x = resolve1(x)
+  if not isinstance(x, int):
+    raise PDFTypeError('integer required: %r' % x)
+  return x
 
-  def pop(self, n):
-    '''
-    Pop N objects from the stack.
-    '''
-    if len(self.partobj) < n:
-      raise PSSyntaxError('stack too short < %d' % n)
-    r = self.partobj[-n:]
-    self.partobj = self.partobj[:-n]
-    return r
-  
-  def popall(self):
-    '''
-    Discards all the objects on the stack.
-    '''
-    self.partobj = []
-    return
+def float_value(x):
+  x = resolve1(x)
+  if not isinstance(x, float):
+    raise PDFTypeError('float required: %r' % x)
+  return x
 
-  def parse(self):
-    '''
-    Yields a list of objects: keywords, literals, strings, 
-    numbers, arrays and dictionaries. Arrays and dictionaries
-    are represented as Python sequence and dictionaries.
-    '''
-    
-    def startobj(type):
-      self.context.append((type, self.partobj))
-      self.partobj = []
-      return
+def num_value(x):
+  x = resolve1(x)
+  if not (isinstance(x, int) or isinstance(x, float)):
+    raise PDFTypeError('int or float required: %r' % x)
+  return x
 
-    def endobj(type1):
-      assert self.context
-      obj = self.partobj
-      (type0, self.partobj) = self.context.pop()
-      if type0 != type1:
-        raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
-                          (type0, self.partobj, type1, obj))
-      return obj
+def str_value(x):
+  x = resolve1(x)
+  if not isinstance(x, str):
+    raise PDFTypeError('string required: %r' % x)
+  return x
 
-    startobj('o')
+def list_value(x):
+  x = resolve1(x)
+  if not isinstance(x, list):
+    raise PDFTypeError('list required: %r' % x)
+  return x
 
-    for (pos,t) in PSBaseParser.parse(self):
-      if isinstance(t, int) or isinstance(t, float):
-        self.push(t)
-      elif isinstance(t, str):
-        self.push(t)
-      elif isinstance(t, PSLiteral):
-        self.push(t)
-      else:
-        c = keyword_name(t)
-        if c == '{' or c == '}':
-          self.push(t)
-        elif c == '[':
-          # begin array
-          if 2 <= self.debug:
-            print >>stderr, 'start array'
-          startobj('a')
-        elif c == ']':
-          # end array
-          a = endobj('a')
-          if 2 <= self.debug:
-            print >>stderr, 'end array: %r' % a
-          self.push(a)
-        elif c == '<<':
-          # begin dictionary
-          if 2 <= self.debug:
-            print >>stderr, 'start dict'
-          startobj('d')
-        elif c == '>>':
-          # end dictionary
-          objs = endobj('d')
-          if len(objs) % 2 != 0:
-            raise PSTypeError('invalid dictionary construct: %r' % objs)
-          d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
-          if 2 <= self.debug:
-            print >>stderr, 'end dict: %r' % d
-          self.push(d)
-        elif self.do_token(pos, t):
-          break
+def dict_value(x):
+  x = resolve1(x)
+  if not isinstance(x, dict):
+    raise PDFTypeError('dict required: %r' % x)
+  return x
 
-    return endobj('o')
-
-
-##  CMapParser
-##
-class CMapParser(PSStackParser):
-
-  def __init__(self, cmap, fp, debug=0):
-    PSStackParser.__init__(self, fp, debug=debug)
-    self.cmap = cmap
-    self.in_cmap = False
-    return
-
-  def do_token(self, pos, token):
-    name = token.name
-    if name == 'begincmap':
-      self.in_cmap = True
-      self.popall()
-      return
-    elif name == 'endcmap':
-      self.in_cmap = False
-      return
-    if not self.in_cmap: return
-    #
-    if name == 'def':
-      try:
-        (k,v) = self.pop(2)
-        self.cmap.attrs[literal_name(k)] = v
-      except PSSyntaxError:
-        pass
-      return
-    
-    if name == 'usecmap':
-      try:
-        (cmapname,) = self.pop(1)
-        self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
-      except PSSyntaxError:
-        pass
-      return
-      
-    if name == 'begincodespacerange':
-      self.popall()
-      return
-    if name == 'endcodespacerange':
-      if 1 <= self.debug:
-        print >>stderr, 'codespace: %r' % self.partobj
-      self.popall()
-      return
-    
-    if name == 'begincidrange':
-      self.popall()
-      return
-    if name == 'endcidrange':
-      for (s,e,cid) in choplist(3, self.partobj):
-        assert isinstance(s, str)
-        assert isinstance(e, str)
-        assert isinstance(cid, int)
-        assert len(s) == len(e)
-        sprefix = s[:-4]
-        eprefix = e[:-4]
-        assert sprefix == eprefix
-        svar = s[-4:]
-        evar = e[-4:]
-        s1 = nunpack(svar)
-        e1 = nunpack(evar)
-        vlen = len(svar)
-        assert s1 <= e1
-        for i in xrange(e1-s1+1):
-          x = sprefix+pack('>L',s1+i)[-vlen:]
-          self.cmap.register_code2cid(x, cid+i)
-      self.popall()
-      return
-    
-    if name == 'begincidchar':
-      self.popall()
-      return
-    if name == 'endcidchar':
-      for (cid,code) in choplist(2, self.partobj):
-        assert isinstance(code, str)
-        assert isinstance(cid, str)
-        self.cmap.register_code2cid(code, nunpack(cid))
-      self.popall()
-      return
-        
-    if name == 'beginbfrange':
-      self.popall()
-      return
-    if name == 'endbfrange':
-      for (s,e,code) in choplist(3, self.partobj):
-        assert isinstance(s, str)
-        assert isinstance(e, str)
-        assert len(s) == len(e)
-        s1 = nunpack(s)
-        e1 = nunpack(e)
-        assert s1 <= e1
-        if isinstance(code, list):
-          for i in xrange(e1-s1+1):
-            self.cmap.register_cid2code(s1+i, code[i])
-        else:
-          var = code[-4:]
-          base = nunpack(var)
-          prefix = code[:-4]
-          vlen = len(var)
-          for i in xrange(e1-s1+1):
-            x = prefix+pack('>L',base+i)[-vlen:]
-            self.cmap.register_cid2code(s1+i, x)
-      self.popall()
-      return
-        
-    if name == 'beginbfchar':
-      self.popall()
-      return
-    if name == 'endbfchar':
-      for (cid,code) in choplist(2, self.partobj):
-        assert isinstance(cid, str)
-        assert isinstance(code, str)
-        self.cmap.register_cid2code(nunpack(cid), code)
-      self.popall()
-      return
-        
-    if name == 'beginnotdefrange':
-      self.popall()
-      return
-    if name == 'endnotdefrange':
-      if 1 <= self.debug:
-        print >>stderr, 'notdefrange: %r' % self.partobj
-      self.popall()
-      return
-    
-    return
+def stream_value(x):
+  x = resolve1(x)
+  if not isinstance(x, PDFStream):
+    raise PDFTypeError('stream required: %r' % x)
+  return x
 
 
 ##  PDFStream type
@@ -934,111 +197,14 @@ class PDFStream:
     return self.data
 
   def parse_data(self, inline=False, debug=0):
+    try:
+      from cStringIO import StringIO
+    except ImportError:
+      from StringIO import StringIO
     return PDFParser(self.doc, StringIO(self.get_data()),
                      inline=inline, debug=debug).parse()
   
 
-##  PDFObjRef
-##
-class PDFObjRef:
-  
-  def __init__(self, doc, objid, genno):
-    if objid == 0:
-      raise PDFValueError('objid cannot be 0.')
-    self.doc = doc
-    self.objid = objid
-    #self.genno = genno  # Never used.
-    return
-
-  def __repr__(self):
-    return '<PDFObjRef:%d>' % (self.objid)
-
-  def resolve(self):
-    return self.doc.getobj(self.objid)
-
-
-# resolve
-def resolve1(x):
-  '''
-  Resolve an object. If this is an array or dictionary,
-  it may still contains some indirect objects inside.
-  '''
-  while isinstance(x, PDFObjRef):
-    x = x.resolve()
-  return x
-
-def resolveall(x):
-  '''
-  Recursively resolve X and all the internals.
-  Make sure there is no indirect reference within the nested object.
-  This procedure might be slow. Do not used it unless
-  you really need it.
-  '''
-  while isinstance(x, PDFObjRef):
-    x = x.resolve()
-  if isinstance(x, list):
-    x = [ resolveall(v) for v in x ]
-  elif isinstance(x, dict):
-    for (k,v) in x.iteritems():
-      x[k] = resolveall(v)
-  return x
-
-# Type cheking
-def literal_name(x):
-  x = resolve1(x)
-  if not isinstance(x, PSLiteral):
-    raise PDFTypeError('literal required: %r' % x)
-  return x.name
-
-def keyword_name(x):
-  x = resolve1(x)
-  if not isinstance(x, PSKeyword):
-    raise PDFTypeError('keyword required: %r' % x)
-  return x.name
-
-def str_value(x):
-  x = resolve1(x)
-  if not isinstance(x, str):
-    raise PDFTypeError('string required: %r' % x)
-  return x
-
-def int_value(x):
-  x = resolve1(x)
-  if not isinstance(x, int):
-    raise PDFTypeError('integer required: %r' % x)
-  return x
-
-def float_value(x):
-  x = resolve1(x)
-  if not isinstance(x, float):
-    raise PDFTypeError('float required: %r' % x)
-  return x
-
-def num_value(x):
-  x = resolve1(x)
-  if not (isinstance(x, int) or isinstance(x, float)):
-    raise PDFTypeError('int or float required: %r' % x)
-  return x
-
-def list_value(x):
-  x = resolve1(x)
-  if not isinstance(x, list):
-    raise PDFTypeError('list required: %r' % x)
-  return x
-
-def dict_value(x):
-  x = resolve1(x)
-  if not isinstance(x, dict):
-    raise PDFTypeError('dict required: %r' % x)
-  return x
-
-def stream_value(x):
-  x = resolve1(x)
-  if not isinstance(x, PDFStream):
-    raise PDFTypeError('stream required: %r' % x)
-  return x
-
-
 ##  PDFPage
 ##
 class PDFPage:
@@ -1372,845 +538,3 @@ class PDFParser(PSStackParser):
       else:
         break
     return
-
-
-##  Fonts
-##
-
-# PDFFont
-class PDFFont:
-  
-  def __init__(self, fontid, descriptor, widths, default_width=None):
-    self.fontid = fontid
-    self.descriptor = descriptor
-    self.widths = widths
-    self.fontname = descriptor['FontName']
-    if isinstance(self.fontname, PSLiteral):
-      self.fontname = literal_name(self.fontname)
-    self.ascent = descriptor['Ascent']
-    self.descent = descriptor['Descent']
-    self.default_width = default_width or descriptor.get('MissingWidth', 0)
-    self.leading = descriptor.get('Leading', 0)
-    self.bbox = descriptor['FontBBox']
-    return
-
-  def __repr__(self):
-    return '<PDFFont: fontid=%r>' % (self.fontid,)
-
-  def is_vertical(self):
-    return False
-  
-  def decode(self, bytes):
-    return map(ord, bytes)
-
-  def char_width(self, cid):
-    return self.widths.get(cid, self.default_width)
-
-  def char_disp(self, cid):
-    return 0
-  
-  def string_width(self, s):
-    return sum( self.char_width(cid) for cid in self.decode(s) )
-  
-
-# PDFSimpleFont
-class PDFSimpleFont(PDFFont):
-  
-  def __init__(self, fontid, descriptor, widths, spec):
-    # Font encoding is specified either by a name of
-    # built-in encoding or a dictionary that describes
-    # the differences.
-    if 'Encoding' in spec:
-      encoding = resolve1(spec['Encoding'])
-    else:
-      encoding = LITERAL_STANDARD_ENCODING
-    if isinstance(encoding, dict):
-      name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
-      diff = encoding.get('Differences', None)
-      self.encoding = EncodingDB.get_encoding(name, diff)
-    else:
-      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
-    self.ucs2_cmap = None
-    if 'ToUnicode' in spec:
-      strm = stream_value(spec['ToUnicode'])
-      self.ucs2_cmap = CMap()
-      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
-    PDFFont.__init__(self, fontid, descriptor, widths)
-    return
-
-  def to_unicode(self, cid):
-    if not self.ucs2_cmap:
-      try:
-        return self.encoding[cid]
-      except KeyError:
-        raise PDFUnicodeNotDefined(None, cid)
-    code = self.ucs2_cmap.tocode(cid)
-    if not code:
-      raise PDFUnicodeNotDefined(None, cid)
-    chars = unpack('>%dH' % (len(code)/2), code)
-    return ''.join( unichr(c) for c in chars )
-
-
-# PDFType1Font
-class PDFType1Font(PDFSimpleFont):
-  
-  def __init__(self, fontid, spec):
-    if 'BaseFont' not in spec:
-      raise PDFFontError('BaseFont is missing')
-    self.basefont = literal_name(spec['BaseFont'])
-    try:
-      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
-    except KeyError:
-      try:
-        descriptor = dict_value(spec['FontDescriptor'])
-        firstchar = int_value(spec['FirstChar'])
-        lastchar = int_value(spec['LastChar'])
-        widths = dict( (i+firstchar,w) for (i,w)
-                       in enumerate(list_value(spec['Widths'])) )
-      except KeyError, k:
-        raise PDFFontError('%s is missing' % k)
-    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
-    return
-
-# PDFTrueTypeFont
-class PDFTrueTypeFont(PDFType1Font):
-  pass
-
-# PDFType3Font
-class PDFType3Font(PDFSimpleFont):
-  def __init__(self, fontid, spec):
-    try:
-      firstchar = int_value(spec['FirstChar'])
-      lastchar = int_value(spec['LastChar'])
-      widths = dict( (i+firstchar,w) for (i,w)
-                     in enumerate(list_value(spec['Widths'])) )
-    except KeyError, k:
-      raise PDFFontError('%s is missing' % k)
-    if 'FontDescriptor' in spec:
-      descriptor = dict_value(spec['FontDescriptor'])
-    else:
-      descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
-                    'FontBBox':spec['FontBBox']}
-    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
-    return
-
-# PDFCIDFont
-
-##  TrueTypeFont
-##
-class TrueTypeFont:
-
-  class CMapNotFound(Exception): pass
-  
-  def __init__(self, name, fp):
-    self.name = name
-    self.fp = fp
-    self.tables = {}
-    fonttype = fp.read(4)
-    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
-    for i in xrange(ntables):
-      (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
-      self.tables[name] = (offset, length)
-    return
-
-  def create_cmap(self):
-    if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
-    (base_offset, length) = self.tables['cmap']
-    fp = self.fp
-    fp.seek(base_offset)
-    (version, nsubtables) = unpack('>HH', fp.read(4))
-    subtables = []
-    for i in xrange(nsubtables):
-      subtables.append(unpack('>HHL', fp.read(8)))
-    char2gid = {}
-    # Only supports subtable type 0, 2 and 4.
-    for (_1, _2, st_offset) in subtables:
-      fp.seek(base_offset+st_offset)
-      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
-      if fmttype == 0:
-        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
-      elif fmttype == 2:
-        subheaderkeys = unpack('>256H', fp.read(512))
-        firstbytes = [0]*8192
-        for (i,k) in enumerate(subheaderkeys):
-          firstbytes[k/8] = i
-        nhdrs = max(subheaderkeys)/8 + 1
-        hdrs = []
-        for i in xrange(nhdrs):
-          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
-          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
-        for (i,firstcode,entcount,delta,pos) in hdrs:
-          if not entcount: continue
-          first = firstcode + (firstbytes[i] << 8)
-          fp.seek(pos)
-          for c in xrange(entcount):
-            gid = unpack('>H', fp.read(2))
-            if gid:
-              gid += delta
-            char2gid[first+c] = gid
-      elif fmttype == 4:
-        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
-        segcount /= 2
-        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        fp.read(2)
-        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
-        pos = fp.tell()
-        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
-          if idr:
-            fp.seek(pos+idr)
-            for c in xrange(sc, ec+1):
-              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
-          else:
-            for c in xrange(sc, ec+1):
-              char2gid[c] = (c + idd) & 0xffff
-    gid2char = dict( (gid, pack('>H', char))
-                     for (char,gid) in char2gid.iteritems() )
-    cmapname = 'Adobe-Identity-UCS-%s' % self.name
-    return CMap(cmapname).update(char2gid, gid2char)
-
-class PDFCIDFont(PDFFont):
-  
-  def __init__(self, fontid, spec):
-    if 'BaseFont' not in spec:
-      raise PDFFontError('BaseFont is missing')
-    try:
-      self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
-      self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
-                                  self.cidsysteminfo['Ordering'])
-    except KeyError:
-      raise PDFFontError('CIDSystemInfo not properly defined.')
-    self.basefont = literal_name(spec['BaseFont'])
-    self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
-    descriptor = dict_value(spec['FontDescriptor'])
-    ttf = None
-    if 'FontFile2' in descriptor:
-      self.fontfile = stream_value(descriptor.get('FontFile2'))
-      ttf = TrueTypeFont(self.basefont,
-                         StringIO(self.fontfile.get_data()))
-    self.ucs2_cmap = None
-    if 'ToUnicode' in spec:
-      strm = stream_value(spec['ToUnicode'])
-      self.ucs2_cmap = CMap()
-      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
-    elif self.cidcoding == 'Adobe-Identity':
-      if ttf:
-        try:
-          self.ucs2_cmap = ttf.create_cmap()
-        except TrueTypeFont.CMapNotFound:
-          pass
-    else:
-      self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)
-    def get_width(seq):
-      dic = {}
-      char1 = char2 = None
-      for v in seq:
-        if char1 == None:
-          char1 = v
-        elif char2 == None and isinstance(v, int):
-          char2 = v
-        else:
-          if char2 == None:
-            for (i,w) in enumerate(v):
-              dic[char1+i] = w
-          else:
-            for i in xrange(char1, char2+1):
-              dic[i] = v
-          char1 = char2 = None
-      return dic
-    self.vertical = self.cmap.is_vertical()
-    if self.vertical:
-      # writing mode: vertical
-      dic = get_width(list_value(spec.get('W2', [])))
-      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
-      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
-      (d,w) = spec.get('DW2', [880, -1000])
-      default_width = w
-      self.default_disp = d
-    else:
-      # writing mode: horizontal
-      widths = get_width(list_value(spec.get('W', [])))
-      self.disps = {}
-      default_width = spec.get('DW', 1000)
-      self.default_disp = 0
-    PDFFont.__init__(self, fontid, descriptor, widths, default_width)
-    return
-
-  def is_vertical(self):
-    return self.vertical
-  
-  def decode(self, bytes):
-    return self.cmap.decode(bytes)
-
-  def char_disp(self, cid):
-    return self.disps.get(cid, self.default_disp)
-
-  def to_unicode(self, cid):
-    if not self.ucs2_cmap:
-      raise PDFUnicodeNotDefined(self.cidcoding, cid)
-    code = self.ucs2_cmap.tocode(cid)
-    if not code:
-      raise PDFUnicodeNotDefined(self.cidcoding, cid)
-    chars = unpack('>%dH' % (len(code)/2), code)
-    return ''.join( unichr(c) for c in chars )
-
-
-##  Resource Manager
-##
-class PDFResourceManager:
-
-  '''
-  ResourceManager facilitates reuse of shared resources
-  such as fonts, images and cmaps so that large objects are not
-  allocated multiple times.
-  '''
-  
-  def __init__(self, debug=0):
-    self.debug = debug
-    self.fonts = {}
-    return
-
-  def get_procset(self, procs):
-    for proc in procs:
-      if proc == LITERAL_PDF:
-        pass
-      elif proc == LITERAL_TEXT:
-        pass
-      else:
-        #raise PDFResourceError('ProcSet %r is not supported.' % proc)
-        pass
-    return
-  
-  def get_cmap(self, name):
-    return CMapDB.get_cmap(name)
-
-  def get_font(self, fontid, spec):
-    if fontid in self.fonts:
-      font = self.fonts[fontid]
-    else:
-      spec = dict_value(spec)
-      assert spec['Type'] == LITERAL_FONT
-      # Create a Font object.
-      if 'Subtype' not in spec:
-        raise PDFFontError('Font Subtype is not specified.')
-      subtype = literal_name(spec['Subtype'])
-      if subtype in ('Type1', 'MMType1'):
-        # Type1 Font
-        font = PDFType1Font(fontid, spec)
-      elif subtype == 'TrueType':
-        # TrueType Font
-        font = PDFTrueTypeFont(fontid, spec)
-      elif subtype == 'Type3':
-        # Type3 Font
-        font = PDFType3Font(fontid, spec)
-      elif subtype in ('CIDFontType0', 'CIDFontType2'):
-        # CID Font
-        font = PDFCIDFont(fontid, spec)
-      elif subtype == 'Type0':
-        # Type0 Font
-        dfonts = list_value(spec['DescendantFonts'])
-        assert len(dfonts) == 1
-        subspec = dict_value(dfonts[0]).copy()
-        for k in ('Encoding', 'ToUnicode'):
-          if k in spec:
-            subspec[k] = resolve1(spec[k])
-        font = self.get_font(fontid, subspec)
-      else:
-        raise PDFFontError('Invalid Font: %r' % spec)
-      self.fonts[fontid] = font
-    return font
-
-
-##  Interpreter
-##
-class PDFPageInterpreter:
-  
-  class TextState:
-    def __init__(self):
-      self.font = None
-      self.fontsize = 0
-      self.charspace = 0
-      self.wordspace = 0
-      self.scaling = 100
-      self.leading = 0
-      self.render = 0
-      self.rise = 0
-      self.reset()
-      return
-    def __repr__(self):
-      return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
-              ' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
-              ' render=%r, rise=%r>' %
-              (self.font, self.fontsize, self.matrix,
-               self.charspace, self.wordspace, self.scaling, self.leading,
-               self.render, self.rise))
-    def reset(self):
-      self.matrix = (1, 0, 0, 1, 0, 0)
-      self.linematrix = (0, 0)
-      return
-
-  def __init__(self, rsrc, device, debug=0):
-    self.rsrc = rsrc
-    self.device = device
-    self.debug = debug
-    return
-
-  def initpage(self, ctm):
-    self.fontmap = {}
-    self.xobjmap = {}
-    self.csmap = {}
-    # gstack: stack for graphical states.
-    self.gstack = []
-    self.ctm = ctm
-    self.device.set_ctm(self.ctm)
-    self.textstate = PDFPageInterpreter.TextState()
-    # argstack: stack for command arguments.
-    self.argstack = []
-    # set some global states.
-    self.scs = None
-    self.ncs = None
-    return
-
-  def push(self, obj):
-    self.argstack.append(obj)
-    return
-
-  def pop(self, n):
-    x = self.argstack[-n:]
-    self.argstack = self.argstack[:-n]
-    return x
-
-  def get_current_state(self):
-    return (self.ctm, self.textstate)
-  
-  def set_current_state(self, state):
-    (self.ctm, self.textstate) = state
-    self.device.set_ctm(self.ctm)
-    return
-
-  # gsave
-  def do_q(self):
-    self.gstack.append(self.get_current_state())
-    return
-  # grestore
-  def do_Q(self):
-    if self.gstack:
-      self.set_current_state(self.gstack.pop())
-    return
-  
-  # concat-matrix
-  def do_cm(self, a1, b1, c1, d1, e1, f1):
-    self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
-    self.device.set_ctm(self.ctm)
-    return
-  
-  # setlinewidth
-  def do_w(self, width): return
-  # setlinecap
-  def do_J(self, cap): return
-  # setlinejoin
-  def do_j(self, join): return
-  # setmiterlimit
-  def do_M(self, limit): return
-  # setdash
-  def do_d(self, dash, phase): return
-  # setintent
-  def do_ri(self, intent): return
-  # setflatness
-  def do_i(self, flatness): return
-  # savedict
-  def do_gs(self, name): return
-  
-  # moveto
-  def do_m(self, x, y): return
-  # lineto
-  def do_l(self, x, y): return
-  # curveto
-  def do_c(self, x1, y1, x2, y2, x3, y3): return
-  # urveto
-  def do_v(self, x2, y2, x3, y3): return
-  # rveto
-  def do_y(self, x1, y1, x3, y3): return
-  # closepath
-  def do_h(self): return
-  # rectangle
-  def do_re(self, x, y, w, h): return
-  
-  # stroke
-  def do_S(self): return
-  # close-and-stroke
-  def do_s(self): return
-  # fill
-  def do_f(self): return
-  # fill (obsolete)
-  do_F = do_f
-  # fill-even-odd
-  def do_f_a(self): return
-  # fill-and-stroke
-  def do_B(self): return
-  # fill-and-stroke-even-odd
-  def do_B_a(self): return
-  # close-fill-and-stroke
-  def do_b(self): return
-  # close-fill-and-stroke-even-odd
-  def do_b_a(self): return
-  # close-only
-  def do_n(self): return
-  # clip
-  def do_W(self): return
-  # clip-even-odd
-  def do_W_a(self): return
-  
-  # setcolorspace-stroking
-  def do_CS(self, name):
-    self.scs = self.csmap.get(literal_name(name), None)
-    return
-  # setcolorspace-non-strokine
-  def do_cs(self, name):
-    self.ncs = self.csmap.get(literal_name(name), None)
-    return
-  # setgray-stroking
-  def do_G(self, gray):
-    self.do_CS(LITERAL_DEVICE_GRAY)
-    return
-  # setgray-non-stroking
-  def do_g(self, gray):
-    self.do_cs(LITERAL_DEVICE_GRAY)
-    return
-  # setrgb-stroking
-  def do_RG(self, r, g, b):
-    self.do_CS(LITERAL_DEVICE_RGB)
-    return
-  # setrgb-non-stroking
-  def do_rg(self, r, g, b):
-    self.do_cs(LITERAL_DEVICE_RGB)
-    return
-  # setcmyk-stroking
-  def do_K(self, c, m, y, k):
-    self.do_CS(LITERAL_DEVICE_CMYK)
-    return
-  # setcmyk-non-stroking
-  def do_k(self, c, m, y, k):
-    self.do_cs(LITERAL_DEVICE_CMYK)
-    return
-
-  # setcolor
-  def do_SCN(self):
-    n = cs_params(self.scs)
-    self.pop(n)
-    return
-  def do_scn(self):
-    n = cs_params(self.ncs)
-    self.pop(n)
-    return
-  def do_SC(self):
-    self.do_SCN()
-    return
-  def do_sc(self):
-    self.do_scn()
-    return
-    
-  # sharing-name
-  def do_sh(self, name): return
-  
-  # begin-text
-  def do_BT(self):
-    self.textstate.reset()
-    return
-  # end-text
-  def do_ET(self):
-    return
-
-  # begin-compat
-  def do_BX(self): return
-  # end-compat
-  def do_EX(self): return
-
-  # marked content operators
-  def do_MP(self, tag): return
-  def do_DP(self, tag, props): return
-  def do_BMC(self, tag): return
-  def do_BDC(self, tag, props): return
-  def do_EMC(self): return
-
-  # setcharspace
-  def do_Tc(self, space):
-    self.textstate.charspace = space
-    return
-  # setwordspace
-  def do_Tw(self, space):
-    self.textstate.wordspace = space
-    return
-  # textscale
-  def do_Tz(self, scale):
-    self.textstate.scaling = scale
-    return
-  # setleading
-  def do_TL(self, leading):
-    self.textstate.leading = leading
-    return
-  # selectfont
-  def do_Tf(self, fontid, fontsize):
-    try:
-      self.textstate.font = self.fontmap[literal_name(fontid)]
-    except KeyError:
-      raise PDFInterpreterError('Undefined font id: %r' % fontid)
-    self.textstate.fontsize = fontsize
-    return
-  # setrendering
-  def do_Tr(self, render):
-    self.textstate.render = render
-    return
-  # settextrise
-  def do_Ts(self, rise):
-    self.textstate.rise = rise
-    return
-
-  # text-move
-  def do_Td(self, tx, ty):
-    (a,b,c,d,e,f) = self.textstate.matrix
-    self.textstate.matrix = (a,b,c,d,e+tx,f+ty)
-    self.textstate.linematrix = (0, 0)
-    return
-  # text-move
-  def do_TD(self, tx, ty):
-    (a,b,c,d,e,f) = self.textstate.matrix
-    self.textstate.matrix = (a,b,c,d,e+tx,f+ty)
-    self.textstate.leading = -ty
-    self.textstate.linematrix = (0, 0)
-    return
-  # textmatrix
-  def do_Tm(self, a,b,c,d,e,f):
-    self.textstate.matrix = (a,b,c,d,e,f)
-    self.textstate.linematrix = (0, 0)
-    return
-  # nextline
-  def do_T_a(self):
-    (a,b,c,d,e,f) = self.textstate.matrix
-    self.textstate.matrix = (a,b,c,d,e,f+self.textstate.leading)
-    self.textstate.linematrix = (0, 0)
-    return
-  
-  # show-pos
-  def do_TJ(self, seq):
-    textstate = self.textstate
-    font = textstate.font
-    (a,b,c,d,e,f) = textstate.matrix
-    (lx,ly) = textstate.linematrix
-    s = ''.join( x for x in seq if isinstance(x, str) )
-    n = sum( x for x in seq if not isinstance(x, str) )
-    w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize +
-         len(s) * textstate.charspace +
-         s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0
-    self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq)
-    if font.is_vertical():
-      ly += w
-    else:
-      lx += w
-    textstate.linematrix = (lx,ly)
-    return
-  # show
-  def do_Tj(self, s):
-    self.do_TJ([s])
-    return
-  # quote
-  def do__q(self, s):
-    self.do_T_a()
-    self.do_TJ([s])
-    return
-  # doublequote
-  def do__w(self, aw, ac, s):
-    self.do_Tw(aw)
-    self.do_Tc(ac)
-    self.do_TJ([s])
-    return
-
-  # inline image
-  def do_BI(self): # never called
-    return
-  def do_ID(self): # never called
-    return
-  def do_EI(self, obj):
-    return
-
-  # invoke an XObject
-  def do_Do(self, xobjid):
-    xobjid = literal_name(xobjid)
-    try:
-      xobj = stream_value(self.xobjmap[xobjid])
-    except KeyError:
-      raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
-    if xobj.dic['Subtype'] == LITERAL_FORM:
-      if 1 <= self.debug:
-        print >>stderr, 'Processing xobj: %r' % xobj
-      interpreter = PDFPageInterpreter(self.rsrc, self.device)
-      interpreter.render_contents(xobjid, xobj.dic['Resources'], [xobj], xobj.dic['Matrix'])
-    return
-
-  def process_page(self, page):
-    if 1 <= self.debug:
-      print >>stderr, 'Processing page: %r' % page
-    self.render_contents('page-%d' % page.pageid, page.resources, page.contents)
-    return
-
-  def render_contents(self, contid, resources, contents, ctm=(1, 0, 0, 1, 0, 0)):
-    self.initpage(ctm)
-    self.device.begin_block(contid)
-    # Handle resource declarations.
-    for (k,v) in resources.iteritems():
-      if 1 <= self.debug:
-        print >>stderr, 'Resource: %r: %r' % (k,v)
-      if k == 'Font':
-        for (fontid,fontrsrc) in dict_value(v).iteritems():
-          self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
-      elif k == 'ColorSpace':
-        for (csid,csspec) in dict_value(v).iteritems():
-          self.csmap[csid] = list_value(csspec)
-      elif k == 'ProcSet':
-        self.rsrc.get_procset(list_value(v))
-      elif k == 'XObject':
-        for (xobjid,xobjstrm) in dict_value(v).iteritems():
-          self.xobjmap[xobjid] = xobjstrm
-    for stream in contents:
-      self.execute(stream_value(stream))
-    self.device.end_block()
-    return
-  
-  def execute(self, stream):
-    for obj in stream.parse_data(inline=True, debug=self.debug):
-      if isinstance(obj, PSKeyword):
-        name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
-        if hasattr(self, name):
-          func = getattr(self, name)
-          nargs = func.func_code.co_argcount-1
-          if nargs:
-            args = self.pop(nargs)
-            if 1 <= self.debug:
-              print >>stderr, 'exec: %s %r' % (obj.name, args)
-            if len(args) == nargs:
-              func(*args)
-          else:
-            if 1 <= self.debug:
-              print >>stderr, 'exec: %s' % (obj.name)
-            func()
-        else:
-          raise PDFInterpreterError('unknown operator: %r' % obj.name)
-      else:
-        self.push(obj)
-    return
-
-
-##  PDFDevice
-##
-class PDFDevice:
-  
-  def __init__(self, rsrc):
-    self.rsrc = rsrc
-    self.ctm = None
-    return
-  
-  def __repr__(self):
-    return '<PDFDevice>'
-
-  def set_ctm(self, ctm):
-    self.ctm = ctm
-    return
-
-  def begin_block(self, name):
-    return
-  def end_block(self):
-    return
-  
-  def render_string(self, textstate, textmatrix, size, seq):
-    raise NotImplementedError
-
-
-##  TextConverter
-##
-class TextConverter(PDFDevice):
-
-  def __init__(self, rsrc, codec, outfp=sys.stdout):
-    PDFDevice.__init__(self, rsrc)
-    self.outfp = outfp
-    self.codec = codec
-    return
-  
-  def begin_block(self, name):
-    self.outfp.write('<block name="%s">\n' % name)
-    return
-  def end_block(self):
-    self.outfp.write('</block>\n')
-    return
-
-  def render_string(self, textstate, textmatrix, size, seq):
-    font = textstate.font
-    spwidth = int(-font.char_width(32) * 0.6) # space width
-    buf = ''
-    for x in seq:
-      if isinstance(x, int) or isinstance(x, float):
-        if not font.is_vertical() and x <= spwidth:
-          buf += ' '
-      else:
-        chars = font.decode(x)
-        for cid in chars:
-          try:
-            char = font.to_unicode(cid)
-          except PDFUnicodeNotDefined, e:
-            (cidcoding, cid) = e.args
-            char = u'[%s:%d]' % (cidcoding, cid)
-          buf += char
-    (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
-    skewed = (b != 0 or c != 0)
-    if font.is_vertical():
-      size = -size
-      tag = 'vtext'
-    else:
-      tag = 'htext'
-    if skewed:
-      tag += ' skewed'
-    s = buf.encode(self.codec, 'xmlcharrefreplace')
-    (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
-    def f(x): return '%.03f' % x
-    self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
-                     (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
-    return
-
-
-# main
-def main(argv):
-  import getopt
-  def usage():
-    print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0]
-    return 100
-  try:
-    (opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
-  except getopt.GetoptError:
-    return usage()
-  if not args: return usage()
-  (debug, verbose) = (0, 0)
-  cmapdir = 'CMap'
-  cdbcmapdir = 'CDBCMap'
-  codec = 'ascii'
-  pages = set()
-  for (k, v) in opts:
-    if k == '-d': debug += 1
-    elif k == '-v': verbose += 1
-    elif k == '-p': pages.add(int(v))
-    elif k == '-c': codec = v
-  #
-  CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
-  rsrc = PDFResourceManager(debug=debug)
-  device = TextConverter(rsrc, codec)
-  for fname in args:
-    doc = PDFDocument(debug=debug)
-    fp = file(fname)
-    parser = PDFParser(doc, fp, debug=debug)
-    interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
-    for (i,page) in enumerate(doc.get_pages(debug=debug)):
-      if pages and (i not in pages): continue
-      interpreter.process_page(page)
-    fp.close()
-  return
-
-if __name__ == '__main__': sys.exit(main(sys.argv))
diff --git a/psparser.py b/psparser.py
new file mode 100644
index 0000000..5a72d46
--- /dev/null
+++ b/psparser.py
@@ -0,0 +1,396 @@
+#!/usr/bin/env python
+import sys, re
+stderr = sys.stderr
+from utils import choplist
+
+
+##  PS Exceptions
+##
+class PSException(Exception): pass
+class PSSyntaxError(PSException): pass
+class PSTypeError(PSException): pass
+class PSValueError(PSException): pass
+
+
+##  PostScript Types
+##
+class PSLiteral:
+  '''
+  A PostScript literal name (e.g. "/Name"); repr() shows the leading slash.
+  Caution: Never create these objects directly.
+  Use PSLiteralTable.intern() instead, which keeps one object per name.
+  '''
+  def __init__(self, name):
+    self.name = name
+    return
+  def __repr__(self):
+    return '/%s' % self.name
+
+class PSKeyword:
+  '''
+  A PostScript keyword (e.g. "showpage"); repr() is the bare name.
+  Caution: Never create these objects directly.
+  Use PSKeywordTable.intern() instead, which keeps one object per name.
+  '''
+  def __init__(self, name):
+    self.name = name
+    return
+  def __repr__(self):
+    return self.name
+
+class PSSymbolTable:
+  '''
+  Interning table that hands out one shared PSLiteral or PSKeyword per name.
+  '''
+  def __init__(self, classe):
+    self.classe = classe
+    self.dic = {}
+    return
+  
+  def intern(self, name):
+    try:
+      lit = self.dic[name]
+    except KeyError:
+      lit = self.classe(name)
+      self.dic[name] = lit
+    return lit
+
+PSLiteralTable = PSSymbolTable(PSLiteral)
+PSKeywordTable = PSSymbolTable(PSKeyword)
+
+
+def literal_name(x):
+  '''Returns the name of the PSLiteral x; raises PSTypeError for anything else.'''
+  if isinstance(x, PSLiteral): return x.name
+  raise PSTypeError('literal required: %r' % (x,))  # (x,) keeps a tuple x from breaking the format
+
+def keyword_name(x):
+  '''Returns the name of the PSKeyword x; raises PSTypeError for anything else.'''
+  if isinstance(x, PSKeyword): return x.name
+  raise PSTypeError('keyword required: %r' % (x,))  # (x,) keeps a tuple x from breaking the format
+
+
+##  PSBaseParser
+##
+class PSBaseParser:
+
+  '''PostScript parser that performs only basic tokenization.'''
+
+  def __init__(self, fp, debug=0):
+    self.fp = fp
+    self.debug = debug  # 2 or more prints tokenizer traces to stderr
+    self.bufsize = 4096  # chunk size passed to fp.read()
+    self.seek(0)  # positions fp at offset 0 and initializes the line state
+    return
+
+  def __repr__(self):
+    return '<PSBaseParser: %r>' % (self.fp,)
+
+  def seek(self, pos):
+    '''
+    seeks the underlying file to pos and resets the line-reading state.
+    '''
+    if 2 <= self.debug:
+      print >>stderr, 'seek:', pos
+    self.fp.seek(pos)
+    self.linepos = pos  # offset of the next line nextline() will return
+    self.linebuf = None  # forces nextline() to read a fresh chunk
+    self.curpos = 0
+    self.line = ''
+    return
+  
+  EOLCHAR = re.compile(r'[\r\n]')
+  def nextline(self):
+    '''
+    fetches the next line with its \\r, \\n or \\r\\n; an empty string at EOF.
+    '''
+    line = ''
+    eol = None
+    while 1:
+      if not self.linebuf or len(self.linebuf) <= self.curpos:
+        # buffer missing or used up: fetch next chunk.
+        self.linebuf = self.fp.read(self.bufsize)
+        if not self.linebuf:
+          # at EOF: return whatever was collected so far.
+          break
+        self.curpos = 0
+      if eol:  # an EOL was already seen; this pass only peeks at the next char
+        c = self.linebuf[self.curpos]
+        # a '\r' immediately followed by '\n' is a single line ending.
+        if (eol == '\r' and c == '\n'):
+          line += c
+          self.curpos += 1
+        break
+      m = self.EOLCHAR.search(self.linebuf, self.curpos)
+      if m:
+        i = m.end(0)
+        line += self.linebuf[self.curpos:i]
+        eol = self.linebuf[i-1]
+        self.curpos = i
+      else:
+        # no EOL in the rest of this chunk: keep it and read the next one.
+        line += self.linebuf[self.curpos:]
+        self.linebuf = None
+    self.linepos += len(line)
+    return line
+
+  def revreadlines(self):
+    '''Yields the lines of the file from last to first. used to locate trailers.
+    Each line starts with the EOL char before it; text before the first EOL is not yielded.'''
+    self.fp.seek(0, 2)
+    pos = self.fp.tell()
+    buf = ''
+    while 0 < pos:
+      prevpos = pos  # end of the span not yet read; never read past it again
+      pos = max(0, pos-self.bufsize)
+      self.fp.seek(pos)
+      s = self.fp.read(prevpos-pos)
+      if not s: break
+      while 1:
+        n = max(s.rfind('\r'), s.rfind('\n'))
+        if n == -1:
+          buf = s + buf
+          break
+        yield s[n:]+buf  # buf holds the tail of this line found in later chunks
+        s = s[:n]
+        buf = ''
+    return
+
+  SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
+  TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
+  LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
+  NUMBER = re.compile(r'[+-]?[0-9][.0-9]*$')
+  STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
+  STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
+  STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
+  STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
+
+  def parse(self):
+    '''
+    Yields (pos, token) pairs of basic tokens: keywords, literals,
+    strings, numbers, booleans and parentheses. Comments are skipped.
+    Nested objects (i.e. arrays and dictionaries) are not handled.
+    '''
+    while 1:
+      # do not strip line! we need to distinguish last '\n' or '\r'
+      linepos0 = self.linepos  # offset of the start of the line about to be read
+      self.line = self.nextline()
+      if not self.line: break
+      if 2 <= self.debug:
+        print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
+      # stop at the end-of-file marker; checked before comments are skipped
+      if self.line.startswith('%%EOF'): break
+      charpos = 0
+      
+      # tokenize the current line from charpos onwards
+      while 1:
+        m = self.TOKEN.search(self.line, charpos)
+        if not m: break
+        t = m.group(0)
+        pos = linepos0 + m.start(0)  # NOTE(review): looks stale after a string spanning lines - confirm
+        charpos = m.end(0)
+        
+        if t == '%':
+          # comment: ignore the rest of the line
+          if 2 <= self.debug:
+            print >>stderr, 'comment: %r' % self.line[charpos:]
+          break
+        
+        elif t == '/':
+          # literal object: the name is matched right after the slash
+          mn = self.LITERAL.match(self.line, m.start(0)+1)
+          lit = PSLiteralTable.intern(mn.group(0))
+          yield (pos, lit)
+          charpos = mn.end(0)
+          if 2 <= self.debug:
+            print >>stderr, 'name: %r' % lit
+            
+        elif t == '(':
+          # normal string object; may continue over several lines
+          s = ''
+          while 1:
+            ms = self.STRING_NORM.match(self.line, charpos)
+            if not ms: break
+            s1 = ms.group(0)
+            charpos = ms.end(0)
+            if len(s1) == 1 and s1[-1] == '\\':
+              s += s1[-1:]
+              self.line = self.nextline()
+              if not self.line:
+                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+                                    (self.linepos, self.line))
+              charpos = 0
+            elif charpos == len(self.line):
+              s += s1
+              self.line = self.nextline()
+              if not self.line:
+                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+                                    (self.linepos, self.line))
+              charpos = 0
+            else:
+              s += s1
+              break
+          if self.line[charpos] != ')':
+            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+                                (self.linepos, self.line))
+          charpos += 1
+          def convesc(m):
+            x = m.group(0)
+            if x[1:].isdigit():
+              return chr(int(x[1:], 8))
+            else:
+              return x[1]
+          s = self.STRING_NORM_SUB.sub(convesc, s)
+          if 2 <= self.debug:
+            print >>stderr, 'str: %r' % s
+          yield (pos, s)
+          
+        elif t == '<':
+          # hex string object: each run of 1-2 hex digits becomes one char
+          ms = self.STRING_HEX.match(self.line, charpos)
+          charpos = ms.end(0)
+          if self.line[charpos] != '>':
+            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+                                (self.linepos, self.line))
+          charpos += 1
+          def convhex(m1):
+            return chr(int(m1.group(0), 16))
+          s = self.STRING_HEX_SUB.sub(convhex, ms.group(0))
+          if 2 <= self.debug:
+            print >>stderr, 'str: %r' % s
+          yield (pos, s)
+
+        elif self.NUMBER.match(t):
+          # number: float if it contains a '.', int otherwise
+          if '.' in t:
+            n = float(t)
+          else:
+            n = int(t)
+          if 2 <= self.debug:
+            print >>stderr, 'number: %r' % n
+          yield (pos, n)
+
+        elif t in ('true','false'):
+          # boolean: yielded as a Python bool
+          if 2 <= self.debug:
+            print >>stderr, 'boolean: %r' % t
+          yield (pos, (t == 'true'))
+        
+        else:
+          # other token: interned as a keyword
+          if 2 <= self.debug:
+            print >>stderr, 'keyword: %r' % t
+          yield (pos, PSKeywordTable.intern(t))
+
+    return
+
+
+##  PSStackParser
+##
+class PSStackParser(PSBaseParser):
+
+  '''
+  PostScript parser that recognizes compound objects
+  such as arrays and dictionaries.
+  '''
+  
+  def __init__(self, fp, debug=0):
+    PSBaseParser.__init__(self, fp, debug=debug)
+    self.context = []  # stack of (type, partobj) saved while a nested object is open
+    self.partobj = None  # list being built; parse() replaces it with a list
+    return
+
+  def do_token(self, pos, token):
+    '''
+    Hook called by parse() for keywords it does not handle itself.
+    Return true to make parse() stop; this base version returns False.
+    '''
+    return False
+
+  def push(self, obj):
+    '''
+    Appends obj to self.partobj, the object list currently being built.
+    '''
+    self.partobj.append(obj)
+    return
+
+  def pop(self, n):
+    '''
+    Pops the last N objects off the stack and returns them as a list.
+    Raises PSSyntaxError if fewer than N objects are on the stack.
+    '''
+    k = len(self.partobj) - n  # explicit index: [-n:] would take everything when n == 0
+    if k < 0: raise PSSyntaxError('stack too short < %d' % n)
+    (r, self.partobj) = (self.partobj[k:], self.partobj[:k])
+    return r
+  
+  def popall(self):
+    '''
+    Discards all the objects on the stack by rebinding it to a new list.
+    '''
+    self.partobj = []
+    return
+
+  def parse(self):
+    '''
+    Returns a list of the objects read until the input ends or do_token()
+    returns true: literals, strings, numbers, '{' and '}' keywords,
+    arrays (as lists) and dictionaries (as dicts keyed by literal name).
+    '''
+    
+    def startobj(type):
+      self.context.append((type, self.partobj))  # save the enclosing list
+      self.partobj = []
+      return
+
+    def endobj(type1):
+      assert self.context
+      obj = self.partobj
+      (type0, self.partobj) = self.context.pop()  # restore the enclosing list
+      if type0 != type1:
+        raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
+                          (type0, self.partobj, type1, obj))
+      return obj
+
+    startobj('o')  # 'o' marks the outermost object list, closed by the final endobj
+
+    for (pos,t) in PSBaseParser.parse(self):
+      if isinstance(t, int) or isinstance(t, float):
+        self.push(t)
+      elif isinstance(t, str):
+        self.push(t)
+      elif isinstance(t, PSLiteral):
+        self.push(t)
+      else:
+        c = keyword_name(t)
+        if c == '{' or c == '}':
+          self.push(t)
+        elif c == '[':
+          # begin array
+          if 2 <= self.debug:
+            print >>stderr, 'start array'
+          startobj('a')
+        elif c == ']':
+          # end array
+          a = endobj('a')
+          if 2 <= self.debug:
+            print >>stderr, 'end array: %r' % a
+          self.push(a)
+        elif c == '<<':
+          # begin dictionary
+          if 2 <= self.debug:
+            print >>stderr, 'start dict'
+          startobj('d')
+        elif c == '>>':
+          # end dictionary: the collected objects are alternating keys and values
+          objs = endobj('d')
+          if len(objs) % 2 != 0:
+            raise PSTypeError('invalid dictionary construct: %r' % objs)
+          d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
+          if 2 <= self.debug:
+            print >>stderr, 'end dict: %r' % d
+          self.push(d)
+        elif self.do_token(pos, t):  # any other keyword is left to the do_token() hook
+          break
+
+    return endobj('o')
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..5ceb333
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+##  Utilities
+##
+def choplist(n, seq):
+  '''Yields successive n-tuples taken from seq; a short tail is dropped.'''
+  chunk = []
+  for item in seq:
+    chunk.append(item)
+    if len(chunk) != n:
+      continue
+    yield tuple(chunk)
+    chunk = []
+
+def nunpack(s, default=0):
+  '''Unpacks up to 4 bytes as a big-endian unsigned int; TypeError if longer.'''
+  from struct import unpack  # this module has no top-level import of struct
+  l = len(s)
+  if not l:
+    return default
+  elif l == 1:
+    return ord(s)
+  elif l == 2:
+    return unpack('>H', s)[0]
+  elif l == 3:
+    return unpack('>L', '\x00'+s)[0]
+  elif l == 4:
+    return unpack('>L', s)[0]
+  raise TypeError('invalid length: %d' % l)