split files.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@4 1aa58f4a-7d42-0410-adbc-911cccaed67c
2007-12-31 03:41:45 +00:00 · 2007-12-31 03:41:45 +00:00 · 6d93b4a7f7
parent 60d291d08b
commit 6d93b4a7f7
6 changed files with 1825 additions and 1755 deletions
--- a/cmap.py
+++ b/cmap.py
@ -0,0 +1,383 @@
+#!/usr/bin/env python
+import sys
+stderr = sys.stderr
+from struct import pack, unpack
+from utils import choplist, nunpack
+from psparser import PSException, PSSyntaxError, PSTypeError, \
+     PSLiteral, PSKeyword, literal_name, keyword_name, \
+     PSStackParser
+try:
+  import cdb
+except ImportError:
+  import pycdb as cdb
+
+
+##  CMap
+##
+class CMap:
+  
+  def __init__(self, debug=0):
+    self.debug = debug
+    self.code2cid = {}
+    self.cid2code = {}
+    self.attrs = {}
+    return
+
+  def __repr__(self):
+    return '<CMap: %s>' % self.attrs.get('CMapName')
+
+  def update(self, code2cid=None, cid2code=None):
+    if code2cid:
+      self.code2cid.update(code2cid)
+    if cid2code:
+      self.cid2code.update(cid2code)
+    return self
+    
+  def copycmap(self, cmap):
+    self.code2cid.update(cmap.getall_code2cid())
+    self.cid2code.update(cmap.getall_cid2code())
+    return self
+
+  def register_code2cid(self, code, cid):
+    assert isinstance(code, str)
+    assert isinstance(cid, int)
+    self.code2cid[code] = cid
+    return self
+
+  def register_cid2code(self, cid, code):
+    from glyphlist import charname2unicode
+    assert isinstance(cid, int)
+    if isinstance(code, PSLiteral):
+      code = pack('>H', charname2unicode[code.name])
+    self.cid2code[cid] = code
+    return self
+
+  def decode(self, bytes):
+    if self.debug:
+      print >>stderr, 'decode: %r, %r' % (self, bytes)
+    x = ''
+    for c in bytes:
+      if x:
+        if x+c in self.code2cid:
+          yield self.code2cid[x+c]
+        x = ''
+      elif c in self.code2cid:
+        yield self.code2cid[c]
+      else:
+        x = c
+    return
+  
+  def is_vertical(self):
+    return self.attrs.get('WMode', '0') == '1'
+
+  def tocid(self, code):
+    return self.code2cid.get(code)
+  def tocode(self, cid):
+    return self.cid2code.get(cid)
+
+  def getall_attrs(self):
+    return self.attrs.iteritems()
+  def getall_code2cid(self):
+    return self.code2cid.iteritems()
+  def getall_cid2code(self):
+    return self.cid2code.iteritems()
+
+  
+##  CDBCMap
+##
+class CDBCMap(CMap):
+  
+  def __init__(self, cdbname, debug=0):
+    CMap.__init__(self, debug=debug)
+    self.cdbname = cdbname
+    self.db = cdb.init(cdbname)
+    return
+
+  def __repr__(self):
+    return '<CDBCMap: %s (%r)>' % (self.db['/CMapName'], self.cdbname)
+
+  def tocid(self, code):
+    k = 'c'+code
+    if not self.db.has_key(k):
+      return None
+    return unpack('>L', self.db[k])
+  def tocode(self, cid):
+    k = 'i'+pack('>L', cid)
+    if not self.db.has_key(k):
+      return None
+    return self.db[k]
+  
+  def is_vertical(self):
+    return (self.db.has_key('/WMode') and
+            self.db['/WMode'] == '1')
+
+  def getall(self, c):
+    while 1:
+      x = self.db.each()
+      if not x: break
+      (k,v) = x
+      if k.startswith(c):
+        yield (k[1:], unpack('>L', v)[0])
+    return
+
+  def getall_attrs(self):
+    while 1:
+      x = self.db.each()
+      if not x: break
+      (k,v) = x
+      if k.startswith('/'):
+        yield (k[1:], eval(v)[0])
+    return
+  
+  def getall_cid2code(self):
+    return self.getall('i')
+  def getall_code2cid(self):
+    return self.getall('c')
+
+  def decode(self, bytes):
+    if self.debug:
+      print >>stderr, 'decode: %r, %r' % (self, bytes)
+    x = ''
+    for c in bytes:
+      if x:
+        if x+c in self.code2cid:
+          yield self.code2cid[x+c]
+        elif self.db.has_key('c'+x+c):
+          (dest,) = unpack('>L', self.db['c'+x+c])
+          self.code2cid[x+c] = dest
+          yield dest
+        x = ''
+      elif c in self.code2cid:
+        yield self.code2cid[c]
+      elif self.db.has_key('c'+c):
+        (dest,) = unpack('>L', self.db['c'+c])
+        self.code2cid[c] = dest
+        yield dest
+      else:
+        x = c
+    return
+
+
+##  CMapDB
+##
+class CMapDB:
+
+  CMAP_ALIAS = {
+    }
+  
+  debug = 0
+  dirname = None
+  cdbdirname = None
+  cmapdb = {}
+
+  @classmethod
+  def initialize(klass, dirname, cdbdirname=None, debug=0):
+    klass.dirname = dirname
+    klass.cdbdirname = cdbdirname or dirname
+    klass.debug = debug
+    return
+
+  @classmethod
+  def get_cmap(klass, cmapname):
+    import os.path
+    cmapname = klass.CMAP_ALIAS.get(cmapname, cmapname)
+    if cmapname in klass.cmapdb:
+      cmap = klass.cmapdb[cmapname]
+    else:
+      fname = os.path.join(klass.dirname, cmapname)
+      cdbname = os.path.join(klass.cdbdirname, cmapname+'.cmap.cdb')
+      if os.path.exists(cdbname):
+        if 1 <= klass.debug:
+          print >>stderr, 'Opening: CDBCMap %r...' % cdbname
+        cmap = CDBCMap(cdbname)
+      elif os.path.exists(fname):
+        if 1 <= klass.debug:
+          print >>stderr, 'Reading: CMap %r...' % fname
+        cmap = CMap()
+        fp = file(fname)
+        CMapParser(cmap, fp).parse()
+        fp.close()
+      klass.cmapdb[cmapname] = cmap
+    return cmap
+
+
+##  CMapParser
+##
+class CMapParser(PSStackParser):
+  '''Parser that fills a CMap object from a CMap file/stream.
+
+  do_token() is called with keyword tokens; it reacts to the CMap
+  operators by name and registers the mappings on self.cmap.
+  self.partobj, pop() and popall() come from PSStackParser
+  (presumably the operands collected so far -- see psparser).
+  '''
+
+  def __init__(self, cmap, fp, debug=0):
+    PSStackParser.__init__(self, fp, debug=debug)
+    self.cmap = cmap
+    # True only between "begincmap" and "endcmap".
+    self.in_cmap = False
+    return
+
+  def do_token(self, _, token):
+    '''Handles one keyword token; operators outside
+    begincmap/endcmap are ignored.'''
+    name = token.name
+    if name == 'begincmap':
+      self.in_cmap = True
+      self.popall()
+      return
+    elif name == 'endcmap':
+      self.in_cmap = False
+      return
+    if not self.in_cmap: return
+    #
+    # "key value def": record the pair as a CMap attribute.
+    if name == 'def':
+      try:
+        (k,v) = self.pop(2)
+        self.cmap.attrs[literal_name(k)] = v
+      except PSSyntaxError:
+        pass
+      return
+    
+    # "name usecmap": copy all mappings of another CMap into this one.
+    if name == 'usecmap':
+      try:
+        (cmapname,) = self.pop(1)
+        self.cmap.copycmap(CMapDB.get_cmap(literal_name(cmapname)))
+      except PSSyntaxError:
+        pass
+      return
+      
+    # Code space ranges are only printed in debug mode, never stored.
+    if name == 'begincodespacerange':
+      self.popall()
+      return
+    if name == 'endcodespacerange':
+      if 1 <= self.debug:
+        print >>stderr, 'codespace: %r' % self.partobj
+      self.popall()
+      return
+    
+    if name == 'begincidrange':
+      self.popall()
+      return
+    if name == 'endcidrange':
+      # Operands are (start code, end code, first cid) triples.
+      for (s,e,cid) in choplist(3, self.partobj):
+        assert isinstance(s, str)
+        assert isinstance(e, str)
+        assert isinstance(cid, int)
+        assert len(s) == len(e)
+        # Only the last (up to) 4 bytes may differ between start and end;
+        # everything before them is a fixed prefix.
+        sprefix = s[:-4]
+        eprefix = e[:-4]
+        assert sprefix == eprefix
+        svar = s[-4:]
+        evar = e[-4:]
+        s1 = nunpack(svar)
+        e1 = nunpack(evar)
+        vlen = len(svar)
+        assert s1 <= e1
+        for i in xrange(e1-s1+1):
+          # Rebuild the code: prefix + counter packed to vlen bytes.
+          x = sprefix+pack('>L',s1+i)[-vlen:]
+          self.cmap.register_code2cid(x, cid+i)
+      self.popall()
+      return
+    
+    if name == 'begincidchar':
+      self.popall()
+      return
+    if name == 'endcidchar':
+      # NOTE(review): both operands are required to be strings here and
+      # the first one is converted with nunpack() and used as the CID --
+      # confirm the operand order against the CMap format.
+      for (cid,code) in choplist(2, self.partobj):
+        assert isinstance(code, str)
+        assert isinstance(cid, str)
+        self.cmap.register_code2cid(code, nunpack(cid))
+      self.popall()
+      return
+        
+    if name == 'beginbfrange':
+      self.popall()
+      return
+    if name == 'endbfrange':
+      # Operands are (start, end, destination) triples; the destination
+      # is either a list with one entry per source value or a single
+      # string that is incremented for each source value.
+      for (s,e,code) in choplist(3, self.partobj):
+        assert isinstance(s, str)
+        assert isinstance(e, str)
+        assert len(s) == len(e)
+        s1 = nunpack(s)
+        e1 = nunpack(e)
+        assert s1 <= e1
+        if isinstance(code, list):
+          for i in xrange(e1-s1+1):
+            self.cmap.register_cid2code(s1+i, code[i])
+        else:
+          # Increment the last (up to) 4 bytes, keep the rest as prefix.
+          var = code[-4:]
+          base = nunpack(var)
+          prefix = code[:-4]
+          vlen = len(var)
+          for i in xrange(e1-s1+1):
+            x = prefix+pack('>L',base+i)[-vlen:]
+            self.cmap.register_cid2code(s1+i, x)
+      self.popall()
+      return
+        
+    if name == 'beginbfchar':
+      self.popall()
+      return
+    if name == 'endbfchar':
+      # Operands are (source, destination code) pairs of strings.
+      for (cid,code) in choplist(2, self.partobj):
+        assert isinstance(cid, str)
+        assert isinstance(code, str)
+        self.cmap.register_cid2code(nunpack(cid), code)
+      self.popall()
+      return
+        
+    # notdef ranges are only printed in debug mode, never stored.
+    if name == 'beginnotdefrange':
+      self.popall()
+      return
+    if name == 'endnotdefrange':
+      if 1 <= self.debug:
+        print >>stderr, 'notdefrange: %r' % self.partobj
+      self.popall()
+      return
+    
+    # Any other keyword is ignored.
+    return
+
+
+##  FontMetricsDB
+##
+class FontMetricsDB:
+  from fontmetrics import FONT_METRICS
+  
+  @classmethod
+  def get_metrics(klass, fontname):
+    return klass.FONT_METRICS[fontname]
+
+
+##  EncodingDB
+##
+class EncodingDB:
+  '''Character code -> unicode character tables for the named
+  single-byte encodings, built once when the class body is executed.'''
+      
+  from glyphlist import charname2unicode
+  from latin_enc import ENCODING
+  
+  std2unicode = {}
+  mac2unicode = {}
+  win2unicode = {}
+  pdf2unicode = {}
+  # Each ENCODING row is (glyph name, std code, mac code, win code,
+  # pdf code); a false code means the glyph is absent from that encoding.
+  # Note: the loop variables stay behind as class attributes.
+  for (name,std,mac,win,pdf) in ENCODING:
+    c = unichr(charname2unicode[name])
+    if std: std2unicode[std] = c
+    if mac: mac2unicode[mac] = c
+    if win: win2unicode[win] = c
+    if pdf: pdf2unicode[pdf] = c
+  
+  # Encoding name -> table.
+  encodings = {
+    'StandardEncoding': std2unicode,
+    'MacRomanEncoding': mac2unicode,
+    'WinAnsiEncoding': win2unicode,
+    'PDFDocEncoding': pdf2unicode,
+    }
+  
+  @classmethod
+  def get_encoding(klass, name, diff=None):
+    '''Returns the code -> unicode table for the encoding called name.
+
+    Unknown names fall back to the standard encoding table.  If diff
+    is given, a copy of the table is returned with the differences
+    applied: an int sets the current code, and each PSLiteral glyph
+    name is assigned to the current code, which is then incremented.
+    Glyph names missing from charname2unicode are skipped (the code
+    is still incremented).
+    '''
+    cid2unicode = klass.encodings.get(name, klass.std2unicode)
+    if diff:
+      # Copy so that the shared table is never modified.
+      cid2unicode = cid2unicode.copy()
+      cid = 0
+      for x in diff:
+        if isinstance(x, int):
+          cid = x
+        elif isinstance(x, PSLiteral):
+          try:
+            cid2unicode[cid] = unichr(EncodingDB.charname2unicode[x.name])
+          except KeyError:
+            pass
+          cid += 1
+    return cid2unicode
--- a/pdf2txt.py
+++ b/pdf2txt.py
@ -0,0 +1,111 @@
+#!/usr/bin/env python
+import sys
+stdout = sys.stdout
+stderr = sys.stderr
+from pdfparser import PDFDocument, PDFParser
+from pdfinterp import PDFDevice, PDFResourceManager, \
+     PDFPageInterpreter, PDFUnicodeNotDefined, \
+     mult_matrix, apply_matrix
+from cmap import CMapDB
+
+
+##  TextConverter
+##
+class TextConverter(PDFDevice):
+
+  def __init__(self, outfp, rsrc, codec):
+    PDFDevice.__init__(self, rsrc)
+    self.outfp = outfp
+    self.codec = codec
+    return
+
+  def close(self):
+    self.outfp.write('\n')
+    return
+  
+  def begin_block(self, name):
+    self.outfp.write('<block name="%s">\n' % name)
+    return
+  def end_block(self):
+    self.outfp.write('</block>\n')
+    return
+
+  def render_string(self, textstate, textmatrix, size, seq):
+    font = textstate.font
+    spwidth = int(-font.char_width(32) * 0.6) # space width
+    buf = ''
+    for x in seq:
+      if isinstance(x, int) or isinstance(x, float):
+        if not font.is_vertical() and x <= spwidth:
+          buf += ' '
+      else:
+        chars = font.decode(x)
+        for cid in chars:
+          try:
+            char = font.to_unicode(cid)
+          except PDFUnicodeNotDefined, e:
+            (cidcoding, cid) = e.args
+            char = u'[%s:%d]' % (cidcoding, cid)
+          buf += char
+    (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
+    skewed = (b != 0 or c != 0)
+    if font.is_vertical():
+      size = -size
+      tag = 'vtext'
+    else:
+      tag = 'htext'
+    if skewed:
+      tag += ' skewed'
+    s = buf.encode(self.codec, 'xmlcharrefreplace')
+    (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
+    def f(x): return '%.03f' % x
+    self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
+                     (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
+    return
+
+
+# pdf2txt
+def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
+  device = TextConverter(outfp, rsrc, codec)
+  doc = PDFDocument(debug=debug)
+  fp = file(fname)
+  parser = PDFParser(doc, fp, debug=debug)
+  interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
+  for (i,page) in enumerate(doc.get_pages(debug=debug)):
+    if pages and (i not in pages): continue
+    interpreter.process_page(page)
+  fp.close()
+  device.close()
+  return
+
+
+# main
+def main(argv):
+  import getopt
+  def usage():
+    print 'usage: %s [-d] [-v] [-c codec] [-p pages] file ...' % argv[0]
+    return 100
+  try:
+    (opts, args) = getopt.getopt(argv[1:], 'dvp:c:')
+  except getopt.GetoptError:
+    return usage()
+  if not args: return usage()
+  debug = 0
+  cmapdir = 'CMap'
+  cdbcmapdir = 'CDBCMap'
+  codec = 'ascii'
+  pages = set()
+  outfp = stdout
+  for (k, v) in opts:
+    if k == '-d': debug += 1
+    elif k == '-p': pages.add(int(v))
+    elif k == '-o': outfp = file(v, 'wb')
+    elif k == '-c': codec = v
+  #
+  CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
+  rsrc = PDFResourceManager(debug=debug)
+  for fname in args:
+    pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug)
+  return
+
+if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdfinterp.py
+++ b/pdfinterp.py
@ -0,0 +1,827 @@
+#!/usr/bin/env python
+import sys
+stderr = sys.stderr
+from struct import pack, unpack
+try:
+  from cStringIO import StringIO
+except ImportError:
+  from StringIO import StringIO
+from psparser import PSException, PSSyntaxError, PSTypeError, \
+     PSStackParser, PSLiteral, PSKeyword, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
+from pdfparser import resolve1, int_value, float_value, num_value, \
+     str_value, list_value, dict_value, stream_value, PDFException
+from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+
+
+##  Exceptions
+##
+# Resource-related error (only referenced from commented-out code
+# in the visible part of this module).
+class PDFResourceError(PDFException): pass
+# Raised by the page interpreter, e.g. for an undefined font id.
+class PDFInterpreterError(PDFException): pass
+# Raised when a font dictionary is incomplete or of an unknown kind.
+class PDFFontError(PDFException): pass
+# Raised by to_unicode() with args (cidcoding, cid) when a CID has
+# no unicode mapping.
+class PDFUnicodeNotDefined(PDFFontError): pass
+
+
+##  Constants
+##
+# Interned literal objects that this module compares names against.
+LITERAL_PDF = PSLiteralTable.intern('PDF')
+LITERAL_TEXT = PSLiteralTable.intern('Text')
+LITERAL_FONT = PSLiteralTable.intern('Font')
+LITERAL_FORM = PSLiteralTable.intern('Form')
+LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
+LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
+LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
+LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
+LITERAL_ICC_BASED = PSLiteralTable.intern('ICCBased')
+LITERAL_DEVICE_N = PSLiteralTable.intern('DeviceN')
+# Matrices are 6-tuples (a, b, c, d, e, f); this one is the identity.
+MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
+# Color space literal -> value returned by cs_params(); the setcolor
+# operators (do_SCN/do_scn) pop that many operands.
+CS_COMPONENTS = {
+  PSLiteralTable.intern('CalRGB'): 3,
+  PSLiteralTable.intern('CalGray'): 1,
+  PSLiteralTable.intern('Lab'): 3,
+  PSLiteralTable.intern('DeviceRGB'): 3,
+  PSLiteralTable.intern('DeviceCMYK'): 4,
+  PSLiteralTable.intern('DeviceGray'): 1,
+  PSLiteralTable.intern('Separation'): 1,
+  PSLiteralTable.intern('Indexed'): 1,
+  PSLiteralTable.intern('Pattern'): 1,
+  }
+
+
+##  Matrix operations
+##
+def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
+  '''Multiplies two matrices.'''
+  return (a0*a1+c0*b1,    b0*a1+d0*b1,
+          a0*c1+c0*d1,    b0*c1+d0*d1,
+          a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
+
+def apply_matrix((a,b,c,d,e,f), (x,y)):
+  '''Applies a matrix to a coordination.'''
+  return (a*x+c*y+e, b*x+d*y+f)
+
+def cs_params(cs):
+  t = cs[0]
+  if t == LITERAL_ICC_BASED:
+    return stream_value(cs[1]).dic['N']
+  elif t == LITERAL_DEVICE_N:
+    return len(list_value(cs[1]))
+  else:
+    return CS_COMPONENTS[t]
+
+
+##  Fonts
+##
+
+# PDFFont
+class PDFFont:
+  
+  def __init__(self, fontid, descriptor, widths, default_width=None):
+    self.fontid = fontid
+    self.descriptor = descriptor
+    self.widths = widths
+    self.fontname = descriptor['FontName']
+    if isinstance(self.fontname, PSLiteral):
+      self.fontname = literal_name(self.fontname)
+    self.ascent = descriptor['Ascent']
+    self.descent = descriptor['Descent']
+    self.default_width = default_width or descriptor.get('MissingWidth', 0)
+    self.leading = descriptor.get('Leading', 0)
+    self.bbox = descriptor['FontBBox']
+    return
+
+  def __repr__(self):
+    return '<PDFFont: fontid=%r>' % (self.fontid,)
+
+  def is_vertical(self):
+    return False
+  
+  def decode(self, bytes):
+    return map(ord, bytes)
+
+  def char_width(self, cid):
+    return self.widths.get(cid, self.default_width)
+
+  def char_disp(self, cid):
+    return 0
+  
+  def string_width(self, s):
+    return sum( self.char_width(cid) for cid in self.decode(s) )
+  
+
+# PDFSimpleFont
+class PDFSimpleFont(PDFFont):
+  
+  def __init__(self, fontid, descriptor, widths, spec):
+    # Font encoding is specified either by a name of
+    # built-in encoding or a dictionary that describes
+    # the differences.
+    if 'Encoding' in spec:
+      encoding = resolve1(spec['Encoding'])
+    else:
+      encoding = LITERAL_STANDARD_ENCODING
+    if isinstance(encoding, dict):
+      name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
+      diff = encoding.get('Differences', None)
+      self.encoding = EncodingDB.get_encoding(name, diff)
+    else:
+      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
+    self.ucs2_cmap = None
+    if 'ToUnicode' in spec:
+      strm = stream_value(spec['ToUnicode'])
+      self.ucs2_cmap = CMap()
+      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
+    PDFFont.__init__(self, fontid, descriptor, widths)
+    return
+
+  def to_unicode(self, cid):
+    if not self.ucs2_cmap:
+      try:
+        return self.encoding[cid]
+      except KeyError:
+        raise PDFUnicodeNotDefined(None, cid)
+    code = self.ucs2_cmap.tocode(cid)
+    if not code:
+      raise PDFUnicodeNotDefined(None, cid)
+    chars = unpack('>%dH' % (len(code)/2), code)
+    return ''.join( unichr(c) for c in chars )
+
+
+# PDFType1Font
+class PDFType1Font(PDFSimpleFont):
+  
+  def __init__(self, fontid, spec):
+    if 'BaseFont' not in spec:
+      raise PDFFontError('BaseFont is missing')
+    self.basefont = literal_name(spec['BaseFont'])
+    try:
+      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
+    except KeyError:
+      try:
+        descriptor = dict_value(spec['FontDescriptor'])
+        firstchar = int_value(spec['FirstChar'])
+        lastchar = int_value(spec['LastChar'])
+        widths = dict( (i+firstchar,w) for (i,w)
+                       in enumerate(list_value(spec['Widths'])) )
+      except KeyError, k:
+        raise PDFFontError('%s is missing' % k)
+    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    return
+
+# PDFTrueTypeFont
+class PDFTrueTypeFont(PDFType1Font):
+  '''TrueType simple font; behaves exactly like PDFType1Font.'''
+  pass
+
+# PDFType3Font
+class PDFType3Font(PDFSimpleFont):
+  def __init__(self, fontid, spec):
+    try:
+      firstchar = int_value(spec['FirstChar'])
+      lastchar = int_value(spec['LastChar'])
+      widths = dict( (i+firstchar,w) for (i,w)
+                     in enumerate(list_value(spec['Widths'])) )
+    except KeyError, k:
+      raise PDFFontError('%s is missing' % k)
+    if 'FontDescriptor' in spec:
+      descriptor = dict_value(spec['FontDescriptor'])
+    else:
+      descriptor = {'FontName':fontid, 'Ascent':0, 'Descent':0,
+                    'FontBBox':spec['FontBBox']}
+    PDFSimpleFont.__init__(self, fontid, descriptor, widths, spec)
+    return
+
+# PDFCIDFont
+
+##  TrueTypeFont
+##
+class TrueTypeFont:
+
+  class CMapNotFound(Exception): pass
+  
+  def __init__(self, name, fp):
+    self.name = name
+    self.fp = fp
+    self.tables = {}
+    fonttype = fp.read(4)
+    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+    for i in xrange(ntables):
+      (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
+      self.tables[name] = (offset, length)
+    return
+
+  def create_cmap(self):
+    if 'cmap' not in self.tables: raise TrueTypeFont.CMapNotFound
+    (base_offset, length) = self.tables['cmap']
+    fp = self.fp
+    fp.seek(base_offset)
+    (version, nsubtables) = unpack('>HH', fp.read(4))
+    subtables = []
+    for i in xrange(nsubtables):
+      subtables.append(unpack('>HHL', fp.read(8)))
+    char2gid = {}
+    # Only supports subtable type 0, 2 and 4.
+    for (_1, _2, st_offset) in subtables:
+      fp.seek(base_offset+st_offset)
+      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
+      if fmttype == 0:
+        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
+      elif fmttype == 2:
+        subheaderkeys = unpack('>256H', fp.read(512))
+        firstbytes = [0]*8192
+        for (i,k) in enumerate(subheaderkeys):
+          firstbytes[k/8] = i
+        nhdrs = max(subheaderkeys)/8 + 1
+        hdrs = []
+        for i in xrange(nhdrs):
+          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
+          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
+        for (i,firstcode,entcount,delta,pos) in hdrs:
+          if not entcount: continue
+          first = firstcode + (firstbytes[i] << 8)
+          fp.seek(pos)
+          for c in xrange(entcount):
+            gid = unpack('>H', fp.read(2))
+            if gid:
+              gid += delta
+            char2gid[first+c] = gid
+      elif fmttype == 4:
+        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+        segcount /= 2
+        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        fp.read(2)
+        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
+        pos = fp.tell()
+        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
+          if idr:
+            fp.seek(pos+idr)
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
+          else:
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (c + idd) & 0xffff
+    gid2char = dict( (gid, pack('>H', char))
+                     for (char,gid) in char2gid.iteritems() )
+    cmapname = 'Adobe-Identity-UCS-%s' % self.name
+    return CMap(cmapname).update(char2gid, gid2char)
+
+class PDFCIDFont(PDFFont):
+  '''CID-keyed font: character codes are decoded to CIDs through a
+  CMap, and CIDs are mapped to unicode through a second CMap.'''
+  
+  def __init__(self, fontid, spec):
+    if 'BaseFont' not in spec:
+      raise PDFFontError('BaseFont is missing')
+    try:
+      self.cidsysteminfo = dict_value(spec['CIDSystemInfo'])
+      # "<Registry>-<Ordering>", used to pick the unicode CMap below.
+      self.cidcoding = '%s-%s' % (self.cidsysteminfo['Registry'],
+                                  self.cidsysteminfo['Ordering'])
+    except KeyError:
+      raise PDFFontError('CIDSystemInfo not properly defined.')
+    self.basefont = literal_name(spec['BaseFont'])
+    # CMap that decodes character codes to CIDs.
+    self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
+    descriptor = dict_value(spec['FontDescriptor'])
+    ttf = None
+    if 'FontFile2' in descriptor:
+      self.fontfile = stream_value(descriptor.get('FontFile2'))
+      ttf = TrueTypeFont(self.basefont,
+                         StringIO(self.fontfile.get_data()))
+    # CMap that maps CIDs to unicode; chosen in this order:
+    # embedded ToUnicode stream, the TrueType cmap table (only for
+    # Adobe-Identity), or the "<cidcoding>-UCS2" CMap from CMapDB.
+    self.ucs2_cmap = None
+    if 'ToUnicode' in spec:
+      strm = stream_value(spec['ToUnicode'])
+      self.ucs2_cmap = CMap()
+      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).parse()
+    elif self.cidcoding == 'Adobe-Identity':
+      if ttf:
+        try:
+          self.ucs2_cmap = ttf.create_cmap()
+        except TrueTypeFont.CMapNotFound:
+          pass
+    else:
+      self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding)
+    
+    # Expands a width array into a {cid: value} dictionary.  The array
+    # is a sequence of either "first [v1 v2 ...]" (consecutive CIDs
+    # starting at first) or "first last v" (every CID in the range).
+    def get_width(seq):
+      dic = {}
+      char1 = char2 = None
+      for v in seq:
+        if char1 == None:
+          char1 = v
+        elif char2 == None and isinstance(v, int):
+          char2 = v
+        else:
+          if char2 == None:
+            for (i,w) in enumerate(v):
+              dic[char1+i] = w
+          else:
+            for i in xrange(char1, char2+1):
+              dic[i] = v
+          char1 = char2 = None
+      return dic
+    self.vertical = self.cmap.is_vertical()
+    if self.vertical:
+      # writing mode: vertical
+      # Each W2 value is unpacked as a (displacement, width) pair.
+      dic = get_width(list_value(spec.get('W2', [])))
+      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
+      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
+      (d,w) = spec.get('DW2', [880, -1000])
+      default_width = w
+      self.default_disp = d
+    else:
+      # writing mode: horizontal
+      widths = get_width(list_value(spec.get('W', [])))
+      self.disps = {}
+      default_width = spec.get('DW', 1000)
+      self.default_disp = 0
+    PDFFont.__init__(self, fontid, descriptor, widths, default_width)
+    return
+
+  def is_vertical(self):
+    '''Returns True if the encoding CMap is vertical.'''
+    return self.vertical
+  
+  def decode(self, bytes):
+    '''Returns an iterator over the CIDs decoded from bytes.'''
+    return self.cmap.decode(bytes)
+
+  def char_disp(self, cid):
+    '''Returns the displacement of cid, or the default displacement.'''
+    return self.disps.get(cid, self.default_disp)
+
+  def to_unicode(self, cid):
+    '''Returns the unicode string for cid.
+    Raises PDFUnicodeNotDefined(cidcoding, cid) if there is no
+    unicode CMap or cid has no mapping in it.'''
+    if not self.ucs2_cmap:
+      raise PDFUnicodeNotDefined(self.cidcoding, cid)
+    code = self.ucs2_cmap.tocode(cid)
+    if not code:
+      raise PDFUnicodeNotDefined(self.cidcoding, cid)
+    # The code is a sequence of 2-byte big-endian values.
+    chars = unpack('>%dH' % (len(code)/2), code)
+    return ''.join( unichr(c) for c in chars )
+
+
+##  Resource Manager
+##
+class PDFResourceManager:
+
+  '''
+  ResourceManager facilitates reuse of shared resources
+  such as fonts, images and cmaps so that large objects are not
+  allocated multiple times.
+  '''
+  
+  def __init__(self, debug=0):
+    self.debug = debug
+    self.fonts = {}
+    return
+
+  def get_procset(self, procs):
+    for proc in procs:
+      if proc == LITERAL_PDF:
+        pass
+      elif proc == LITERAL_TEXT:
+        pass
+      else:
+        #raise PDFResourceError('ProcSet %r is not supported.' % proc)
+        pass
+    return
+  
+  def get_cmap(self, name):
+    return CMapDB.get_cmap(name)
+
+  def get_font(self, fontid, spec):
+    if fontid in self.fonts:
+      font = self.fonts[fontid]
+    else:
+      spec = dict_value(spec)
+      assert spec['Type'] == LITERAL_FONT
+      # Create a Font object.
+      if 'Subtype' not in spec:
+        raise PDFFontError('Font Subtype is not specified.')
+      subtype = literal_name(spec['Subtype'])
+      if subtype in ('Type1', 'MMType1'):
+        # Type1 Font
+        font = PDFType1Font(fontid, spec)
+      elif subtype == 'TrueType':
+        # TrueType Font
+        font = PDFTrueTypeFont(fontid, spec)
+      elif subtype == 'Type3':
+        # Type3 Font
+        font = PDFType3Font(fontid, spec)
+      elif subtype in ('CIDFontType0', 'CIDFontType2'):
+        # CID Font
+        font = PDFCIDFont(fontid, spec)
+      elif subtype == 'Type0':
+        # Type0 Font
+        dfonts = list_value(spec['DescendantFonts'])
+        assert len(dfonts) == 1
+        subspec = dict_value(dfonts[0]).copy()
+        for k in ('Encoding', 'ToUnicode'):
+          if k in spec:
+            subspec[k] = resolve1(spec[k])
+        font = self.get_font(fontid, subspec)
+      else:
+        raise PDFFontError('Invalid Font: %r' % spec)
+      self.fonts[fontid] = font
+    return font
+
+
+##  PDFDevice
+##
+class PDFDevice:
+  
+  def __init__(self, rsrc):
+    self.rsrc = rsrc
+    self.ctm = None
+    return
+  
+  def __repr__(self):
+    return '<PDFDevice>'
+
+  def close(self):
+    return
+
+  def set_ctm(self, ctm):
+    self.ctm = ctm
+    return
+
+  def begin_block(self, name):
+    return
+  def end_block(self):
+    return
+  
+  def render_string(self, textstate, textmatrix, size, seq):
+    raise NotImplementedError
+
+
+##  Interpreter
+##
+class PDFPageInterpreter:
+  
+  class TextState:
+    def __init__(self):
+      self.font = None
+      self.fontsize = 0
+      self.charspace = 0
+      self.wordspace = 0
+      self.scaling = 100
+      self.leading = 0
+      self.render = 0
+      self.rise = 0
+      self.reset()
+      return
+    def __repr__(self):
+      return ('<TextState: font=%r, fontsize=%r, matrix=%r,'
+              ' charspace=%r, wordspace=%r, scaling=%r, leading=%r,'
+              ' render=%r, rise=%r>' %
+              (self.font, self.fontsize, self.matrix,
+               self.charspace, self.wordspace, self.scaling, self.leading,
+               self.render, self.rise))
+    def reset(self):
+      self.matrix = MATRIX_IDENTITY
+      self.linematrix = (0, 0)
+      return
+
+  def __init__(self, rsrc, device, debug=0):
+    self.rsrc = rsrc
+    self.device = device
+    self.debug = debug
+    return
+
+  def initpage(self, ctm):
+    self.fontmap = {}
+    self.xobjmap = {}
+    self.csmap = {}
+    # gstack: stack for graphical states.
+    self.gstack = []
+    self.ctm = ctm
+    self.device.set_ctm(self.ctm)
+    self.textstate = PDFPageInterpreter.TextState()
+    # argstack: stack for command arguments.
+    self.argstack = []
+    # set some global states.
+    self.scs = None
+    self.ncs = None
+    return
+
+  def push(self, obj):
+    self.argstack.append(obj)
+    return
+
+  def pop(self, n):
+    x = self.argstack[-n:]
+    self.argstack = self.argstack[:-n]
+    return x
+
+  def get_current_state(self):
+    return (self.ctm, self.textstate)
+  
+  def set_current_state(self, state):
+    (self.ctm, self.textstate) = state
+    self.device.set_ctm(self.ctm)
+    return
+
+  # gsave
+  def do_q(self):
+    self.gstack.append(self.get_current_state())
+    return
+  # grestore
+  def do_Q(self):
+    if self.gstack:
+      self.set_current_state(self.gstack.pop())
+    return
+  
+  # concat-matrix
+  def do_cm(self, a1, b1, c1, d1, e1, f1):
+    self.ctm = mult_matrix((a1,b1,c1,d1,e1,f1), self.ctm)
+    self.device.set_ctm(self.ctm)
+    return
+  
+  # setlinewidth
+  def do_w(self, width): return
+  # setlinecap
+  def do_J(self, cap): return
+  # setlinejoin
+  def do_j(self, join): return
+  # setmiterlimit
+  def do_M(self, limit): return
+  # setdash
+  def do_d(self, dash, phase): return
+  # setintent
+  def do_ri(self, intent): return
+  # setflatness
+  def do_i(self, flatness): return
+  # savedict
+  def do_gs(self, name): return
+  
+  # moveto
+  def do_m(self, x, y): return
+  # lineto
+  def do_l(self, x, y): return
+  # curveto
+  def do_c(self, x1, y1, x2, y2, x3, y3): return
+  # urveto
+  def do_v(self, x2, y2, x3, y3): return
+  # rveto
+  def do_y(self, x1, y1, x3, y3): return
+  # closepath
+  def do_h(self): return
+  # rectangle
+  def do_re(self, x, y, w, h): return
+  
+  # stroke
+  def do_S(self): return
+  # close-and-stroke
+  def do_s(self): return
+  # fill
+  def do_f(self): return
+  # fill (obsolete)
+  do_F = do_f
+  # fill-even-odd
+  def do_f_a(self): return
+  # fill-and-stroke
+  def do_B(self): return
+  # fill-and-stroke-even-odd
+  def do_B_a(self): return
+  # close-fill-and-stroke
+  def do_b(self): return
+  # close-fill-and-stroke-even-odd
+  def do_b_a(self): return
+  # close-only
+  def do_n(self): return
+  # clip
+  def do_W(self): return
+  # clip-even-odd
+  def do_W_a(self): return
+  
+  # setcolorspace-stroking
+  def do_CS(self, name):
+    self.scs = self.csmap.get(literal_name(name), None)
+    return
+  # setcolorspace-non-stroking
+  def do_cs(self, name):
+    self.ncs = self.csmap.get(literal_name(name), None)
+    return
+  # setgray-stroking
+  def do_G(self, gray):
+    self.do_CS(LITERAL_DEVICE_GRAY)
+    return
+  # setgray-non-stroking
+  def do_g(self, gray):
+    self.do_cs(LITERAL_DEVICE_GRAY)
+    return
+  # setrgb-stroking
+  def do_RG(self, r, g, b):
+    self.do_CS(LITERAL_DEVICE_RGB)
+    return
+  # setrgb-non-stroking
+  def do_rg(self, r, g, b):
+    self.do_cs(LITERAL_DEVICE_RGB)
+    return
+  # setcmyk-stroking
+  def do_K(self, c, m, y, k):
+    self.do_CS(LITERAL_DEVICE_CMYK)
+    return
+  # setcmyk-non-stroking
+  def do_k(self, c, m, y, k):
+    self.do_cs(LITERAL_DEVICE_CMYK)
+    return
+
+  # setcolor
+  def do_SCN(self):
+    '''
+    Handles "SCN": removes from the operand stack as many values as
+    cs_params() reports for the stroking colour space (self.scs).
+    '''
+    count = cs_params(self.scs)
+    self.pop(count)
+    return
+  def do_scn(self):
+    '''
+    Handles "scn": removes from the operand stack as many values as
+    cs_params() reports for the non-stroking colour space (self.ncs).
+    '''
+    count = cs_params(self.ncs)
+    self.pop(count)
+    return
+  def do_SC(self):
+    '''Handles "SC" by delegating to the "SCN" handler.'''
+    handler = self.do_SCN
+    handler()
+    return
+  def do_sc(self):
+    '''Handles "sc" by delegating to the "scn" handler.'''
+    handler = self.do_scn
+    handler()
+    return
+    
+  # shading-name
+  def do_sh(self, name):
+    '''No-op handler for the "sh" operator; the operand is discarded.'''
+    return None
+  
+  # begin-text
+  def do_BT(self):
+    '''Handles "BT" by resetting the current text state.'''
+    state = self.textstate
+    state.reset()
+    return
+  # end-text
+  def do_ET(self):
+    '''No-op handler for the "ET" operator.'''
+    return None
+
+  # begin-compat
+  def do_BX(self):
+    '''No-op handler for the "BX" operator.'''
+    return None
+  # end-compat
+  def do_EX(self):
+    '''No-op handler for the "EX" operator.'''
+    return None
+
+  # marked content operators
+  def do_MP(self, tag):
+    '''No-op handler for the "MP" operator; the operand is discarded.'''
+    return None
+  def do_DP(self, tag, props):
+    '''No-op handler for the "DP" operator; both operands are discarded.'''
+    return None
+  def do_BMC(self, tag):
+    '''No-op handler for the "BMC" operator; the operand is discarded.'''
+    return None
+  def do_BDC(self, tag, props):
+    '''No-op handler for the "BDC" operator; both operands are discarded.'''
+    return None
+  def do_EMC(self):
+    '''No-op handler for the "EMC" operator.'''
+    return None
+
+  # setcharspace
+  def do_Tc(self, space):
+    '''Handles "Tc": stores the operand as textstate.charspace.'''
+    state = self.textstate
+    state.charspace = space
+    return
+  # setwordspace
+  def do_Tw(self, space):
+    '''Handles "Tw": stores the operand as textstate.wordspace.'''
+    state = self.textstate
+    state.wordspace = space
+    return
+  # textscale
+  def do_Tz(self, scale):
+    '''Handles "Tz": stores the operand as textstate.scaling.'''
+    state = self.textstate
+    state.scaling = scale
+    return
+  # setleading
+  def do_TL(self, leading):
+    '''Handles "TL": stores the operand as textstate.leading.'''
+    state = self.textstate
+    state.leading = leading
+    return
+  # selectfont
+  def do_Tf(self, fontid, fontsize):
+    '''
+    Handles "Tf": selects the font registered in self.fontmap under the
+    literal's name and records the font size in the text state.
+    Raises PDFInterpreterError when the font id is not in fontmap.
+    '''
+    try:
+      font = self.fontmap[literal_name(fontid)]
+    except KeyError:
+      raise PDFInterpreterError('Undefined font id: %r' % fontid)
+    state = self.textstate
+    state.font = font
+    state.fontsize = fontsize
+    return
+  # setrendering
+  def do_Tr(self, render):
+    '''Handles "Tr": stores the operand as textstate.render.'''
+    state = self.textstate
+    state.render = render
+    return
+  # settextrise
+  def do_Ts(self, rise):
+    '''Handles "Ts": stores the operand as textstate.rise.'''
+    state = self.textstate
+    state.rise = rise
+    return
+
+  # text-move
+  def do_Td(self, tx, ty):
+    '''
+    Handles "Td": starts a new line offset by (tx, ty) from the start
+    of the current one.
+    The offset is expressed in text space, so it is mapped through the
+    a,b,c,d components of the text matrix before being added to its
+    translation part (for an unscaled matrix this is a plain addition).
+    The offset accumulated within the line (linematrix) is reset.
+    '''
+    (a,b,c,d,e,f) = self.textstate.matrix
+    self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
+    self.textstate.linematrix = (0, 0)
+    return
+  # text-move
+  def do_TD(self, tx, ty):
+    '''
+    Handles "TD": same move as "Td", and additionally records -ty as
+    the leading used by later next-line operators.
+    The offset is expressed in text space, so it is mapped through the
+    a,b,c,d components of the text matrix before being added to its
+    translation part (for an unscaled matrix this is a plain addition).
+    The offset accumulated within the line (linematrix) is reset.
+    '''
+    (a,b,c,d,e,f) = self.textstate.matrix
+    self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f)
+    self.textstate.leading = -ty
+    self.textstate.linematrix = (0, 0)
+    return
+  # textmatrix
+  def do_Tm(self, a,b,c,d,e,f):
+    '''Handles "Tm": replaces the text matrix with the six operands and resets linematrix.'''
+    state = self.textstate
+    state.matrix = (a,b,c,d,e,f)
+    state.linematrix = (0, 0)
+    return
+  # nextline
+  def do_T_a(self):
+    '''
+    Handles "T*": moves to the start of the next line, i.e. performs
+    the same move as "Td" with the offset (0, -leading).
+    The leading is stored as a positive distance ("TD" records -ty),
+    so it has to be subtracted, and as a text-space offset it is mapped
+    through the c,d components of the text matrix.
+    The offset accumulated within the line (linematrix) is reset.
+    '''
+    (a,b,c,d,e,f) = self.textstate.matrix
+    leading = self.textstate.leading
+    self.textstate.matrix = (a,b,c,d,e-leading*c,f-leading*d)
+    self.textstate.linematrix = (0, 0)
+    return
+  
+  # show-pos
+  def do_TJ(self, seq):
+    '''
+    Handles "TJ": shows a sequence mixing strings and numeric adjustments.
+    The strings are concatenated to measure the text, the numbers are
+    summed and subtracted from the glyph width, and the resulting advance
+    is passed to the device together with the untouched sequence.
+    Afterwards linematrix is advanced along y for vertical fonts and
+    along x otherwise.
+    '''
+    ts = self.textstate
+    font = ts.font
+    (a,b,c,d,e,f) = ts.matrix
+    (lx,ly) = ts.linematrix
+    text = ''.join( x for x in seq if isinstance(x, str) )
+    adjust = sum( x for x in seq if not isinstance(x, str) )
+    # advance = (glyph widths + char spacing + word spacing) * horizontal scaling
+    w = ((font.string_width(text)-adjust)/1000.0 * ts.fontsize +
+         len(text) * ts.charspace +
+         text.count(' ')*ts.wordspace) * ts.scaling / 100.0
+    self.device.render_string(ts, (a,b,c,d,e+lx,f+ly), w, seq)
+    if font.is_vertical():
+      ts.linematrix = (lx,ly+w)
+    else:
+      ts.linematrix = (lx+w,ly)
+    return
+  # show
+  def do_Tj(self, s):
+    '''Handles "Tj": shows one string by wrapping it in a list for do_TJ.'''
+    seq = [s]
+    self.do_TJ(seq)
+    return
+  # quote
+  def do__q(self, s):
+    '''Handles "'": moves to the next line (do_T_a), then shows the string (do_TJ).'''
+    seq = [s]
+    self.do_T_a()
+    self.do_TJ(seq)
+    return
+  # doublequote
+  def do__w(self, aw, ac, s):
+    '''
+    Handles the double-quote operator: sets the word spacing to aw and
+    the character spacing to ac, moves to the next line and shows the
+    string, i.e. "aw Tw ac Tc" followed by what do__q does.
+    '''
+    self.do_Tw(aw)
+    self.do_Tc(ac)
+    # like the single-quote operator, this one starts a new line first.
+    self.do_T_a()
+    self.do_TJ([s])
+    return
+
+  # inline image
+  def do_BI(self):
+    '''Placeholder for the "BI" operator (noted as never called).'''
+    return None
+  def do_ID(self):
+    '''Placeholder for the "ID" operator (noted as never called).'''
+    return None
+  def do_EI(self, obj):
+    '''No-op handler for the "EI" operator; the operand is discarded.'''
+    return None
+
+  # invoke an XObject
+  def do_Do(self, xobjid):
+    '''
+    Handles "Do": looks the named XObject up in self.xobjmap and, when
+    its Subtype is LITERAL_FORM, renders its contents with a fresh
+    PDFPageInterpreter using the XObject's own Resources and Matrix
+    (MATRIX_IDENTITY when no Matrix is given). Other subtypes are ignored.
+    Raises PDFInterpreterError when the id is not in xobjmap.
+    '''
+    xobjid = literal_name(xobjid)
+    try:
+      xobj = stream_value(self.xobjmap[xobjid])
+    except KeyError:
+      raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
+    if not (xobj.dic['Subtype'] == LITERAL_FORM):
+      return
+    if 1 <= self.debug:
+      print >>stderr, 'Processing xobj: %r' % xobj
+    interpreter = PDFPageInterpreter(self.rsrc, self.device)
+    resources = xobj.dic['Resources']
+    matrix = xobj.dic.get('Matrix', MATRIX_IDENTITY)
+    interpreter.render_contents(xobjid, resources, [xobj], matrix)
+    return
+
+  def process_page(self, page):
+    '''
+    Renders one page: passes its resources and contents to
+    render_contents under the block id "page-<pageid>".
+    '''
+    if 1 <= self.debug:
+      print >>stderr, 'Processing page: %r' % page
+    contid = 'page-%d' % page.pageid
+    self.render_contents(contid, page.resources, page.contents)
+    return
+
+  def render_contents(self, contid, resources, contents, ctm=MATRIX_IDENTITY):
+    '''
+    Renders a list of content streams as one device block.
+    After initpage(ctm) and device.begin_block(contid), the resource
+    dictionary is scanned: Font entries are loaded into self.fontmap,
+    ColorSpace entries into self.csmap, XObject entries into
+    self.xobjmap, and ProcSet is handed to self.rsrc.get_procset.
+    Every stream is then executed and the block is closed.
+    '''
+    self.initpage(ctm)
+    self.device.begin_block(contid)
+    # Handle resource declarations.
+    for (kind,spec) in dict_value(resources).iteritems():
+      if 1 <= self.debug:
+        print >>stderr, 'Resource: %r: %r' % (kind,spec)
+      if kind == 'Font':
+        for (fontid,fontrsrc) in dict_value(spec).iteritems():
+          self.fontmap[fontid] = self.rsrc.get_font(fontid, fontrsrc)
+        continue
+      if kind == 'ColorSpace':
+        for (csid,csspec) in dict_value(spec).iteritems():
+          self.csmap[csid] = list_value(csspec)
+        continue
+      if kind == 'ProcSet':
+        self.rsrc.get_procset(list_value(spec))
+        continue
+      if kind == 'XObject':
+        for (xobjid,xobjstrm) in dict_value(spec).iteritems():
+          self.xobjmap[xobjid] = xobjstrm
+    # Run the content streams in order.
+    for stream in list_value(contents):
+      self.execute(stream_value(stream))
+    self.device.end_block()
+    return
+  
+  def execute(self, stream):
+    '''
+    Interprets one content stream.
+    Non-keyword objects are pushed onto the operand stack. A keyword is
+    dispatched to the method "do_<name>", where "*", the double quote
+    and the single quote in the name are spelled "_a", "_w" and "_q".
+    The handler's declared parameter count (minus self) decides how
+    many operands are popped and passed to it.
+    Raises PDFInterpreterError for a keyword that has no handler.
+    '''
+    for obj in stream.parse_data(inline=True, debug=self.debug):
+      if not isinstance(obj, PSKeyword):
+        self.push(obj)
+        continue
+      name = 'do_%s' % obj.name.replace('*','_a').replace('"','_w').replace("'",'_q')
+      if not hasattr(self, name):
+        raise PDFInterpreterError('unknown operator: %r' % obj.name)
+      func = getattr(self, name)
+      nargs = func.func_code.co_argcount-1
+      if not nargs:
+        if 1 <= self.debug:
+          print >>stderr, 'exec: %s' % (obj.name)
+        func()
+        continue
+      args = self.pop(nargs)
+      if 1 <= self.debug:
+        print >>stderr, 'exec: %s %r' % (obj.name, args)
+      if len(args) == nargs:
+        func(*args)
+    return
--- a/pdfparser.py
+++ b/pdfparser.py
--- a/psparser.py
+++ b/psparser.py
@ -0,0 +1,396 @@
+#!/usr/bin/env python
+import sys, re
+stderr = sys.stderr
+from utils import choplist
+
+
+##  PS Exceptions
+##
+class PSException(Exception):
+  '''Root of the PS* exception classes defined in this module.'''
+class PSSyntaxError(PSException):
+  '''PSException subclass used for malformed input.'''
+class PSTypeError(PSException):
+  '''PSException subclass used when an object has an unexpected type.'''
+class PSValueError(PSException):
+  '''PSException subclass for value errors.'''
+
+
+##  PostScript Types
+##
+class PSLiteral:
+  '''
+  A PS literal, written "/Name" in the source.
+  Caution: do not instantiate this class yourself; obtain instances
+  through PSLiteralTable.intern().
+  '''
+
+  def __init__(self, name):
+    self.name = name
+    return
+
+  def __repr__(self):
+    text = '/%s' % self.name
+    return text
+
+class PSKeyword:
+  '''
+  A PS keyword such as "showpage".
+  Caution: do not instantiate this class yourself; obtain instances
+  through PSKeywordTable.intern().
+  '''
+
+  def __init__(self, name):
+    self.name = name
+    return
+
+  def __repr__(self):
+    text = self.name
+    return text
+
+class PSSymbolTable:
+  '''
+  Name-to-symbol cache: every distinct name maps to one instance of
+  the class given at construction (PSLiteral or PSKeyword).
+  '''
+
+  def __init__(self, classe):
+    self.classe = classe
+    self.dic = {}
+    return
+
+  def intern(self, name):
+    '''Returns the symbol registered for name, creating it on first use.'''
+    try:
+      return self.dic[name]
+    except KeyError:
+      pass
+    sym = self.dic[name] = self.classe(name)
+    return sym
+
+# Module-wide symbol tables: one for PSLiteral objects, one for PSKeyword
+# objects. intern(name) hands back the same object for the same name.
+PSLiteralTable = PSSymbolTable(PSLiteral)
+PSKeywordTable = PSSymbolTable(PSKeyword)
+
+
+def literal_name(x):
+  '''Returns the name of a PSLiteral; raises PSTypeError for anything else.'''
+  if isinstance(x, PSLiteral):
+    return x.name
+  raise PSTypeError('literal required: %r' % x)
+
+def keyword_name(x):
+  '''Returns the name of a PSKeyword; raises PSTypeError for anything else.'''
+  if isinstance(x, PSKeyword):
+    return x.name
+  raise PSTypeError('keyword required: %r' % x)
+
+
+##  PSBaseParser
+##
+class PSBaseParser:
+
+  '''PostScript parser that performs only basic tokenization.'''
+
+  def __init__(self, fp, debug=0):
+    '''
+    fp is the input file object (seek, tell and read are used).
+    A debug level of 2 or more traces the parser's work to stderr.
+    '''
+    self.fp = fp
+    self.debug = debug
+    self.bufsize = 4096
+    self.seek(0)
+    return
+
+  def __repr__(self):
+    return '<PSBaseParser: %r>' % (self.fp,)
+
+  def seek(self, pos):
+    '''
+    seeks to the given pos and discards any buffered input.
+    '''
+    if 2 <= self.debug:
+      print >>stderr, 'seek:', pos
+    self.fp.seek(pos)
+    self.linepos = pos
+    self.linebuf = None
+    self.curpos = 0
+    self.line = ''
+    return
+
+  EOLCHAR = re.compile(r'[\r\n]')
+  def nextline(self):
+    '''
+    fetches the next line that ends either with \\r or \\n.
+    The terminator ('\\r', '\\n' or '\\r\\n') is kept in the returned
+    string; an empty string means end of file.
+    '''
+    line = ''
+    eol = None
+    while 1:
+      if not self.linebuf or len(self.linebuf) <= self.curpos:
+        # fetch next chunk.
+        self.linebuf = self.fp.read(self.bufsize)
+        if not self.linebuf:
+          # at EOF.
+          break
+        self.curpos = 0
+      if eol:
+        # a terminator was already seen; only peek at the next character.
+        c = self.linebuf[self.curpos]
+        # handle '\r\n'
+        if (eol == '\r' and c == '\n'):
+          line += c
+          self.curpos += 1
+        break
+      m = self.EOLCHAR.search(self.linebuf, self.curpos)
+      if m:
+        i = m.end(0)
+        line += self.linebuf[self.curpos:i]
+        eol = self.linebuf[i-1]
+        self.curpos = i
+      else:
+        # fetch further
+        line += self.linebuf[self.curpos:]
+        self.linebuf = None
+    self.linepos += len(line)
+    return line
+
+  def revreadlines(self):
+    '''
+    fetches lines backward. used to locate trailers.
+    Every yielded string starts with the '\\r' or '\\n' that precedes
+    it in the file, except the very first line of the file, which is
+    yielded last and as is.
+    '''
+    self.fp.seek(0, 2)
+    pos = self.fp.tell()
+    buf = ''
+    while 0 < pos:
+      prevpos = pos
+      pos = max(0, pos-self.bufsize)
+      self.fp.seek(pos)
+      # read exactly the span not seen yet, so that the last (short)
+      # chunk does not overlap the one read before it.
+      s = self.fp.read(prevpos-pos)
+      if not s: break
+      while 1:
+        n = max(s.rfind('\r'), s.rfind('\n'))
+        if n == -1:
+          # no terminator in this chunk: it is the middle of a line.
+          buf = s + buf
+          break
+        # buf holds the rest of this line, read from the following chunk.
+        yield s[n:]+buf
+        s = s[:n]
+        buf = ''
+    if buf:
+      # the first line of the file has no preceding terminator.
+      yield buf
+    return
+
+  SPECIAL = r'%\[\]()<>{}/\000\011\012\014\015\040'
+  TOKEN = re.compile(r'<<|>>|[%\[\]()<>{}/]|[^'+SPECIAL+r']+')
+  LITERAL = re.compile(r'([^#'+SPECIAL+r']|#[0-9abcdefABCDEF]{2})+')
+  # accepts "12", "-3.5", "4." and also ".5" / "-.002"
+  NUMBER = re.compile(r'[+-]?(?:[0-9]+\.?[0-9]*|\.[0-9]+)$')
+  STRING_NORM = re.compile(r'(\\[0-9]{1,3}|\\.|[^\)])+')
+  STRING_NORM_SUB = re.compile(r'\\[0-7]{1,3}|\\.')
+  STRING_HEX = re.compile(r'[\s0-9a-fA-F]+')
+  STRING_HEX_SUB = re.compile(r'[0-9a-fA-F]{1,2}')
+  # single-letter escapes that stand for a control character
+  ESCAPES = {'n': '\n', 'r': '\r', 't': '\t', 'b': '\b', 'f': '\f'}
+
+  def parse(self):
+    '''
+    Yields a list of basic tokens: keywords, literals, strings,
+    numbers and parentheses. Comments are skipped.
+    Nested objects (i.e. arrays and dictionaries) are not handled.
+    Each item is a (pos, token) pair; pos is the offset of the line
+    holding the token plus the token's column.
+    Raises PSSyntaxError for a string that is not terminated.
+    '''
+    def convesc(m):
+      # m matches STRING_NORM_SUB: a backslash followed either by
+      # 1-3 octal digits or by one other character.
+      x = m.group(0)[1:]
+      if x[0] in '01234567':
+        # octal character code; bits above the low eight are dropped.
+        return chr(int(x, 8) & 0xff)
+      return self.ESCAPES.get(x, x)
+
+    def convhex(m1):
+      return chr(int(m1.group(0), 16))
+
+    while 1:
+      # do not strip line! we need to distinguish last '\n' or '\r'
+      linepos0 = self.linepos
+      self.line = self.nextline()
+      if not self.line: break
+      if 2 <= self.debug:
+        print >>stderr, 'line: (%d) %r' % (self.linepos, self.line)
+      # do this before removing comment
+      if self.line.startswith('%%EOF'): break
+      charpos = 0
+
+      # tokenize
+      while 1:
+        m = self.TOKEN.search(self.line, charpos)
+        if not m: break
+        t = m.group(0)
+        pos = linepos0 + m.start(0)
+        charpos = m.end(0)
+
+        if t == '%':
+          # skip comment
+          if 2 <= self.debug:
+            print >>stderr, 'comment: %r' % self.line[charpos:]
+          break
+
+        elif t == '/':
+          # literal object
+          mn = self.LITERAL.match(self.line, charpos)
+          if mn:
+            name = mn.group(0)
+            charpos = mn.end(0)
+          else:
+            # a slash followed by a delimiter: the empty name.
+            name = ''
+          lit = PSLiteralTable.intern(name)
+          yield (pos, lit)
+          if 2 <= self.debug:
+            print >>stderr, 'name: %r' % lit
+
+        elif t == '(':
+          # normal string object
+          s = ''
+          while 1:
+            ms = self.STRING_NORM.match(self.line, charpos)
+            if not ms: break
+            s1 = ms.group(0)
+            charpos = ms.end(0)
+            if len(s1) == 1 and s1[-1] == '\\':
+              s += s1[-1:]
+              # the string goes on; keep the offsets of the tokens that
+              # follow it relative to the line they are actually on.
+              linepos0 = self.linepos
+              self.line = self.nextline()
+              if not self.line:
+                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+                                    (self.linepos, self.line))
+              charpos = 0
+            elif charpos == len(self.line):
+              s += s1
+              linepos0 = self.linepos
+              self.line = self.nextline()
+              if not self.line:
+                raise PSSyntaxError('end inside string: linepos=%d, line=%r' %
+                                    (self.linepos, self.line))
+              charpos = 0
+            else:
+              s += s1
+              break
+          # the slice is empty (not an IndexError) at the end of the line.
+          if self.line[charpos:charpos+1] != ')':
+            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+                                (self.linepos, self.line))
+          charpos += 1
+          s = self.STRING_NORM_SUB.sub(convesc, s)
+          if 2 <= self.debug:
+            print >>stderr, 'str: %r' % s
+          yield (pos, s)
+
+        elif t == '<':
+          # hex string object
+          ms = self.STRING_HEX.match(self.line, charpos)
+          if ms:
+            # whitespace between the digits carries no data.
+            hexstr = ''.join(ms.group(0).split())
+            charpos = ms.end(0)
+          else:
+            # "<>": empty string
+            hexstr = ''
+          if self.line[charpos:charpos+1] != '>':
+            raise PSSyntaxError('no close paren: linepos=%d, line=%r' %
+                                (self.linepos, self.line))
+          charpos += 1
+          s = self.STRING_HEX_SUB.sub(convhex, hexstr)
+          if 2 <= self.debug:
+            print >>stderr, 'str: %r' % s
+          yield (pos, s)
+
+        elif self.NUMBER.match(t):
+          # number
+          if '.' in t:
+            n = float(t)
+          else:
+            n = int(t)
+          if 2 <= self.debug:
+            print >>stderr, 'number: %r' % n
+          yield (pos, n)
+
+        elif t in ('true','false'):
+          # boolean
+          if 2 <= self.debug:
+            print >>stderr, 'boolean: %r' % t
+          yield (pos, (t == 'true'))
+
+        else:
+          # other token
+          if 2 <= self.debug:
+            print >>stderr, 'keyword: %r' % t
+          yield (pos, PSKeywordTable.intern(t))
+
+    return
+
+
+##  PSStackParser
+##
+class PSStackParser(PSBaseParser):
+
+  '''
+  PostScript parser that recognizes compound objects
+  such as arrays and dictionaries.
+  '''
+
+  def __init__(self, fp, debug=0):
+    PSBaseParser.__init__(self, fp, debug=debug)
+    # stack of (type, enclosing partobj) for the objects being built
+    self.context = []
+    # elements of the innermost object being built; set up by parse()
+    self.partobj = None
+    return
+
+  def do_token(self, pos, token):
+    '''
+    Handles special tokens.
+    Returns true if the token denotes the end of an object.
+    This base implementation accepts nothing and always returns False.
+    '''
+    return False
+
+  def push(self, obj):
+    '''
+    Push an object to the stack.
+    '''
+    self.partobj.append(obj)
+    return
+
+  def pop(self, n):
+    '''
+    Pop N objects from the stack and return them as a list, in the
+    order they were pushed. Popping zero objects returns an empty
+    list and leaves the stack as it is.
+    Raises PSSyntaxError when fewer than N objects are available.
+    '''
+    if len(self.partobj) < n:
+      raise PSSyntaxError('stack too short < %d' % n)
+    # cut at an explicit index: with n == 0 the slices [-n:] and [:-n]
+    # would select everything and nothing respectively.
+    cut = len(self.partobj) - n
+    r = self.partobj[cut:]
+    self.partobj = self.partobj[:cut]
+    return r
+
+  def popall(self):
+    '''
+    Discards all the objects on the stack.
+    '''
+    self.partobj = []
+    return
+
+  def parse(self):
+    '''
+    Yields a list of objects: keywords, literals, strings,
+    numbers, arrays and dictionaries. Arrays and dictionaries
+    are represented as Python sequence and dictionaries.
+    Parsing stops at the end of input or as soon as do_token()
+    returns true; the objects collected so far are returned.
+    Raises PSTypeError for unbalanced brackets, for a dictionary with
+    an odd number of elements, or for a dictionary key that is not a
+    literal.
+    '''
+
+    def startobj(type):
+      # suspend the object being built and start a new, empty one.
+      self.context.append((type, self.partobj))
+      self.partobj = []
+      return
+
+    def endobj(type1):
+      # finish the current object and resume the enclosing one.
+      assert self.context
+      obj = self.partobj
+      (type0, self.partobj) = self.context.pop()
+      if type0 != type1:
+        raise PSTypeError('type mismatch: %r(%r) != %r(%r)' %
+                          (type0, self.partobj, type1, obj))
+      return obj
+
+    # 'o' is the outermost level, closed by the final endobj('o').
+    startobj('o')
+
+    for (pos,t) in PSBaseParser.parse(self):
+      if isinstance(t, int) or isinstance(t, float):
+        self.push(t)
+      elif isinstance(t, str):
+        self.push(t)
+      elif isinstance(t, PSLiteral):
+        self.push(t)
+      else:
+        c = keyword_name(t)
+        if c == '{' or c == '}':
+          self.push(t)
+        elif c == '[':
+          # begin array
+          if 2 <= self.debug:
+            print >>stderr, 'start array'
+          startobj('a')
+        elif c == ']':
+          # end array
+          a = endobj('a')
+          if 2 <= self.debug:
+            print >>stderr, 'end array: %r' % a
+          self.push(a)
+        elif c == '<<':
+          # begin dictionary
+          if 2 <= self.debug:
+            print >>stderr, 'start dict'
+          startobj('d')
+        elif c == '>>':
+          # end dictionary
+          objs = endobj('d')
+          if len(objs) % 2 != 0:
+            raise PSTypeError('invalid dictionary construct: %r' % objs)
+          d = dict( (literal_name(k), v) for (k,v) in choplist(2, objs) )
+          if 2 <= self.debug:
+            print >>stderr, 'end dict: %r' % d
+          self.push(d)
+        elif self.do_token(pos, t):
+          break
+
+    return endobj('o')
--- a/utils.py
+++ b/utils.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python
+
+##  Utilities
+##
+def choplist(n, seq):
+  '''
+  Yields the elements of seq in consecutive tuples of n.
+  Trailing elements that do not fill a whole tuple are dropped.
+  '''
+  group = []
+  for item in seq:
+    group.append(item)
+    if len(group) != n:
+      continue
+    yield tuple(group)
+    group = []
+  return
+
+def nunpack(s, default=0):
+  '''
+  Unpacks up to 4 bytes as a big-endian unsigned integer.
+  An empty string gives default.
+  Raises TypeError when s is longer than 4 bytes.
+  '''
+  # imported here because this module has no import of its own for it.
+  from struct import unpack
+  l = len(s)
+  if not l:
+    return default
+  elif l == 1:
+    return ord(s)
+  elif l == 2:
+    return unpack('>H', s)[0]
+  elif l == 3:
+    # pad to four bytes so that the 32-bit format can be used.
+    return unpack('>L', '\x00'+s)[0]
+  elif l == 4:
+    return unpack('>L', s)[0]
+  else:
+    raise TypeError('invalid length: %d' % l)