tmp

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-01-10 09:14:46 +00:00 · 2009-01-10 09:14:46 +00:00 · c41c279321
parent 24bdd33557
commit c41c279321
9 changed files with 740 additions and 700 deletions
--- a/pdflib/page.py
+++ b/pdflib/page.py
@ -2,10 +2,53 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
+from pdflib.pdffont import PDFUnicodeNotDefined
 from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
 ##  PDFDevice
 ##
 class PDFDevice(object):
  '''
  Base device class. It keeps the resource object and the most
  recently set ctm; every notification hook is a no-op here, and
  the two render_* methods are left unimplemented.
  '''
  debug = 0
  def __init__(self, rsrc):
    '''Store rsrc; ctm stays None until set_ctm() is called.'''
    (self.rsrc, self.ctm) = (rsrc, None)
  def __repr__(self):
    return '<PDFDevice>'
  def close(self):
    '''No-op in the base class.'''
  def set_ctm(self, ctm):
    '''Remember ctm on the device.'''
    self.ctm = ctm
  def begin_tag(self, tag, props=None):
    '''No-op in the base class.'''
  def end_tag(self):
    '''No-op in the base class.'''
  def do_tag(self, tag, props=None):
    '''No-op in the base class.'''
  def begin_page(self, page):
    '''No-op in the base class.'''
  def end_page(self, page):
    '''No-op in the base class.'''
  def begin_figure(self, name, bbox):
    '''No-op in the base class.'''
  def end_figure(self, name):
    '''No-op in the base class.'''
  def render_string(self, textstate, textmatrix, seq):
    '''Not implemented in the base class.'''
    raise NotImplementedError
  def render_image(self, stream, size, matrix):
    '''Not implemented in the base class.'''
    raise NotImplementedError
 ##  PageItem
 ##
 class PageItem(object):
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -3,10 +3,10 @@ import sys
 stdout = sys.stdout
 stderr = sys.stderr
 from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
+from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
-     PDFPageInterpreter, PDFUnicodeNotDefined
+from pdflib.pdffont import PDFUnicodeNotDefined
 from pdflib.cmap import CMapDB
-from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
+from pdflib.page import PDFDevice, PageItem, FigureItem, TextItem, PageAggregator
 def enc(x, codec):
--- a/pdflib/pdfcolor.py
+++ b/pdflib/pdfcolor.py
@ -0,0 +1,35 @@
 #!/usr/bin/env python
 import sys
 stderr = sys.stderr
 from pdflib.psparser import PSLiteralTable
 ##  ColorSpace
 ##
 LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
 LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
 LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
 class ColorSpace(object):
  '''A color space name paired with its number of components.'''
  def __init__(self, name, ncomponents):
    self.ncomponents = ncomponents
    self.name = name
  def __repr__(self):
    fields = (self.name, self.ncomponents)
    return '<ColorSpace: %s, ncomponents=%d>' % fields
 # ColorSpace objects for the color space names known in advance,
 # keyed by name; the number paired with each name is passed to
 # ColorSpace as its ncomponents.
 PREDEFINED_COLORSPACE = dict(
  (csname, ColorSpace(csname, ncomps)) for (csname, ncomps) in dict(
  CalRGB=3,
  CalGray=1,
  Lab=3,
  DeviceRGB=3,
  DeviceCMYK=4,
  DeviceGray=1,
  Separation=1,
  Indexed=1,
  Pattern=1
  ).iteritems())
--- a/pdflib/pdffont.py
+++ b/pdflib/pdffont.py
@ -0,0 +1,341 @@
 #!/usr/bin/env python
 import sys
 stderr = sys.stderr
 from struct import pack, unpack
 try:
  from cStringIO import StringIO
 except ImportError:
  from StringIO import StringIO
 from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
     literal_name, keyword_name, STRICT
 from pdflib.pdftypes import PDFException, \
     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
 from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
 ##  Fonts
 ##
 # Raised by the font classes in this module, e.g. when BaseFont,
 # Encoding or FontDescriptor is missing while STRICT is set.
 class PDFFontError(PDFException): pass
 # Raised by the to_unicode() methods when a cid has no mapping.
 class PDFUnicodeNotDefined(PDFFontError): pass
 LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
 # PDFFont
 class PDFFont(object):
  '''
  Base font object: keeps the font descriptor, a width table keyed
  by character id, and a few metrics read from the descriptor.
  '''
  def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
    '''
    descriptor: mapping read with .get() for FontName, Ascent,
      Descent, MissingWidth, Leading and FontBBox.
    widths: mapping from character id to width.
    default_width: width used for ids missing from widths; when it
      is falsy the descriptor's MissingWidth (default 0) is used.
    font_matrix: matrix tuple; (.001,0,0,.001,0,0) when falsy.
    '''
    self.descriptor = descriptor
    self.widths = widths
    self.fontname = descriptor.get('FontName', 'unknown')
    if isinstance(self.fontname, PSLiteral):
      self.fontname = literal_name(self.fontname)
    self.ascent = num_value(descriptor.get('Ascent', 0))
    self.descent = num_value(descriptor.get('Descent', 0))
    # MissingWidth is read through num_value() just like Ascent,
    # Descent and Leading, so it gets the same treatment instead of
    # being stored raw (num_value presumably resolves indirect
    # references and checks the type -- confirm in pdftypes).
    self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
    self.leading = num_value(descriptor.get('Leading', 0))
    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
    self.font_matrix = font_matrix or (.001,0,0,.001,0,0)
    return
  def __repr__(self):
    return '<PDFFont>'
  def is_vertical(self):
    '''The base font is never vertical.'''
    return False
  def is_multibyte(self):
    '''The base font decodes one code per byte.'''
    return False
  def decode(self, bytes):
    '''Return the ordinal of every item of bytes.'''
    return map(ord, bytes)
  def char_width(self, cid):
    '''Width of cid, falling back to default_width.'''
    return self.widths.get(cid, self.default_width)
  def char_disp(self, cid):
    '''Displacement of cid; always 0 in the base class.'''
    return 0
  def string_width(self, s):
    '''Sum of char_width() over the codes decoded from s.'''
    return sum( self.char_width(cid) for cid in self.decode(s) )
 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
  '''
  Font whose codes are mapped either through an encoding table from
  EncodingDB or, when the spec has a ToUnicode stream, through the
  CMap parsed from that stream.
  '''
  def __init__(self, descriptor, widths, spec, font_matrix=None):
    # The Encoding entry is either the name of a built-in encoding
    # or a dictionary describing differences from a base encoding;
    # StandardEncoding is used when the entry is absent.
    encoding = LITERAL_STANDARD_ENCODING
    if 'Encoding' in spec:
      encoding = resolve1(spec['Encoding'])
    if not isinstance(encoding, dict):
      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
    else:
      base = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
      differences = list_value(encoding.get('Differences', None))
      self.encoding = EncodingDB.get_encoding(base, differences)
    self.ucs2_cmap = None
    if 'ToUnicode' in spec:
      tounicode = stream_value(spec['ToUnicode'])
      cmap = CMap()
      self.ucs2_cmap = cmap
      CMapParser(cmap, StringIO(tounicode.get_data())).run()
    PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
    return
  def to_unicode(self, cid):
    '''
    Return the text for cid, or raise PDFUnicodeNotDefined when
    neither the ToUnicode map nor the encoding table knows it.
    '''
    if self.ucs2_cmap:
      code = self.ucs2_cmap.tocode(cid)
      if not code:
        raise PDFUnicodeNotDefined(None, cid)
      # code is unpacked as a run of big-endian 16-bit units.
      nchars = len(code)/2
      return ''.join( unichr(c) for c in unpack('>%dH' % nchars, code) )
    try:
      return self.encoding[cid]
    except KeyError:
      raise PDFUnicodeNotDefined(None, cid)
 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
  '''
  Simple font whose metrics come from FontMetricsDB when the base
  font name is found there, and from the font spec otherwise.
  '''
  def __init__(self, spec):
    try:
      basefont = literal_name(spec['BaseFont'])
    except KeyError:
      if STRICT:
        raise PDFFontError('BaseFont is missing')
      basefont = 'unknown'
    self.basefont = basefont
    try:
      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
    except KeyError:
      # Not in FontMetricsDB: take descriptor and widths from spec.
      descriptor = dict_value(spec.get('FontDescriptor', {}))
      first = int_value(spec.get('FirstChar', 0))
      last = int_value(spec.get('LastChar', 255))  # evaluated but not used afterwards
      widths = {}
      for (offset, w) in enumerate(list_value(spec.get('Widths', [0]*256))):
        widths[first+offset] = w
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
  def __repr__(self):
    template = '<PDFType1Font: basefont=%r>'
    return template % self.basefont
 # PDFTrueTypeFont
 class PDFTrueTypeFont(PDFType1Font):
  '''Same construction as PDFType1Font; only the repr differs.'''
  def __repr__(self):
    template = '<PDFTrueTypeFont: basefont=%r>'
    return template % self.basefont
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
  '''
  Simple font built entirely from its spec: widths, an optional
  FontDescriptor (a minimal one is synthesized when absent) and the
  spec's FontMatrix.
  '''
  def __init__(self, spec):
    first = int_value(spec.get('FirstChar', 0))
    last = int_value(spec.get('LastChar', 0))  # evaluated but not used afterwards
    width_table = {}
    for (offset, w) in enumerate(list_value(spec.get('Widths', [0]*256))):
      width_table[first+offset] = w
    if 'FontDescriptor' not in spec:
      # No descriptor: build one from the spec's Name and FontBBox.
      descriptor = dict(FontName=spec.get('Name'),
                        Ascent=0, Descent=0,
                        FontBBox=spec['FontBBox'])
    else:
      descriptor = dict_value(spec['FontDescriptor'])
    matrix = tuple(list_value(spec.get('FontMatrix')))
    PDFSimpleFont.__init__(self, descriptor, width_table, spec,
                           font_matrix=matrix)
    return
  def __repr__(self):
    return '<PDFType3Font>'
 # PDFCIDFont
 ##  TrueTypeFont
 ##
 class TrueTypeFont(object):
  '''
  Minimal reader for a TrueType font file: it indexes the table
  directory and can turn the 'cmap' table into a CMap object.
  '''
  class CMapNotFound(Exception): pass
  def __init__(self, name, fp):
    '''
    name: font name, used to label the generated CMap.
    fp: seekable file object positioned at the start of the font.
    '''
    self.name = name
    self.fp = fp
    self.tables = {}
    fp.read(4)  # font type tag; not used
    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
    for _ in range(ntables):
      (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
      self.tables[tag] = (offset, length)
    return
  def create_cmap(self):
    '''
    Build a CMap from the font's 'cmap' table.
    Raises TrueTypeFont.CMapNotFound when the font has no such table.
    '''
    char2gid = self._read_char2gid()
    gid2char = dict( (gid, pack('>H', char))
                     for (char,gid) in char2gid.iteritems() )
    cmapname = 'Adobe-Identity-UCS-%s' % self.name
    return CMap(cmapname).update(char2gid, gid2char)
  def _read_char2gid(self):
    '''
    Parse the 'cmap' table and return a dict of char code -> glyph id.
    Only subtable formats 0, 2 and 4 are read; others are ignored.
    '''
    if 'cmap' not in self.tables:
      raise TrueTypeFont.CMapNotFound
    (base_offset, length) = self.tables['cmap']
    fp = self.fp
    fp.seek(base_offset)
    (version, nsubtables) = unpack('>HH', fp.read(4))
    subtables = []
    for _ in range(nsubtables):
      subtables.append(unpack('>HHL', fp.read(8)))
    char2gid = {}
    for (_1, _2, st_offset) in subtables:
      fp.seek(base_offset+st_offset)
      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
      if fmttype == 0:
        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
      elif fmttype == 2:
        subheaderkeys = unpack('>256H', fp.read(512))
        # NOTE(review): every first byte whose key is 0 overwrites
        # firstbytes[0], so sub-header 0 ends up tagged with the last
        # such byte -- confirm this is intended.
        firstbytes = [0]*8192
        for (i,k) in enumerate(subheaderkeys):
          firstbytes[k//8] = i
        nhdrs = max(subheaderkeys)//8 + 1
        hdrs = []
        for i in range(nhdrs):
          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
          # offset is relative to the offset field itself, which is
          # the last two bytes just read.
          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
        for (i,firstcode,entcount,delta,pos) in hdrs:
          if not entcount: continue
          first = firstcode + (firstbytes[i] << 8)
          fp.seek(pos)
          for c in range(entcount):
            # unpack() returns a 1-tuple; take the integer out of it.
            (gid,) = unpack('>H', fp.read(2))
            if gid:
              # 0 means "missing glyph" and gets no delta; other
              # values wrap modulo 65536.
              gid = (gid + delta) & 0xffff
            char2gid[first+c] = gid
      elif fmttype == 4:
        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        segcount //= 2
        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
        fp.read(2)
        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
        pos = fp.tell()
        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
        for (i,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
          if idr:
            # idRangeOffset is relative to the position of its own
            # array slot (pos + 2*i), not to the start of the array.
            fp.seek(pos + 2*i + idr)
            for c in range(sc, ec+1):
              (gid,) = unpack('>H', fp.read(2))
              if gid:
                # 0 means "missing glyph" and gets no delta.
                gid = (gid + idd) & 0xffff
              char2gid[c] = gid
          else:
            for c in range(sc, ec+1):
              char2gid[c] = (c + idd) & 0xffff
    return char2gid
 class PDFCIDFont(PDFFont):
  '''
  Multibyte font: byte strings are decoded to cids through a CMap
  taken from CMapDB, and cids are mapped to text through a second
  CMap (ucs2_cmap) when one can be obtained.
  '''
  def __init__(self, spec):
    '''
    Build the font from its spec dictionary. Raises PDFFontError
    when a required CMap cannot be found, or (under STRICT) when
    BaseFont, Encoding or FontDescriptor is missing.
    '''
    try:
      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
      if STRICT:
        raise PDFFontError('BaseFont is missing')
      self.basefont = 'unknown'
    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
    # cidcoding is "<Registry>-<Ordering>", e.g. compared against
    # 'Adobe-Identity' below.
    self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                self.cidsysteminfo.get('Ordering', 'unknown'))
    try:
      name = literal_name(spec['Encoding'])
    except KeyError:
      if STRICT:
        raise PDFFontError('Encoding is unspecified')
      name = 'unknown'
    # The CMap used by decode(); a lookup failure becomes PDFFontError.
    try:
      self.cmap = CMapDB.get_cmap(name, strict=STRICT)
    except CMapDB.CMapNotFound, e:
      raise PDFFontError(e)
    try:
      descriptor = dict_value(spec['FontDescriptor'])
    except KeyError:
      if STRICT:
        raise PDFFontError('FontDescriptor is missing')
      descriptor = {}
    # An embedded FontFile2 stream is opened as a TrueTypeFont.
    ttf = None
    if 'FontFile2' in descriptor:
      self.fontfile = stream_value(descriptor.get('FontFile2'))
      ttf = TrueTypeFont(self.basefont,
                         StringIO(self.fontfile.get_data()))
    # ucs2_cmap, in order of preference: the spec's ToUnicode stream;
    # for 'Adobe-Identity', the cmap of the embedded TrueType font
    # (left as None when there is none); otherwise the
    # '<cidcoding>-UCS2' CMap from CMapDB.
    self.ucs2_cmap = None
    if 'ToUnicode' in spec:
      strm = stream_value(spec['ToUnicode'])
      self.ucs2_cmap = CMap()
      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
    elif self.cidcoding == 'Adobe-Identity':
      if ttf:
        try:
          self.ucs2_cmap = ttf.create_cmap()
        except TrueTypeFont.CMapNotFound:
          pass
    else:
      try:
        self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
                                         strict=STRICT)
      except CMapDB.CMapNotFound, e:
        raise PDFFontError(e)
    def get_width(seq):
      # Turn a flat sequence into a dict keyed by integer. Two entry
      # shapes are recognized:
      #   c [v0 v1 ...]  -> dic[c+i] = vi
      #   c1 c2 v        -> dic[i] = v for every i in c1..c2
      dic = {}
      char1 = char2 = None
      for v in seq:
        if char1 == None:
          char1 = v
        elif char2 == None and isinstance(v, int):
          char2 = v
        else:
          if char2 == None:
            for (i,w) in enumerate(v):
              dic[char1+i] = w
          else:
            for i in xrange(char1, char2+1):
              dic[i] = v
          char1 = char2 = None
      return dic
    self.vertical = self.cmap.is_vertical()
    if self.vertical:
      # writing mode: vertical
      # NOTE(review): get_width() stores the items it finds as-is,
      # while the two loops below unpack every value as a (d, w)
      # pair -- looks like a W2 holding plain numbers would fail
      # here; confirm against real W2 data.
      dic = get_width(list_value(spec.get('W2', [])))
      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
      (d,w) = spec.get('DW2', [880, -1000])
      default_width = w
      self.default_disp = d
    else:
      # writing mode: horizontal
      widths = get_width(list_value(spec.get('W', [])))
      self.disps = {}
      default_width = spec.get('DW', 1000)
      self.default_disp = 0
    PDFFont.__init__(self, descriptor, widths, default_width=default_width)
    return
  def __repr__(self):
    return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
  def is_vertical(self):
    # Reports what the decoding CMap said at construction time.
    return self.vertical
  def is_multibyte(self):
    return True
  def decode(self, bytes):
    # Decoding is delegated to the CMap chosen in __init__.
    return self.cmap.decode(bytes)
  def char_disp(self, cid):
    # Per-cid displacement, falling back to default_disp.
    return self.disps.get(cid, self.default_disp)
  def to_unicode(self, cid):
    # Raises PDFUnicodeNotDefined when there is no ucs2_cmap or it
    # has no code for cid; otherwise the code is unpacked as a run
    # of big-endian 16-bit units.
    if not self.ucs2_cmap:
      raise PDFUnicodeNotDefined(self.cidcoding, cid)
    code = self.ucs2_cmap.tocode(cid)
    if not code:
      raise PDFUnicodeNotDefined(self.cidcoding, cid)
    chars = unpack('>%dH' % (len(code)/2), code)
    return ''.join( unichr(c) for c in chars )
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -6,33 +6,22 @@ try:
  from cStringIO import StringIO
 except ImportError:
  from StringIO import StringIO
-from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+from pdflib.psparser import PSException, PSTypeError, PSEOF, \
-     PSStackParser, PSLiteral, PSKeyword, STRICT, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
-     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
+     PSStackParser, PSKeyword, STRICT
-from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
+from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
-     int_value, float_value, num_value, \
+     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
-from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
-from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
+from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
 from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
     LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
 ##  Exceptions
 ##
 # Exception classes of this module; all derive from PDFException,
 # and PDFUnicodeNotDefined is a kind of PDFFontError.
 class PDFResourceError(PDFException): pass
 class PDFInterpreterError(PDFException): pass
 class PDFFontError(PDFException): pass
 class PDFUnicodeNotDefined(PDFFontError): pass
 ##  ColorSpace
 ##
 class ColorSpace(object):
  '''A color space name paired with its number of components.'''
  def __init__(self, name, ncomponents):
    self.ncomponents = ncomponents
    self.name = name
  def __repr__(self):
    fields = (self.name, self.ncomponents)
    return '<ColorSpace: %s, ncomponents=%d>' % fields
 ##  Constants
@ -42,344 +31,6 @@ LITERAL_TEXT = PSLiteralTable.intern('Text')
 LITERAL_FONT = PSLiteralTable.intern('Font')
 LITERAL_FORM = PSLiteralTable.intern('Form')
 LITERAL_IMAGE = PSLiteralTable.intern('Image')
 LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
 LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
 LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
 LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
 KEYWORD_BI = PSKeywordTable.intern('BI')
 KEYWORD_ID = PSKeywordTable.intern('ID')
 KEYWORD_EI = PSKeywordTable.intern('EI')
 MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
 # ColorSpace objects for the color space names known in advance,
 # keyed by name; the number paired with each name is passed to
 # ColorSpace as its ncomponents.
 PREDEFINED_COLORSPACE = dict(
  (csname, ColorSpace(csname, ncomps)) for (csname, ncomps) in dict(
  CalRGB=3,
  CalGray=1,
  Lab=3,
  DeviceRGB=3,
  DeviceCMYK=4,
  DeviceGray=1,
  Separation=1,
  Indexed=1,
  Pattern=1
  ).iteritems())
 ##  Fonts
 ##
 # PDFFont
 class PDFFont(object):
  '''
  Base font object: keeps the font descriptor, a width table keyed
  by character id, and a few metrics read from the descriptor.
  '''
  def __init__(self, descriptor, widths, default_width=None):
    '''
    descriptor: mapping read with .get() for FontName, Ascent,
      Descent, MissingWidth, Leading and FontBBox.
    widths: mapping from character id to width.
    default_width: width used for ids missing from widths; when it
      is falsy the descriptor's MissingWidth (default 0) is used.
    '''
    self.descriptor = descriptor
    self.widths = widths
    self.fontname = descriptor.get('FontName', 'unknown')
    if isinstance(self.fontname, PSLiteral):
      self.fontname = literal_name(self.fontname)
    self.ascent = num_value(descriptor.get('Ascent', 0))
    self.descent = num_value(descriptor.get('Descent', 0))
    # MissingWidth is read through num_value() just like Ascent,
    # Descent and Leading, so it gets the same treatment instead of
    # being stored raw (num_value presumably resolves indirect
    # references and checks the type -- confirm where it is defined).
    self.default_width = default_width or num_value(descriptor.get('MissingWidth', 0))
    self.leading = num_value(descriptor.get('Leading', 0))
    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
    return
  def __repr__(self):
    return '<PDFFont>'
  def is_vertical(self):
    '''The base font is never vertical.'''
    return False
  def is_multibyte(self):
    '''The base font decodes one code per byte.'''
    return False
  def decode(self, bytes):
    '''Return the ordinal of every item of bytes.'''
    return map(ord, bytes)
  def char_width(self, cid):
    '''Width of cid, falling back to default_width.'''
    return self.widths.get(cid, self.default_width)
  def char_disp(self, cid):
    '''Displacement of cid; always 0 in the base class.'''
    return 0
  def string_width(self, s):
    '''Sum of char_width() over the codes decoded from s.'''
    return sum( self.char_width(cid) for cid in self.decode(s) )
 # PDFSimpleFont
 class PDFSimpleFont(PDFFont):
  '''
  Font whose codes are mapped either through an encoding table from
  EncodingDB or, when the spec has a ToUnicode stream, through the
  CMap parsed from that stream.
  '''
  def __init__(self, descriptor, widths, spec):
    # The Encoding entry is either the name of a built-in encoding
    # or a dictionary describing differences from a base encoding;
    # StandardEncoding is used when the entry is absent.
    encoding = LITERAL_STANDARD_ENCODING
    if 'Encoding' in spec:
      encoding = resolve1(spec['Encoding'])
    if not isinstance(encoding, dict):
      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
    else:
      base = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
      differences = list_value(encoding.get('Differences', None))
      self.encoding = EncodingDB.get_encoding(base, differences)
    self.ucs2_cmap = None
    if 'ToUnicode' in spec:
      tounicode = stream_value(spec['ToUnicode'])
      cmap = CMap()
      self.ucs2_cmap = cmap
      CMapParser(cmap, StringIO(tounicode.get_data())).run()
    PDFFont.__init__(self, descriptor, widths)
    return
  def to_unicode(self, cid):
    '''
    Return the text for cid, or raise PDFUnicodeNotDefined when
    neither the ToUnicode map nor the encoding table knows it.
    '''
    if self.ucs2_cmap:
      code = self.ucs2_cmap.tocode(cid)
      if not code:
        raise PDFUnicodeNotDefined(None, cid)
      # code is unpacked as a run of big-endian 16-bit units.
      nchars = len(code)/2
      return ''.join( unichr(c) for c in unpack('>%dH' % nchars, code) )
    try:
      return self.encoding[cid]
    except KeyError:
      raise PDFUnicodeNotDefined(None, cid)
 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
  '''
  Simple font whose metrics come from FontMetricsDB when the base
  font name is found there, and from the font spec otherwise.
  '''
  def __init__(self, spec):
    try:
      basefont = literal_name(spec['BaseFont'])
    except KeyError:
      if STRICT:
        raise PDFFontError('BaseFont is missing')
      basefont = 'unknown'
    self.basefont = basefont
    try:
      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
    except KeyError:
      # Not in FontMetricsDB: take descriptor and widths from spec.
      descriptor = dict_value(spec.get('FontDescriptor', {}))
      first = int_value(spec.get('FirstChar', 0))
      last = int_value(spec.get('LastChar', 255))  # evaluated but not used afterwards
      widths = {}
      for (offset, w) in enumerate(list_value(spec.get('Widths', [0]*256))):
        widths[first+offset] = w
    PDFSimpleFont.__init__(self, descriptor, widths, spec)
    return
  def __repr__(self):
    template = '<PDFType1Font: basefont=%r>'
    return template % self.basefont
 # PDFTrueTypeFont
 class PDFTrueTypeFont(PDFType1Font):
  '''Same construction as PDFType1Font; only the repr differs.'''
  def __repr__(self):
    template = '<PDFTrueTypeFont: basefont=%r>'
    return template % self.basefont
 # PDFType3Font
 class PDFType3Font(PDFSimpleFont):
  '''
  Simple font built entirely from its spec: widths and an optional
  FontDescriptor (a minimal one is synthesized when absent).
  '''
  def __init__(self, spec):
    first = int_value(spec.get('FirstChar', 0))
    last = int_value(spec.get('LastChar', 0))  # evaluated but not used afterwards
    width_table = {}
    for (offset, w) in enumerate(list_value(spec.get('Widths', [0]*256))):
      width_table[first+offset] = w
    if 'FontDescriptor' not in spec:
      # No descriptor: build one from the spec's Name and FontBBox.
      descriptor = dict(FontName=spec.get('Name'),
                        Ascent=0, Descent=0,
                        FontBBox=spec['FontBBox'])
    else:
      descriptor = dict_value(spec['FontDescriptor'])
    PDFSimpleFont.__init__(self, descriptor, width_table, spec)
    return
  def __repr__(self):
    return '<PDFType3Font>'
 # PDFCIDFont
 ##  TrueTypeFont
 ##
 class TrueTypeFont(object):
  '''
  Minimal reader for a TrueType font file: it indexes the table
  directory and can turn the 'cmap' table into a CMap object.
  '''
  class CMapNotFound(Exception): pass
  def __init__(self, name, fp):
    '''
    name: font name, used to label the generated CMap.
    fp: seekable file object positioned at the start of the font.
    '''
    self.name = name
    self.fp = fp
    self.tables = {}
    fp.read(4)  # font type tag; not used
    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
    for _ in range(ntables):
      (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
      self.tables[tag] = (offset, length)
    return
  def create_cmap(self):
    '''
    Build a CMap from the font's 'cmap' table.
    Raises TrueTypeFont.CMapNotFound when the font has no such table.
    '''
    char2gid = self._read_char2gid()
    gid2char = dict( (gid, pack('>H', char))
                     for (char,gid) in char2gid.iteritems() )
    cmapname = 'Adobe-Identity-UCS-%s' % self.name
    return CMap(cmapname).update(char2gid, gid2char)
  def _read_char2gid(self):
    '''
    Parse the 'cmap' table and return a dict of char code -> glyph id.
    Only subtable formats 0, 2 and 4 are read; others are ignored.
    '''
    if 'cmap' not in self.tables:
      raise TrueTypeFont.CMapNotFound
    (base_offset, length) = self.tables['cmap']
    fp = self.fp
    fp.seek(base_offset)
    (version, nsubtables) = unpack('>HH', fp.read(4))
    subtables = []
    for _ in range(nsubtables):
      subtables.append(unpack('>HHL', fp.read(8)))
    char2gid = {}
    for (_1, _2, st_offset) in subtables:
      fp.seek(base_offset+st_offset)
      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
      if fmttype == 0:
        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
      elif fmttype == 2:
        subheaderkeys = unpack('>256H', fp.read(512))
        # NOTE(review): every first byte whose key is 0 overwrites
        # firstbytes[0], so sub-header 0 ends up tagged with the last
        # such byte -- confirm this is intended.
        firstbytes = [0]*8192
        for (i,k) in enumerate(subheaderkeys):
          firstbytes[k//8] = i
        nhdrs = max(subheaderkeys)//8 + 1
        hdrs = []
        for i in range(nhdrs):
          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
          # offset is relative to the offset field itself, which is
          # the last two bytes just read.
          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
        for (i,firstcode,entcount,delta,pos) in hdrs:
          if not entcount: continue
          first = firstcode + (firstbytes[i] << 8)
          fp.seek(pos)
          for c in range(entcount):
            # unpack() returns a 1-tuple; take the integer out of it.
            (gid,) = unpack('>H', fp.read(2))
            if gid:
              # 0 means "missing glyph" and gets no delta; other
              # values wrap modulo 65536.
              gid = (gid + delta) & 0xffff
            char2gid[first+c] = gid
      elif fmttype == 4:
        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        segcount //= 2
        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
        fp.read(2)
        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
        pos = fp.tell()
        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
        for (i,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
          if idr:
            # idRangeOffset is relative to the position of its own
            # array slot (pos + 2*i), not to the start of the array.
            fp.seek(pos + 2*i + idr)
            for c in range(sc, ec+1):
              (gid,) = unpack('>H', fp.read(2))
              if gid:
                # 0 means "missing glyph" and gets no delta.
                gid = (gid + idd) & 0xffff
              char2gid[c] = gid
          else:
            for c in range(sc, ec+1):
              char2gid[c] = (c + idd) & 0xffff
    return char2gid
 class PDFCIDFont(PDFFont):
  '''
  Multibyte font: byte strings are decoded to cids through a CMap
  taken from CMapDB, and cids are mapped to text through a second
  CMap (ucs2_cmap) when one can be obtained.
  '''
  def __init__(self, spec):
    '''
    Build the font from its spec dictionary. Raises PDFFontError
    when a required CMap cannot be found, or (under STRICT) when
    BaseFont, Encoding or FontDescriptor is missing.
    '''
    try:
      self.basefont = literal_name(spec['BaseFont'])
    except KeyError:
      if STRICT:
        raise PDFFontError('BaseFont is missing')
      self.basefont = 'unknown'
    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
    # cidcoding is "<Registry>-<Ordering>", e.g. compared against
    # 'Adobe-Identity' below.
    self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
                                self.cidsysteminfo.get('Ordering', 'unknown'))
    try:
      name = literal_name(spec['Encoding'])
    except KeyError:
      if STRICT:
        raise PDFFontError('Encoding is unspecified')
      name = 'unknown'
    # The CMap used by decode(); a lookup failure becomes PDFFontError.
    try:
      self.cmap = CMapDB.get_cmap(name, strict=STRICT)
    except CMapDB.CMapNotFound, e:
      raise PDFFontError(e)
    try:
      descriptor = dict_value(spec['FontDescriptor'])
    except KeyError:
      if STRICT:
        raise PDFFontError('FontDescriptor is missing')
      descriptor = {}
    # An embedded FontFile2 stream is opened as a TrueTypeFont.
    ttf = None
    if 'FontFile2' in descriptor:
      self.fontfile = stream_value(descriptor.get('FontFile2'))
      ttf = TrueTypeFont(self.basefont,
                         StringIO(self.fontfile.get_data()))
    # ucs2_cmap, in order of preference: the spec's ToUnicode stream;
    # for 'Adobe-Identity', the cmap of the embedded TrueType font
    # (left as None when there is none); otherwise the
    # '<cidcoding>-UCS2' CMap from CMapDB.
    self.ucs2_cmap = None
    if 'ToUnicode' in spec:
      strm = stream_value(spec['ToUnicode'])
      self.ucs2_cmap = CMap()
      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
    elif self.cidcoding == 'Adobe-Identity':
      if ttf:
        try:
          self.ucs2_cmap = ttf.create_cmap()
        except TrueTypeFont.CMapNotFound:
          pass
    else:
      try:
        self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
                                         strict=STRICT)
      except CMapDB.CMapNotFound, e:
        raise PDFFontError(e)
    def get_width(seq):
      # Turn a flat sequence into a dict keyed by integer. Two entry
      # shapes are recognized:
      #   c [v0 v1 ...]  -> dic[c+i] = vi
      #   c1 c2 v        -> dic[i] = v for every i in c1..c2
      dic = {}
      char1 = char2 = None
      for v in seq:
        if char1 == None:
          char1 = v
        elif char2 == None and isinstance(v, int):
          char2 = v
        else:
          if char2 == None:
            for (i,w) in enumerate(v):
              dic[char1+i] = w
          else:
            for i in xrange(char1, char2+1):
              dic[i] = v
          char1 = char2 = None
      return dic
    self.vertical = self.cmap.is_vertical()
    if self.vertical:
      # writing mode: vertical
      # NOTE(review): get_width() stores the items it finds as-is,
      # while the two loops below unpack every value as a (d, w)
      # pair -- looks like a W2 holding plain numbers would fail
      # here; confirm against real W2 data.
      dic = get_width(list_value(spec.get('W2', [])))
      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
      (d,w) = spec.get('DW2', [880, -1000])
      default_width = w
      self.default_disp = d
    else:
      # writing mode: horizontal
      widths = get_width(list_value(spec.get('W', [])))
      self.disps = {}
      default_width = spec.get('DW', 1000)
      self.default_disp = 0
    PDFFont.__init__(self, descriptor, widths, default_width)
    return
  def __repr__(self):
    return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
  def is_vertical(self):
    # Reports what the decoding CMap said at construction time.
    return self.vertical
  def is_multibyte(self):
    return True
  def decode(self, bytes):
    # Decoding is delegated to the CMap chosen in __init__.
    return self.cmap.decode(bytes)
  def char_disp(self, cid):
    # Per-cid displacement, falling back to default_disp.
    return self.disps.get(cid, self.default_disp)
  def to_unicode(self, cid):
    # Raises PDFUnicodeNotDefined when there is no ucs2_cmap or it
    # has no code for cid; otherwise the code is unpacked as a run
    # of big-endian 16-bit units.
    if not self.ucs2_cmap:
      raise PDFUnicodeNotDefined(self.cidcoding, cid)
    code = self.ucs2_cmap.tocode(cid)
    if not code:
      raise PDFUnicodeNotDefined(self.cidcoding, cid)
    chars = unpack('>%dH' % (len(code)/2), code)
    return ''.join( unichr(c) for c in chars )
 ##  Resource Manager
@ -388,7 +39,7 @@ class PDFResourceManager(object):
  '''
  ResourceManager facilitates reuse of shared resources
-  such as fonts, images and cmaps so that large objects are not
+  such as fonts and images so that large objects are not
  allocated multiple times.
  '''
  debug = 0
@ -399,24 +50,21 @@ class PDFResourceManager(object):
  def get_procset(self, procs):
    for proc in procs:
-      if proc == LITERAL_PDF:
+      if proc is LITERAL_PDF:
        pass
-      elif proc == LITERAL_TEXT:
+      elif proc is LITERAL_TEXT:
        pass
      else:
        #raise PDFResourceError('ProcSet %r is not supported.' % proc)
        pass
    return
  def get_cmap(self, name):
    return CMapDB.get_cmap(name, strict=STRICT)
  def get_font(self, objid, spec):
    if objid and objid in self.fonts:
      font = self.fonts[objid]
    else:
      if STRICT:
-        if spec['Type'] != LITERAL_FONT:
+        if spec['Type'] is not LITERAL_FONT:
          raise PDFFontError('Type is not /Font')
      # Create a Font object.
      if 'Subtype' in spec:
@ -455,49 +103,6 @@ class PDFResourceManager(object):
    return font
 ##  PDFDevice
 ##
 class PDFDevice(object):
  '''
  Base device class. It keeps the resource object and the most
  recently set ctm; every notification hook is a no-op here, and
  the two render_* methods are left unimplemented.
  '''
  debug = 0
  def __init__(self, rsrc):
    '''Store rsrc; ctm stays None until set_ctm() is called.'''
    (self.rsrc, self.ctm) = (rsrc, None)
  def __repr__(self):
    return '<PDFDevice>'
  def close(self):
    '''No-op in the base class.'''
  def set_ctm(self, ctm):
    '''Remember ctm on the device.'''
    self.ctm = ctm
  def begin_tag(self, tag, props=None):
    '''No-op in the base class.'''
  def end_tag(self):
    '''No-op in the base class.'''
  def do_tag(self, tag, props=None):
    '''No-op in the base class.'''
  def begin_page(self, page):
    '''No-op in the base class.'''
  def end_page(self, page):
    '''No-op in the base class.'''
  def begin_figure(self, name, bbox):
    '''No-op in the base class.'''
  def end_figure(self, name):
    '''No-op in the base class.'''
  def render_string(self, textstate, textmatrix, seq):
    '''Not implemented in the base class.'''
    raise NotImplementedError
  def render_image(self, stream, size, matrix):
    '''Not implemented in the base class.'''
    raise NotImplementedError
 ##  PDFContentParser
 ##
 class PDFContentParser(PSStackParser):
@ -565,11 +170,14 @@ class PDFContentParser(PSStackParser):
    self.add_results(*self.popall())
    return
  KEYWORD_BI = PSKeywordTable.intern('BI')
  KEYWORD_ID = PSKeywordTable.intern('ID')
  KEYWORD_EI = PSKeywordTable.intern('EI')
  def do_keyword(self, pos, token):
-    if token == KEYWORD_BI:
+    if token is self.KEYWORD_BI:
      # inline image within a content stream
      self.start_type(pos, 'inline')
-    elif token == KEYWORD_ID:
+    elif token is self.KEYWORD_ID:
      try:
        (_, objs) = self.end_type('inline')
        if len(objs) % 2 != 0:
@ -578,7 +186,7 @@ class PDFContentParser(PSStackParser):
        (pos, data) = self.get_inline_data(pos+len('ID '))
        obj = PDFStream(d, data)
        self.push((pos, obj))
-        self.push((pos, KEYWORD_EI))
+        self.push((pos, self.KEYWORD_EI))
      except PSTypeError:
        if STRICT: raise
    else:
@ -975,7 +583,7 @@ class PDFPageInterpreter(object):
    if 1 <= self.debug:
      print >>stderr, 'Processing xobj: %r' % xobj
    subtype = xobj.dic.get('Subtype')
-    if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
+    if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
      interpreter = self.dup()
      (x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
      ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
@ -985,7 +593,7 @@ class PDFPageInterpreter(object):
      self.device.begin_figure(xobjid, bbox)
      interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
      self.device.end_figure(xobjid)
-    elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
+    elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
      (x0,y0) = apply_matrix(self.ctm, (0,0))
      (x1,y1) = apply_matrix(self.ctm, (1,1))
      self.device.begin_figure(xobjid, (x0,y0,x1,y1))
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -7,26 +7,22 @@
 import sys, re
 import md5, struct
 stderr = sys.stderr
-from utils import choplist, nunpack
+from pdflib.utils import choplist, nunpack
-from arcfour import Arcfour
+from pdflib.arcfour import Arcfour
-from lzw import LZWDecoder
+from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
-from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
-     PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
+     STRICT
-     literal_name, keyword_name, \
+from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
-     PSStackParser, STRICT
+     PDFStream, PDFObjRef, resolve1, decipher_all, \
     int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
-##  PDF Exceptions
+##  Exceptions
 ##
 # Exception hierarchy. PDFException derives from PSException; the
 # syntax, encryption, type and value errors derive from
 # PDFException. Note that PDFNotImplementedError derives directly
 # from PSException, not from PDFException.
 class PDFException(PSException): pass
 class PDFSyntaxError(PDFException): pass
 class PDFNoValidXRef(PDFSyntaxError): pass
 class PDFEncryptionError(PDFException): pass
 class PDFPasswordIncorrect(PDFEncryptionError): pass
 class PDFTypeError(PDFException): pass
 class PDFValueError(PDFException): pass
 class PDFNotImplementedError(PSException): pass
 # some predefined literals and keywords.
 LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
@ -34,258 +30,10 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
 LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
 LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
 LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
                         PSLiteralTable.intern('Fl'))
 LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
                       PSLiteralTable.intern('LZW'))
 LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
                           PSLiteralTable.intern('A85'))
 KEYWORD_R = PSKeywordTable.intern('R')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
 KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
 KEYWORD_STREAM = PSKeywordTable.intern('stream')
 KEYWORD_XREF = PSKeywordTable.intern('xref')
 KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
 KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
 PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
 class PDFObject(PSObject): pass  # common base class of PDFObjRef and PDFStream
 ##  PDFObjRef
 ##
 class PDFObjRef(PDFObject):
  '''
  An indirect reference to an object held by a document.
  The referenced object is fetched through doc.getobj() on demand.
  '''
  def __init__(self, doc, objid, _):
    # The third argument (the generation number) is accepted
    # for the caller's convenience but is never stored.
    if objid == 0 and STRICT:
      raise PDFValueError('PDF object id cannot be 0.')
    self.doc = doc
    self.objid = objid
  def __repr__(self):
    return '<PDFObjRef:%d>' % self.objid
  def resolve(self):
    '''Look up the referenced object in the owning document.'''
    return self.doc.getobj(self.objid)
 # resolve
 def resolve1(x):
  '''
  Follow indirect references until a direct object is reached.
  Only the outermost object is resolved: a returned array or
  dictionary may still contain indirect objects inside.
  '''
  while True:
    if not isinstance(x, PDFObjRef):
      return x
    x = x.resolve()
 def resolve_all(x):
  '''
  Resolve X and, recursively, everything nested inside it, so that
  no indirect reference is left anywhere in the result.
  A list is rebuilt as a new list; a dictionary is updated in place.
  This procedure might be slow.
  '''
  while True:
    if not isinstance(x, PDFObjRef):
      break
    x = x.resolve()
  if isinstance(x, list):
    return [ resolve_all(item) for item in x ]
  if isinstance(x, dict):
    for (key, value) in x.iteritems():
      x[key] = resolve_all(value)
  return x
 def decipher_all(decipher, objid, genno, x):
  '''
  Apply DECIPHER(objid, genno, string) to every string found in X,
  descending into lists and dictionaries.
  A list is rebuilt as a new list; a dictionary is updated in place;
  an object of any other type is returned untouched.
  '''
  def recurse(item):
    return decipher_all(decipher, objid, genno, item)
  if isinstance(x, str):
    return decipher(objid, genno, x)
  if isinstance(x, list):
    return [ recurse(item) for item in x ]
  if isinstance(x, dict):
    for (key, value) in x.iteritems():
      x[key] = recurse(value)
  return x
 # Type checking
 def int_value(x):
  '''
  Resolve X and make sure that it is an integer.
  Returns the resolved integer. For anything else, raises
  PDFTypeError when STRICT is set and returns 0 otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, int):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Integer required: %r' % (x,))
  return 0
 def float_value(x):
  '''
  Resolve X and make sure that it is a float.
  Returns the resolved float. For anything else (integers included),
  raises PDFTypeError when STRICT is set and returns 0.0 otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, float):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Float required: %r' % (x,))
  return 0.0
 def num_value(x):
  '''
  Resolve X and make sure that it is a number (int or float).
  Returns the resolved number. For anything else, raises
  PDFTypeError when STRICT is set and returns 0 otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, (int, float)):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Int or Float required: %r' % (x,))
  return 0
 def str_value(x):
  '''
  Resolve X and make sure that it is a string.
  Returns the resolved string. For anything else, raises
  PDFTypeError when STRICT is set and returns '' otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, str):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('String required: %r' % (x,))
  return ''
 def list_value(x):
  '''
  Resolve X and make sure that it is a list or a tuple.
  Returns the resolved sequence. For anything else, raises
  PDFTypeError when STRICT is set and returns [] otherwise.
  '''
  resolved = resolve1(x)
  if isinstance(resolved, (list, tuple)):
    return resolved
  if STRICT:
    raise PDFTypeError('List required: %r' % resolved)
  return []
 def dict_value(x):
  '''
  Resolve X and make sure that it is a dictionary.
  Returns the resolved dictionary. For anything else, raises
  PDFTypeError when STRICT is set and returns {} otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, dict):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Dict required: %r' % (x,))
  return {}
 def stream_value(x):
  '''
  Resolve X and make sure that it is a PDFStream.
  Returns the resolved stream. For anything else, raises
  PDFTypeError when STRICT is set and returns an empty
  PDFStream otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, PDFStream):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('PDFStream required: %r' % (x,))
  return PDFStream({}, '')
 ##  PDFStream type
 ##
 class PDFStream(PDFObject):
  '''
  A PDF stream object: a dictionary plus a chunk of raw data.
  The raw data is decoded lazily by the first call to get_data();
  from then on rawdata is None and only the decoded data is kept.
  '''
  def __init__(self, dic, rawdata, decipher=None):
    '''
    dic: the stream dictionary.
    rawdata: the undecoded contents of the stream.
    decipher: optional callable (objid, genno, data) -> data,
              applied to the raw data before any filter.
    '''
    self.dic = dic
    self.rawdata = rawdata
    self.decipher = decipher
    self.data = None    # decoded contents; filled in by decode().
    self.objid = None
    self.genno = None
    return
  def set_objid(self, objid, genno):
    '''Record the object id and generation number handed to the decipher.'''
    self.objid = objid
    self.genno = genno
    return
  def __repr__(self):
    # rawdata is dropped once the stream has been decoded, so
    # describe whichever buffer is currently held.
    if self.rawdata is not None:
      return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
    if self.data is not None:
      return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)
    return '<PDFStream(%r): %r>' % (self.objid, self.dic)
  def decode(self):
    '''
    Turn rawdata into data: decipher it when a decipher was given,
    then apply every entry of /Filter in order.
    Raises PDFEncryptionError for the /Crypt filter,
    PDFNotImplementedError for any other unsupported filter or
    predictor, and PDFValueError when predictor 12 lacks /Columns.
    '''
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
      # Handle encryption
      data = self.decipher(self.objid, self.genno, data)
    if 'Filter' not in self.dic:
      self.data = data
      self.rawdata = None
      return
    filters = self.dic['Filter']
    if not isinstance(filters, list):
      filters = [ filters ]
    for f in filters:
      if f in LITERALS_FLATE_DECODE:
        import zlib
        # will get errors if the document is encrypted.
        data = zlib.decompress(data)
      elif f in LITERALS_LZW_DECODE:
        try:
          from cStringIO import StringIO
        except ImportError:
          from StringIO import StringIO
        data = ''.join(LZWDecoder(StringIO(data)).run())
      elif f in LITERALS_ASCII85_DECODE:
        import ascii85
        data = ascii85.ascii85decode(data)
      elif f == LITERAL_CRYPT:
        raise PDFEncryptionError('/Crypt filter is unsupported')
      else:
        raise PDFNotImplementedError('Unsupported filter: %r' % f)
      # apply predictors
      # NOTE(review): this runs after every filter of the list,
      # not once per stream -- confirm that this is intended.
      params = self.dic.get('DecodeParms', {})
      if 'Predictor' in params:
        pred = int_value(params['Predictor'])
        if pred:
          if pred != 12:
            raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
          if 'Columns' not in params:
            raise PDFValueError('Columns undefined for predictor=12')
          columns = int_value(params['Columns'])
          # Each row is one tag byte followed by COLUMNS data bytes.
          rows = []
          prev = '\x00' * columns
          for i in xrange(0, len(data), columns+1):
            tag = data[i]
            row = data[i+1:i+1+columns]
            if tag == '\x02':
              # Each byte is stored as a delta from the byte above it.
              row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev,row) )
            rows.append(row)
            prev = row
          data = ''.join(rows)
    self.data = data
    self.rawdata = None
    return
  def get_data(self):
    '''Return the decoded contents, decoding them on first use.'''
    if self.data is None:
      self.decode()
    return self.data
  def get_rawdata(self):
    '''Return the undecoded contents, or None once decode() has run.'''
    return self.rawdata
 ##  PDFPage
 ##
 class PDFPage(object):
  '''
  A single page of a PDF document.
  Keeps the page attribute dictionary and exposes some of its
  entries (resources, boxes, rotation, contents...) as attributes.
  '''
  def __init__(self, doc, pageid, attrs):
    '''
    doc: the document that the page belongs to.
    pageid: the object id of the page.
    attrs: the page attribute dictionary, checked with dict_value().
    Raises KeyError when /Resources or /MediaBox is missing.
    '''
    self.doc = doc
    self.pageid = pageid
    self.attrs = dict_value(attrs)
    # Resolved like the other entries, so that an indirect
    # /LastModified does not leak out as a reference object.
    self.lastmod = resolve1(self.attrs.get('LastModified'))
    self.resources = resolve1(self.attrs['Resources'])
    self.mediabox = resolve1(self.attrs['MediaBox'])
    # /CropBox falls back to the media box.
    if 'CropBox' in self.attrs:
      self.cropbox = resolve1(self.attrs['CropBox'])
    else:
      self.cropbox = self.mediabox
    # The following entries are stored as found, without resolving.
    self.rotate = self.attrs.get('Rotate', 0)
    self.annots = self.attrs.get('Annots')
    self.beads = self.attrs.get('B')
    # contents is always a list: a missing /Contents gives [],
    # and a value that is not a list is wrapped in one.
    if 'Contents' in self.attrs:
      contents = resolve1(self.attrs['Contents'])
    else:
      contents = []
    if not isinstance(contents, list):
      contents = [ contents ]
    self.contents = contents
    return
  def __repr__(self):
    return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
 ##  XRefs
 ##
 ##  PDFXRef
 ##
@ -296,7 +44,7 @@ class PDFXRef(object):
    return
  def objids(self):
-    return self.offsets.keys()
+    return self.offsets.iterkeys()
  def load(self, parser):
    while 1:
@ -330,10 +78,11 @@ class PDFXRef(object):
    self.load_trailer(parser)
    return
  KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
  def load_trailer(self, parser):
    try:
      (_,kwd) = parser.nexttoken()
-      assert kwd == KEYWORD_TRAILER
+      assert kwd is self.KEYWORD_TRAILER
      (_,dic) = parser.nextobject()
    except PSEOF:
      x = parser.pop(1)
@ -350,7 +99,7 @@ class PDFXRef(object):
      raise
    if use != 'n':
      if STRICT:
-        raise PDFValueError('Unused objid=%r' % objid)
+        raise PDFSyntaxError('Unused objid=%r' % objid)
    return (None, pos)
@ -367,14 +116,14 @@ class PDFXRefStream(object):
    return
  def objids(self):
-    return range(self.objid0, self.objid1+1)
+    return xrange(self.objid0, self.objid1)
  def load(self, parser):
    (_,objid) = parser.nexttoken() # ignored
    (_,genno) = parser.nexttoken() # ignored
    (_,kwd) = parser.nexttoken()
    (_,stream) = parser.nextobject()
-    if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
+    if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
      raise PDFNoValidXRef('Invalid PDF stream spec.')
    size = stream.dic['Size']
    (start, nobjs) = stream.dic.get('Index', (0,size))
@ -402,6 +151,37 @@ class PDFXRefStream(object):
      return (objid, index)
 ##  PDFPage
 ##
 class PDFPage(object):
  '''
  A single page of a PDF document.
  Holds the page attribute dictionary and publishes some of its
  entries (resources, boxes, rotation, contents...) as attributes.
  '''
  def __init__(self, doc, pageid, attrs):
    '''
    doc: the document that the page belongs to.
    pageid: the object id of the page.
    attrs: the page attribute dictionary, checked with dict_value().
    Raises KeyError when /Resources or /MediaBox is missing.
    '''
    self.doc = doc
    self.pageid = pageid
    self.attrs = dict_value(attrs)
    page_attrs = self.attrs
    self.lastmod = resolve1(page_attrs.get('LastModified'))
    self.resources = resolve1(page_attrs['Resources'])
    self.mediabox = resolve1(page_attrs['MediaBox'])
    # Without a /CropBox the media box is used instead.
    if 'CropBox' not in page_attrs:
      self.cropbox = self.mediabox
    else:
      self.cropbox = resolve1(page_attrs['CropBox'])
    # The following entries are stored as found, without resolving.
    self.rotate = page_attrs.get('Rotate', 0)
    self.annots = page_attrs.get('Annots')
    self.beads = page_attrs.get('B')
    # contents is always a list: a missing /Contents gives [],
    # and a value that is not a list is wrapped in one.
    if 'Contents' not in page_attrs:
      streams = []
    else:
      streams = resolve1(page_attrs['Contents'])
    if not isinstance(streams, list):
      streams = [ streams ]
    self.contents = streams
    return
  def __repr__(self):
    shown = (self.resources, self.mediabox)
    return '<PDFPage: Resources=%r, MediaBox=%r>' % shown
 ##  PDFDocument
 ##
 ##  A PDFDocument object represents a PDF document.
@ -463,15 +243,16 @@ class PDFDocument(object):
  def set_root(self, root):
    self.root = root
    self.catalog = dict_value(self.root)
-    if self.catalog.get('Type') != LITERAL_CATALOG:
+    if self.catalog.get('Type') is not LITERAL_CATALOG:
      if STRICT:
-        raise PDFValueError('Catalog not found!')
+        raise PDFSyntaxError('Catalog not found!')
    return
  # initialize(password='')
  #   Perform the initialization with a given password.
  #   This step is mandatory even if there's no password associated
  #   with the document.
  PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
  def initialize(self, password=''):
    if not self.encryption:
      self.is_printable = self.is_modifiable = self.is_extractable = True
@ -494,7 +275,7 @@ class PDFDocument(object):
    self.is_modifiable = bool(P & 8)
    self.is_extractable = bool(P & 16)
    # Algorithm 3.2
-    password = (password+PASSWORD_PADDING)[:32] # 1
+    password = (password+self.PASSWORD_PADDING)[:32] # 1
    hash = md5.md5(password) # 2
    hash.update(O) # 3
    hash.update(struct.pack('<l', P)) # 4
@ -512,7 +293,7 @@ class PDFDocument(object):
      u1 = Arcfour(key).process(password)
    elif R == 3:
      # Algorithm 3.5
-      hash = md5.md5(PASSWORD_PADDING) # 2
+      hash = md5.md5(self.PASSWORD_PADDING) # 2
      hash.update(docid[0]) # 3
      x = Arcfour(key).process(hash.digest()[:16]) # 4
      for i in xrange(1,19+1):
@ -536,6 +317,7 @@ class PDFDocument(object):
    key = hash.digest()[:min(len(key),16)]
    return Arcfour(key).process(data)
  KEYWORD_OBJ = PSKeywordTable.intern('obj')
  def getobj(self, objid):
    if not self.ready:
      raise PDFException('PDFDocument not initialized')
@ -554,11 +336,11 @@ class PDFDocument(object):
          pass
      else:
        if STRICT:
-          raise PDFValueError('Cannot locate objid=%r' % objid)
+          raise PDFSyntaxError('Cannot locate objid=%r' % objid)
        return None
      if strmid:
        stream = stream_value(self.getobj(strmid))
-        if stream.dic['Type'] != LITERAL_OBJSTM:
+        if stream.dic['Type'] is not LITERAL_OBJSTM:
          if STRICT:
            raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
@ -589,7 +371,7 @@ class PDFDocument(object):
        (_,genno) = self.parser.nexttoken() # genno
        #assert objid1 == objid, (objid, objid1)
        (_,kwd) = self.parser.nexttoken()
-        if kwd != KEYWORD_OBJ:
+        if kwd is not self.KEYWORD_OBJ:
          raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
        (_,obj) = self.parser.nextobject()
        if isinstance(obj, PDFStream):
@ -611,13 +393,13 @@ class PDFDocument(object):
      for (k,v) in parent.iteritems():
        if k in self.INHERITABLE_ATTRS and k not in tree:
          tree[k] = v
-      if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
+      if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
        if 1 <= self.debug:
          print >>stderr, 'Pages: Kids=%r' % tree['Kids']
        for c in tree['Kids']:
          for x in search(c, tree):
            yield x
-      elif tree.get('Type') == LITERAL_PAGE:
+      elif tree.get('Type') is LITERAL_PAGE:
        if 1 <= self.debug:
          print >>stderr, 'Page: %r' % tree
        yield (obj.objid, tree)
@ -683,15 +465,20 @@ class PDFParser(PSStackParser):
  def __repr__(self):
    return '<PDFParser>'
  KEYWORD_R = PSKeywordTable.intern('R')
  KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
  KEYWORD_STREAM = PSKeywordTable.intern('stream')
  KEYWORD_XREF = PSKeywordTable.intern('xref')
  KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
  def do_keyword(self, pos, token):
-    if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
+    if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
      self.add_results(*self.pop(1))
      return
-    if token == KEYWORD_ENDOBJ:
+    if token is self.KEYWORD_ENDOBJ:
      self.add_results(*self.pop(4))
      return
-    if token == KEYWORD_R:
+    if token is self.KEYWORD_R:
      # reference to indirect object
      try:
        ((_,objid), (_,genno)) = self.pop(2)
@ -702,7 +489,7 @@ class PDFParser(PSStackParser):
        pass
      return
-    if token == KEYWORD_STREAM:
+    if token is self.KEYWORD_STREAM:
      # stream object
      ((_,dic),) = self.pop(1)
      dic = dict_value(dic)
@ -710,7 +497,7 @@ class PDFParser(PSStackParser):
        objlen = int_value(dic['Length'])
      except KeyError:
        if STRICT:
-          raise PDFValueError('/Length is undefined: %r' % dic)
+          raise PDFSyntaxError('/Length is undefined: %r' % dic)
        objlen = 0
      self.seek(pos)
      try:
@ -785,7 +572,7 @@ class PDFParser(PSStackParser):
          xref = PDFXRefStream()
          xref.load(self)
        else:
-          if token != KEYWORD_XREF:
+          if token is not self.KEYWORD_XREF:
            raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % 
                                 (pos, token))
          self.nextline()
@ -835,6 +622,7 @@ class PDFParser(PSStackParser):
      yield xref
    return
 ##  PDFObjStrmParser
 ##
 class PDFObjStrmParser(PDFParser):
--- a/pdflib/pdftypes.py
+++ b/pdflib/pdftypes.py
@ -0,0 +1,222 @@
 #!/usr/bin/env python
 import sys, zlib
 stderr = sys.stderr
 from pdflib.lzw import LZWDecoder
 from pdflib.psparser import PSException, PSObject, \
     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
     literal_name, keyword_name, STRICT
 LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
 LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl'))
 LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW'))
 LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85'))
 ##  PDF Objects
 ##
 class PDFObject(PSObject): pass  # common base class of PDFObjRef and PDFStream
 class PDFException(PSException): pass  # root of the PDF-specific exceptions below
 class PDFTypeError(PDFException): pass  # raised by the *_value() type checks
 class PDFValueError(PDFException): pass  # e.g. object id 0, missing /Columns for a predictor
 class PDFNotImplementedError(PSException): pass  # NOTE: derives from PSException, not PDFException
 ##  PDFObjRef
 ##
 class PDFObjRef(PDFObject):
  '''
  An indirect reference to an object held by a document.
  The referenced object is fetched through doc.getobj() on demand.
  '''
  def __init__(self, doc, objid, _):
    # The third argument (the generation number) is accepted
    # for the caller's convenience but is never stored.
    if objid == 0 and STRICT:
      raise PDFValueError('PDF object id cannot be 0.')
    self.doc = doc
    self.objid = objid
  def __repr__(self):
    return '<PDFObjRef:%d>' % self.objid
  def resolve(self):
    '''Look up the referenced object in the owning document.'''
    return self.doc.getobj(self.objid)
 # resolve
 def resolve1(x):
  '''
  Follow indirect references until a direct object is reached.
  Only the outermost object is resolved: a returned array or
  dictionary may still contain indirect objects inside.
  '''
  while True:
    if not isinstance(x, PDFObjRef):
      return x
    x = x.resolve()
 def resolve_all(x):
  '''
  Resolve X and, recursively, everything nested inside it, so that
  no indirect reference is left anywhere in the result.
  A list is rebuilt as a new list; a dictionary is updated in place.
  This procedure might be slow.
  '''
  while True:
    if not isinstance(x, PDFObjRef):
      break
    x = x.resolve()
  if isinstance(x, list):
    return [ resolve_all(item) for item in x ]
  if isinstance(x, dict):
    for (key, value) in x.iteritems():
      x[key] = resolve_all(value)
  return x
 def decipher_all(decipher, objid, genno, x):
  '''
  Apply DECIPHER(objid, genno, string) to every string found in X,
  descending into lists and dictionaries.
  A list is rebuilt as a new list; a dictionary is updated in place;
  an object of any other type is returned untouched.
  '''
  def recurse(item):
    return decipher_all(decipher, objid, genno, item)
  if isinstance(x, str):
    return decipher(objid, genno, x)
  if isinstance(x, list):
    return [ recurse(item) for item in x ]
  if isinstance(x, dict):
    for (key, value) in x.iteritems():
      x[key] = recurse(value)
  return x
 # Type checking
 def int_value(x):
  '''
  Resolve X and make sure that it is an integer.
  Returns the resolved integer. For anything else, raises
  PDFTypeError when STRICT is set and returns 0 otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, int):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Integer required: %r' % (x,))
  return 0
 def float_value(x):
  '''
  Resolve X and make sure that it is a float.
  Returns the resolved float. For anything else (integers included),
  raises PDFTypeError when STRICT is set and returns 0.0 otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, float):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Float required: %r' % (x,))
  return 0.0
 def num_value(x):
  '''
  Resolve X and make sure that it is a number (int or float).
  Returns the resolved number. For anything else, raises
  PDFTypeError when STRICT is set and returns 0 otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, (int, float)):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Int or Float required: %r' % (x,))
  return 0
 def str_value(x):
  '''
  Resolve X and make sure that it is a string.
  Returns the resolved string. For anything else, raises
  PDFTypeError when STRICT is set and returns '' otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, str):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('String required: %r' % (x,))
  return ''
 def list_value(x):
  '''
  Resolve X and make sure that it is a list or a tuple.
  Returns the resolved sequence. For anything else, raises
  PDFTypeError when STRICT is set and returns [] otherwise.
  '''
  resolved = resolve1(x)
  if isinstance(resolved, (list, tuple)):
    return resolved
  if STRICT:
    raise PDFTypeError('List required: %r' % resolved)
  return []
 def dict_value(x):
  '''
  Resolve X and make sure that it is a dictionary.
  Returns the resolved dictionary. For anything else, raises
  PDFTypeError when STRICT is set and returns {} otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, dict):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('Dict required: %r' % (x,))
  return {}
 def stream_value(x):
  '''
  Resolve X and make sure that it is a PDFStream.
  Returns the resolved stream. For anything else, raises
  PDFTypeError when STRICT is set and returns an empty
  PDFStream otherwise.
  '''
  x = resolve1(x)
  if isinstance(x, PDFStream):
    return x
  if STRICT:
    # Wrap the value in a 1-tuple: a bare tuple on the right of %
    # would be unpacked as the list of formatting arguments.
    raise PDFTypeError('PDFStream required: %r' % (x,))
  return PDFStream({}, '')
 ##  PDFStream type
 ##
 class PDFStream(PDFObject):
  '''
  A PDF stream object: a dictionary plus a chunk of raw data.
  The raw data is decoded lazily by the first call to get_data();
  from then on rawdata is None and only the decoded data is kept.
  '''
  def __init__(self, dic, rawdata, decipher=None):
    '''
    dic: the stream dictionary.
    rawdata: the undecoded contents of the stream.
    decipher: optional callable (objid, genno, data) -> data,
              applied to the raw data before any filter.
    '''
    self.dic = dic
    self.rawdata = rawdata
    self.decipher = decipher
    self.data = None    # decoded contents; filled in by decode().
    self.objid = None
    self.genno = None
    return
  def set_objid(self, objid, genno):
    '''Record the object id and generation number handed to the decipher.'''
    self.objid = objid
    self.genno = genno
    return
  def __repr__(self):
    # rawdata is dropped once the stream has been decoded, so
    # describe whichever buffer is currently held.
    if self.rawdata is not None:
      return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
    if self.data is not None:
      return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)
    return '<PDFStream(%r): %r>' % (self.objid, self.dic)
  def decode(self):
    '''
    Turn rawdata into data: decipher it when a decipher was given,
    then apply every entry of /Filter in order.
    Raises PDFNotImplementedError for an unsupported filter
    (/Crypt included) or predictor, and PDFValueError when
    predictor 12 lacks /Columns.
    '''
    assert self.data is None and self.rawdata is not None
    data = self.rawdata
    if self.decipher:
      # Handle encryption
      data = self.decipher(self.objid, self.genno, data)
    if 'Filter' not in self.dic:
      self.data = data
      self.rawdata = None
      return
    filters = self.dic['Filter']
    if not isinstance(filters, list):
      filters = [ filters ]
    for f in filters:
      if f in LITERALS_FLATE_DECODE:
        # will get errors if the document is encrypted.
        data = zlib.decompress(data)
      elif f in LITERALS_LZW_DECODE:
        try:
          from cStringIO import StringIO
        except ImportError:
          from StringIO import StringIO
        data = ''.join(LZWDecoder(StringIO(data)).run())
      elif f in LITERALS_ASCII85_DECODE:
        import ascii85
        data = ascii85.ascii85decode(data)
      elif f == LITERAL_CRYPT:
        # PDFEncryptionError is not available in this module;
        # report the filter as unsupported instead.
        raise PDFNotImplementedError('/Crypt filter is unsupported')
      else:
        raise PDFNotImplementedError('Unsupported filter: %r' % f)
      # apply predictors
      # NOTE(review): this runs after every filter of the list,
      # not once per stream -- confirm that this is intended.
      params = self.dic.get('DecodeParms', {})
      if 'Predictor' in params:
        pred = int_value(params['Predictor'])
        if pred:
          if pred != 12:
            raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
          if 'Columns' not in params:
            raise PDFValueError('Columns undefined for predictor=12')
          columns = int_value(params['Columns'])
          # Each row is one tag byte followed by COLUMNS data bytes.
          rows = []
          prev = '\x00' * columns
          for i in xrange(0, len(data), columns+1):
            tag = data[i]
            row = data[i+1:i+1+columns]
            if tag == '\x02':
              # Each byte is stored as a delta from the byte above it.
              row = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(prev,row) )
            rows.append(row)
            prev = row
          data = ''.join(rows)
    self.data = data
    self.rawdata = None
    return
  def get_data(self):
    '''Return the decoded contents, decoding them on first use.'''
    if self.data is None:
      self.decode()
    return self.data
  def get_rawdata(self):
    '''Return the undecoded contents, or None once decode() has run.'''
    return self.rawdata
--- a/pdflib/psparser.py
+++ b/pdflib/psparser.py
@ -1,7 +1,8 @@
 #!/usr/bin/env python
 import sys, re
 stderr = sys.stderr
-from utils import choplist
+
 from pdflib.utils import choplist
 STRICT = 0
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@ -4,6 +4,8 @@ from struct import unpack
 ##  Matrix operations
 ##
 MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
 def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
  '''Multiplies two matrices.'''
  return (a0*a1+c0*b1,    b0*a1+d0*b1,