tmp

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-01-10 09:14:46 +00:00 · 2009-01-10 09:14:46 +00:00 · c41c279321
parent 24bdd33557
commit c41c279321
9 changed files with 740 additions and 700 deletions
--- a/pdflib/page.py
+++ b/pdflib/page.py
@ -2,10 +2,53 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined
+from pdflib.pdffont import PDFUnicodeNotDefined
 from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix


+##  PDFDevice
+##
+class PDFDevice(object):  # base class for output devices; subclasses override the hooks below
+  # Every hook here is a no-op except render_string()/render_image(), which raise.
+  debug = 0
+  
+  def __init__(self, rsrc):
+    self.rsrc = rsrc  # NOTE(review): presumably the shared resource manager -- confirm with callers
+    self.ctm = None  # stays None until set_ctm() is called
+    return
+  
+  def __repr__(self):
+    return '<PDFDevice>'
+
+  def close(self):
+    return
+
+  def set_ctm(self, ctm):
+    self.ctm = ctm  # just records the value; nothing is recomputed here
+    return
+
+  def begin_tag(self, tag, props=None):
+    return
+  def end_tag(self):
+    return
+  def do_tag(self, tag, props=None):
+    return
+
+  def begin_page(self, page):
+    return
+  def end_page(self, page):
+    return
+  def begin_figure(self, name, bbox):
+    return
+  def end_figure(self, name):
+    return
+  
+  def render_string(self, textstate, textmatrix, seq):
+    raise NotImplementedError  # no default implementation; subclasses must provide one
+  def render_image(self, stream, size, matrix):
+    raise NotImplementedError  # no default implementation; subclasses must provide one
+
+
 ##  PageItem
 ##
 class PageItem(object):
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -3,10 +3,10 @@ import sys
 stdout = sys.stdout
 stderr = sys.stderr
 from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
-     PDFPageInterpreter, PDFUnicodeNotDefined
+from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdflib.pdffont import PDFUnicodeNotDefined
 from pdflib.cmap import CMapDB
-from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator
+from pdflib.page import PDFDevice, PageItem, FigureItem, TextItem, PageAggregator


 def enc(x, codec):
--- a/pdflib/pdfcolor.py
+++ b/pdflib/pdfcolor.py
@ -0,0 +1,35 @@
+#!/usr/bin/env python
+import sys
+stderr = sys.stderr
+from pdflib.psparser import PSLiteralTable
+
+
+##  ColorSpace
+##
+LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
+LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
+LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
+
+class ColorSpace(object):
+  # Plain record pairing a color space name with its number of components.
+  def __init__(self, name, ncomponents):
+    self.name = name
+    self.ncomponents = ncomponents  # formatted with %d in __repr__, so expected to be an integer
+    return
+  
+  def __repr__(self):
+    return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)
+
+
+PREDEFINED_COLORSPACE = dict(  # color space name -> ColorSpace(name, ncomponents)
+  (name, ColorSpace(name,n)) for (name,n) in {
+  'CalRGB': 3,
+  'CalGray': 1,
+  'Lab': 3,
+  'DeviceRGB': 3,
+  'DeviceCMYK': 4,
+  'DeviceGray': 1,
+  'Separation': 1,
+  'Indexed': 1,
+  'Pattern': 1,
+  }.iteritems())
--- a/pdflib/pdffont.py
+++ b/pdflib/pdffont.py
@ -0,0 +1,341 @@
+#!/usr/bin/env python
+import sys
+stderr = sys.stderr
+from struct import pack, unpack
+try:
+  from cStringIO import StringIO
+except ImportError:
+  from StringIO import StringIO
+from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
+     literal_name, keyword_name, STRICT
+from pdflib.pdftypes import PDFException, \
+     resolve1, int_value, float_value, num_value, \
+     str_value, list_value, dict_value, stream_value
+from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+
+
+##  Fonts
+##
+
+class PDFFontError(PDFException): pass
+class PDFUnicodeNotDefined(PDFFontError): pass
+
+LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
+
+
+# PDFFont
+class PDFFont(object):
+  # Common base of the font classes: keeps descriptor metrics and per-code widths.
+  def __init__(self, descriptor, widths, default_width=None, font_matrix=None):
+    self.descriptor = descriptor
+    self.widths = widths  # mapping queried with .get(cid, ...) in char_width()
+    self.fontname = descriptor.get('FontName', 'unknown')
+    if isinstance(self.fontname, PSLiteral):
+      self.fontname = literal_name(self.fontname)
+    self.ascent = num_value(descriptor.get('Ascent', 0))
+    self.descent = num_value(descriptor.get('Descent', 0))
+    self.default_width = default_width or descriptor.get('MissingWidth', 0)  # any falsy default_width (None or 0) falls back to MissingWidth
+    self.leading = num_value(descriptor.get('Leading', 0))
+    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
+    self.font_matrix = font_matrix or (.001,0,0,.001,0,0)  # default matrix scales both axes by 1/1000
+    return
+
+  def __repr__(self):
+    return '<PDFFont>'
+
+  def is_vertical(self):
+    return False
+  
+  def is_multibyte(self):
+    return False
+  
+  def decode(self, bytes):
+    return map(ord, bytes)  # one character code per input byte
+
+  def char_width(self, cid):
+    return self.widths.get(cid, self.default_width)
+
+  def char_disp(self, cid):
+    return 0
+  
+  def string_width(self, s):
+    return sum( self.char_width(cid) for cid in self.decode(s) )  # widths of all decoded codes, summed
+
+# PDFSimpleFont
+class PDFSimpleFont(PDFFont):
+  # Font whose codes are turned into text via an encoding table or a ToUnicode CMap.
+  def __init__(self, descriptor, widths, spec, font_matrix=None):
+    # Font encoding is specified either by a name of
+    # built-in encoding or a dictionary that describes
+    # the differences.
+    if 'Encoding' in spec:
+      encoding = resolve1(spec['Encoding'])
+    else:
+      encoding = LITERAL_STANDARD_ENCODING
+    if isinstance(encoding, dict):
+      name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
+      diff = list_value(encoding.get('Differences', None))
+      self.encoding = EncodingDB.get_encoding(name, diff)
+    else:
+      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
+    self.ucs2_cmap = None  # replaced only when the font dictionary has a ToUnicode entry
+    if 'ToUnicode' in spec:
+      strm = stream_value(spec['ToUnicode'])
+      self.ucs2_cmap = CMap()
+      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
+    PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix)
+    return
+
+  def to_unicode(self, cid):
+    if not self.ucs2_cmap:  # ucs2_cmap is falsy: fall back to the encoding table
+      try:
+        return self.encoding[cid]
+      except KeyError:
+        raise PDFUnicodeNotDefined(None, cid)
+    code = self.ucs2_cmap.tocode(cid)
+    if not code:
+      raise PDFUnicodeNotDefined(None, cid)
+    chars = unpack('>%dH' % (len(code)/2), code)  # code is split into big-endian 16-bit units
+    return ''.join( unichr(c) for c in chars )
+
+
+# PDFType1Font
+class PDFType1Font(PDFSimpleFont):
+  # Simple font that takes its metrics from FontMetricsDB when the base font is found there.
+  def __init__(self, spec):
+    try:
+      self.basefont = literal_name(spec['BaseFont'])
+    except KeyError:
+      if STRICT:
+        raise PDFFontError('BaseFont is missing')
+      self.basefont = 'unknown'
+    try:
+      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
+    except KeyError:  # not in FontMetricsDB: build the metrics from the font dictionary
+      descriptor = dict_value(spec.get('FontDescriptor', {}))
+      firstchar = int_value(spec.get('FirstChar', 0))
+      lastchar = int_value(spec.get('LastChar', 255))  # read but not used below
+      widths = list_value(spec.get('Widths', [0]*256))
+      widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )  # key = character code
+    PDFSimpleFont.__init__(self, descriptor, widths, spec)
+    return
+
+  def __repr__(self):
+    return '<PDFType1Font: basefont=%r>' % self.basefont
+
+# PDFTrueTypeFont
+class PDFTrueTypeFont(PDFType1Font):
+  # Inherits everything from PDFType1Font; only the repr differs.
+  def __repr__(self):
+    return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
+
+# PDFType3Font
+class PDFType3Font(PDFSimpleFont):
+  # Simple font that passes the font dictionary's own FontMatrix to the base class.
+  def __init__(self, spec):
+    firstchar = int_value(spec.get('FirstChar', 0))
+    lastchar = int_value(spec.get('LastChar', 0))  # read but not used below
+    widths = list_value(spec.get('Widths', [0]*256))
+    widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))  # key = character code
+    if 'FontDescriptor' in spec:
+      descriptor = dict_value(spec['FontDescriptor'])
+    else:  # no descriptor: synthesize one from the font dictionary itself
+      descriptor = {'FontName':spec.get('Name'),
+                    'Ascent':0, 'Descent':0,
+                    'FontBBox':spec['FontBBox']}
+    PDFSimpleFont.__init__(self, descriptor, widths, spec,
+                           font_matrix=tuple(list_value(spec.get('FontMatrix'))))
+    return
+
+  def __repr__(self):
+    return '<PDFType3Font>'
+
+
+# PDFCIDFont
+
+##  TrueTypeFont
+##
+class TrueTypeFont(object):  # reads the table directory and the 'cmap' table of a TrueType font
+
+  class CMapNotFound(Exception): pass  # raised by create_cmap() when the font has no 'cmap' table
+  
+  def __init__(self, name, fp):  # fp: seekable binary file object positioned at the start of the font
+    self.name = name
+    self.fp = fp  # kept so create_cmap() can seek and read later
+    self.tables = {}  # table tag -> (offset, length)
+    fonttype = fp.read(4)  # leading 4-byte version tag; read only to advance fp
+    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+    for i in xrange(ntables):
+      (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))  # 'tag' avoids shadowing the name argument
+      self.tables[tag] = (offset, length)
+    return
+
+  def create_cmap(self):  # returns whatever CMap(...).update(char2gid, gid2char) returns
+    if 'cmap' not in self.tables:
+      raise TrueTypeFont.CMapNotFound
+    (base_offset, length) = self.tables['cmap']
+    fp = self.fp
+    fp.seek(base_offset)
+    (version, nsubtables) = unpack('>HH', fp.read(4))
+    subtables = []
+    for i in xrange(nsubtables):
+      subtables.append(unpack('>HHL', fp.read(8)))
+    char2gid = {}  # character code -> glyph id, merged over all supported subtables
+    # Only supports subtable type 0, 2 and 4.
+    for (_1, _2, st_offset) in subtables:
+      fp.seek(base_offset+st_offset)
+      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
+      if fmttype == 0:
+        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
+      elif fmttype == 2:
+        subheaderkeys = unpack('>256H', fp.read(512))
+        firstbytes = [0]*8192
+        for (i,k) in enumerate(subheaderkeys):
+          firstbytes[k/8] = i  # subheader index (key/8) -> first byte that selects it
+        nhdrs = max(subheaderkeys)/8 + 1
+        hdrs = []
+        for i in xrange(nhdrs):
+          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
+          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))  # offset is relative to its own field
+        for (i,firstcode,entcount,delta,pos) in hdrs:
+          if not entcount: continue
+          first = firstcode + (firstbytes[i] << 8)
+          fp.seek(pos)
+          for c in xrange(entcount):
+            gid = unpack('>H', fp.read(2))[0]  # unpack() returns a 1-tuple; take the integer
+            if gid:
+              gid = (gid + delta) & 0xffff  # delta is applied modulo 65536, as in the type 4 branch
+            char2gid[first+c] = gid
+      elif fmttype == 4:
+        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+        segcount /= 2  # the field stores segCount*2
+        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        fp.read(2)  # skip the 2-byte pad between the end-code and start-code arrays
+        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
+        pos = fp.tell()  # start of the idRangeOffset array
+        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        for (seg,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
+          if idr:
+            fp.seek(pos+2*seg+idr)  # idRangeOffset counts from its own slot, not from the array start
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
+          else:
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (c + idd) & 0xffff
+    gid2char = dict( (gid, pack('>H', char))
+                     for (char,gid) in char2gid.iteritems() )
+    cmapname = 'Adobe-Identity-UCS-%s' % self.name
+    return CMap(cmapname).update(char2gid, gid2char)
+
+class PDFCIDFont(PDFFont):
+  # Multi-byte font: codes are decoded through a CMap, widths come from W/DW or W2/DW2.
+  def __init__(self, spec):
+    try:
+      self.basefont = literal_name(spec['BaseFont'])
+    except KeyError:
+      if STRICT:
+        raise PDFFontError('BaseFont is missing')
+      self.basefont = 'unknown'
+    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
+    self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
+                                self.cidsysteminfo.get('Ordering', 'unknown'))
+    try:
+      name = literal_name(spec['Encoding'])
+    except KeyError:
+      if STRICT:
+        raise PDFFontError('Encoding is unspecified')
+      name = 'unknown'
+    try:
+      self.cmap = CMapDB.get_cmap(name, strict=STRICT)
+    except CMapDB.CMapNotFound, e:
+      raise PDFFontError(e)
+    try:
+      descriptor = dict_value(spec['FontDescriptor'])
+    except KeyError:
+      if STRICT:
+        raise PDFFontError('FontDescriptor is missing')
+      descriptor = {}
+    ttf = None  # set only when the descriptor embeds a FontFile2 stream
+    if 'FontFile2' in descriptor:
+      self.fontfile = stream_value(descriptor.get('FontFile2'))
+      ttf = TrueTypeFont(self.basefont,
+                         StringIO(self.fontfile.get_data()))
+    self.ucs2_cmap = None
+    if 'ToUnicode' in spec:  # an explicit ToUnicode stream takes precedence over the other sources
+      strm = stream_value(spec['ToUnicode'])
+      self.ucs2_cmap = CMap()
+      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
+    elif self.cidcoding == 'Adobe-Identity':
+      if ttf:
+        try:
+          self.ucs2_cmap = ttf.create_cmap()
+        except TrueTypeFont.CMapNotFound:
+          pass  # deliberate: ucs2_cmap stays None and to_unicode() will raise
+    else:
+      try:
+        self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
+                                         strict=STRICT)
+      except CMapDB.CMapNotFound, e:
+        raise PDFFontError(e)
+    # get_width() expands a width array: "c [w1 w2 ...]" or "c_first c_last w" groups.
+    def get_width(seq):
+      dic = {}
+      char1 = char2 = None
+      for v in seq:
+        if char1 == None:
+          char1 = v
+        elif char2 == None and isinstance(v, int):
+          char2 = v
+        else:
+          if char2 == None:  # list form: consecutive codes starting at char1
+            for (i,w) in enumerate(v):
+              dic[char1+i] = w
+          else:  # range form: the same value for every code in char1..char2
+            for i in xrange(char1, char2+1):
+              dic[i] = v
+          char1 = char2 = None
+      return dic
+    self.vertical = self.cmap.is_vertical()
+    if self.vertical:
+      # writing mode: vertical
+      dic = get_width(list_value(spec.get('W2', [])))
+      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )  # each W2 value is unpacked as a (d, w) pair
+      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
+      (d,w) = spec.get('DW2', [880, -1000])
+      default_width = w
+      self.default_disp = d
+    else:
+      # writing mode: horizontal
+      widths = get_width(list_value(spec.get('W', [])))
+      self.disps = {}
+      default_width = spec.get('DW', 1000)
+      self.default_disp = 0
+    PDFFont.__init__(self, descriptor, widths, default_width=default_width)
+    return
+
+  def __repr__(self):
+    return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
+  
+  def is_vertical(self):
+    return self.vertical
+
+  def is_multibyte(self):
+    return True
+  
+  def decode(self, bytes):
+    return self.cmap.decode(bytes)
+
+  def char_disp(self, cid):
+    return self.disps.get(cid, self.default_disp)
+
+  def to_unicode(self, cid):
+    if not self.ucs2_cmap:
+      raise PDFUnicodeNotDefined(self.cidcoding, cid)
+    code = self.ucs2_cmap.tocode(cid)
+    if not code:
+      raise PDFUnicodeNotDefined(self.cidcoding, cid)
+    chars = unpack('>%dH' % (len(code)/2), code)  # code is split into big-endian 16-bit units
+    return ''.join( unichr(c) for c in chars )
+
+
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -6,33 +6,22 @@ try:
  from cStringIO import StringIO
 except ImportError:
  from StringIO import StringIO
-from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
-     PSStackParser, PSLiteral, PSKeyword, STRICT, \
-     PSLiteralTable, PSKeywordTable, literal_name, keyword_name
-from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \
-     int_value, float_value, num_value, \
+from pdflib.psparser import PSException, PSTypeError, PSEOF, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
+     PSStackParser, PSKeyword, STRICT
+from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
+     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
-from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
-from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix
+from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY
+from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
+from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
+     LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK


 ##  Exceptions
 ##
 class PDFResourceError(PDFException): pass
 class PDFInterpreterError(PDFException): pass
-class PDFFontError(PDFException): pass
-class PDFUnicodeNotDefined(PDFFontError): pass
-
-
-##  ColorSpace
-##
-class ColorSpace(object):
-  def __init__(self, name, ncomponents):
-    self.name = name
-    self.ncomponents = ncomponents
-    return
-  def __repr__(self):
-    return '<ColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)


 ##  Constants
@ -42,344 +31,6 @@ LITERAL_TEXT = PSLiteralTable.intern('Text')
 LITERAL_FONT = PSLiteralTable.intern('Font')
 LITERAL_FORM = PSLiteralTable.intern('Form')
 LITERAL_IMAGE = PSLiteralTable.intern('Image')
-LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
-LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
-LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
-LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK')
-KEYWORD_BI = PSKeywordTable.intern('BI')
-KEYWORD_ID = PSKeywordTable.intern('ID')
-KEYWORD_EI = PSKeywordTable.intern('EI')
-MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
-
-PREDEFINED_COLORSPACE = dict(
-  (name, ColorSpace(name,n)) for (name,n) in {
-  'CalRGB': 3,
-  'CalGray': 1,
-  'Lab': 3,
-  'DeviceRGB': 3,
-  'DeviceCMYK': 4,
-  'DeviceGray': 1,
-  'Separation': 1,
-  'Indexed': 1,
-  'Pattern': 1,
-  }.iteritems())
-
-
-##  Fonts
-##
-
-# PDFFont
-class PDFFont(object):
-  
-  def __init__(self, descriptor, widths, default_width=None):
-    self.descriptor = descriptor
-    self.widths = widths
-    self.fontname = descriptor.get('FontName', 'unknown')
-    if isinstance(self.fontname, PSLiteral):
-      self.fontname = literal_name(self.fontname)
-    self.ascent = num_value(descriptor.get('Ascent', 0))
-    self.descent = num_value(descriptor.get('Descent', 0))
-    self.default_width = default_width or descriptor.get('MissingWidth', 0)
-    self.leading = num_value(descriptor.get('Leading', 0))
-    self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0)))
-    return
-
-  def __repr__(self):
-    return '<PDFFont>'
-
-  def is_vertical(self):
-    return False
-  
-  def is_multibyte(self):
-    return False
-  
-  def decode(self, bytes):
-    return map(ord, bytes)
-
-  def char_width(self, cid):
-    return self.widths.get(cid, self.default_width)
-
-  def char_disp(self, cid):
-    return 0
-  
-  def string_width(self, s):
-    return sum( self.char_width(cid) for cid in self.decode(s) )
-  
-
-# PDFSimpleFont
-class PDFSimpleFont(PDFFont):
-  
-  def __init__(self, descriptor, widths, spec):
-    # Font encoding is specified either by a name of
-    # built-in encoding or a dictionary that describes
-    # the differences.
-    if 'Encoding' in spec:
-      encoding = resolve1(spec['Encoding'])
-    else:
-      encoding = LITERAL_STANDARD_ENCODING
-    if isinstance(encoding, dict):
-      name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING))
-      diff = list_value(encoding.get('Differences', None))
-      self.encoding = EncodingDB.get_encoding(name, diff)
-    else:
-      self.encoding = EncodingDB.get_encoding(literal_name(encoding))
-    self.ucs2_cmap = None
-    if 'ToUnicode' in spec:
-      strm = stream_value(spec['ToUnicode'])
-      self.ucs2_cmap = CMap()
-      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
-    PDFFont.__init__(self, descriptor, widths)
-    return
-
-  def to_unicode(self, cid):
-    if not self.ucs2_cmap:
-      try:
-        return self.encoding[cid]
-      except KeyError:
-        raise PDFUnicodeNotDefined(None, cid)
-    code = self.ucs2_cmap.tocode(cid)
-    if not code:
-      raise PDFUnicodeNotDefined(None, cid)
-    chars = unpack('>%dH' % (len(code)/2), code)
-    return ''.join( unichr(c) for c in chars )
-
-
-# PDFType1Font
-class PDFType1Font(PDFSimpleFont):
-  
-  def __init__(self, spec):
-    try:
-      self.basefont = literal_name(spec['BaseFont'])
-    except KeyError:
-      if STRICT:
-        raise PDFFontError('BaseFont is missing')
-      self.basefont = 'unknown'
-    try:
-      (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont)
-    except KeyError:
-      descriptor = dict_value(spec.get('FontDescriptor', {}))
-      firstchar = int_value(spec.get('FirstChar', 0))
-      lastchar = int_value(spec.get('LastChar', 255))
-      widths = list_value(spec.get('Widths', [0]*256))
-      widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) )
-    PDFSimpleFont.__init__(self, descriptor, widths, spec)
-    return
-
-  def __repr__(self):
-    return '<PDFType1Font: basefont=%r>' % self.basefont
-
-# PDFTrueTypeFont
-class PDFTrueTypeFont(PDFType1Font):
-
-  def __repr__(self):
-    return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
-
-# PDFType3Font
-class PDFType3Font(PDFSimpleFont):
-  def __init__(self, spec):
-    firstchar = int_value(spec.get('FirstChar', 0))
-    lastchar = int_value(spec.get('LastChar', 0))
-    widths = list_value(spec.get('Widths', [0]*256))
-    widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths))
-    if 'FontDescriptor' in spec:
-      descriptor = dict_value(spec['FontDescriptor'])
-    else:
-      descriptor = {'FontName':spec.get('Name'),
-                    'Ascent':0, 'Descent':0,
-                    'FontBBox':spec['FontBBox']}
-    PDFSimpleFont.__init__(self, descriptor, widths, spec)
-    return
-
-  def __repr__(self):
-    return '<PDFType3Font>'
-
-
-# PDFCIDFont
-
-##  TrueTypeFont
-##
-class TrueTypeFont(object):
-
-  class CMapNotFound(Exception): pass
-  
-  def __init__(self, name, fp):
-    self.name = name
-    self.fp = fp
-    self.tables = {}
-    fonttype = fp.read(4)
-    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
-    for i in xrange(ntables):
-      (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
-      self.tables[name] = (offset, length)
-    return
-
-  def create_cmap(self):
-    if 'cmap' not in self.tables:
-      raise TrueTypeFont.CMapNotFound
-    (base_offset, length) = self.tables['cmap']
-    fp = self.fp
-    fp.seek(base_offset)
-    (version, nsubtables) = unpack('>HH', fp.read(4))
-    subtables = []
-    for i in xrange(nsubtables):
-      subtables.append(unpack('>HHL', fp.read(8)))
-    char2gid = {}
-    # Only supports subtable type 0, 2 and 4.
-    for (_1, _2, st_offset) in subtables:
-      fp.seek(base_offset+st_offset)
-      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
-      if fmttype == 0:
-        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
-      elif fmttype == 2:
-        subheaderkeys = unpack('>256H', fp.read(512))
-        firstbytes = [0]*8192
-        for (i,k) in enumerate(subheaderkeys):
-          firstbytes[k/8] = i
-        nhdrs = max(subheaderkeys)/8 + 1
-        hdrs = []
-        for i in xrange(nhdrs):
-          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
-          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
-        for (i,firstcode,entcount,delta,pos) in hdrs:
-          if not entcount: continue
-          first = firstcode + (firstbytes[i] << 8)
-          fp.seek(pos)
-          for c in xrange(entcount):
-            gid = unpack('>H', fp.read(2))
-            if gid:
-              gid += delta
-            char2gid[first+c] = gid
-      elif fmttype == 4:
-        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
-        segcount /= 2
-        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        fp.read(2)
-        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
-        pos = fp.tell()
-        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
-          if idr:
-            fp.seek(pos+idr)
-            for c in xrange(sc, ec+1):
-              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
-          else:
-            for c in xrange(sc, ec+1):
-              char2gid[c] = (c + idd) & 0xffff
-    gid2char = dict( (gid, pack('>H', char))
-                     for (char,gid) in char2gid.iteritems() )
-    cmapname = 'Adobe-Identity-UCS-%s' % self.name
-    return CMap(cmapname).update(char2gid, gid2char)
-
-class PDFCIDFont(PDFFont):
-  
-  def __init__(self, spec):
-    try:
-      self.basefont = literal_name(spec['BaseFont'])
-    except KeyError:
-      if STRICT:
-        raise PDFFontError('BaseFont is missing')
-      self.basefont = 'unknown'
-    self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
-    self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
-                                self.cidsysteminfo.get('Ordering', 'unknown'))
-    try:
-      name = literal_name(spec['Encoding'])
-    except KeyError:
-      if STRICT:
-        raise PDFFontError('Encoding is unspecified')
-      name = 'unknown'
-    try:
-      self.cmap = CMapDB.get_cmap(name, strict=STRICT)
-    except CMapDB.CMapNotFound, e:
-      raise PDFFontError(e)
-    try:
-      descriptor = dict_value(spec['FontDescriptor'])
-    except KeyError:
-      if STRICT:
-        raise PDFFontError('FontDescriptor is missing')
-      descriptor = {}
-    ttf = None
-    if 'FontFile2' in descriptor:
-      self.fontfile = stream_value(descriptor.get('FontFile2'))
-      ttf = TrueTypeFont(self.basefont,
-                         StringIO(self.fontfile.get_data()))
-    self.ucs2_cmap = None
-    if 'ToUnicode' in spec:
-      strm = stream_value(spec['ToUnicode'])
-      self.ucs2_cmap = CMap()
-      CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run()
-    elif self.cidcoding == 'Adobe-Identity':
-      if ttf:
-        try:
-          self.ucs2_cmap = ttf.create_cmap()
-        except TrueTypeFont.CMapNotFound:
-          pass
-    else:
-      try:
-        self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding,
-                                         strict=STRICT)
-      except CMapDB.CMapNotFound, e:
-        raise PDFFontError(e)
-    
-    def get_width(seq):
-      dic = {}
-      char1 = char2 = None
-      for v in seq:
-        if char1 == None:
-          char1 = v
-        elif char2 == None and isinstance(v, int):
-          char2 = v
-        else:
-          if char2 == None:
-            for (i,w) in enumerate(v):
-              dic[char1+i] = w
-          else:
-            for i in xrange(char1, char2+1):
-              dic[i] = v
-          char1 = char2 = None
-      return dic
-    self.vertical = self.cmap.is_vertical()
-    if self.vertical:
-      # writing mode: vertical
-      dic = get_width(list_value(spec.get('W2', [])))
-      widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() )
-      self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() )
-      (d,w) = spec.get('DW2', [880, -1000])
-      default_width = w
-      self.default_disp = d
-    else:
-      # writing mode: horizontal
-      widths = get_width(list_value(spec.get('W', [])))
-      self.disps = {}
-      default_width = spec.get('DW', 1000)
-      self.default_disp = 0
-    PDFFont.__init__(self, descriptor, widths, default_width)
-    return
-
-  def __repr__(self):
-    return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
-  
-  def is_vertical(self):
-    return self.vertical
-
-  def is_multibyte(self):
-    return True
-  
-  def decode(self, bytes):
-    return self.cmap.decode(bytes)
-
-  def char_disp(self, cid):
-    return self.disps.get(cid, self.default_disp)
-
-  def to_unicode(self, cid):
-    if not self.ucs2_cmap:
-      raise PDFUnicodeNotDefined(self.cidcoding, cid)
-    code = self.ucs2_cmap.tocode(cid)
-    if not code:
-      raise PDFUnicodeNotDefined(self.cidcoding, cid)
-    chars = unpack('>%dH' % (len(code)/2), code)
-    return ''.join( unichr(c) for c in chars )


 ##  Resource Manager
@ -388,7 +39,7 @@ class PDFResourceManager(object):

  '''
  ResourceManager facilitates reuse of shared resources
-  such as fonts, images and cmaps so that large objects are not
+  such as fonts and images so that large objects are not
  allocated multiple times.
  '''
  debug = 0
@ -399,24 +50,21 @@ class PDFResourceManager(object):

  def get_procset(self, procs):
    for proc in procs:
-      if proc == LITERAL_PDF:
+      if proc is LITERAL_PDF:
        pass
-      elif proc == LITERAL_TEXT:
+      elif proc is LITERAL_TEXT:
        pass
      else:
        #raise PDFResourceError('ProcSet %r is not supported.' % proc)
        pass
    return
  
-  def get_cmap(self, name):
-    return CMapDB.get_cmap(name, strict=STRICT)
-
  def get_font(self, objid, spec):
    if objid and objid in self.fonts:
      font = self.fonts[objid]
    else:
      if STRICT:
-        if spec['Type'] != LITERAL_FONT:
+        if spec['Type'] is not LITERAL_FONT:
          raise PDFFontError('Type is not /Font')
      # Create a Font object.
      if 'Subtype' in spec:
@ -455,49 +103,6 @@ class PDFResourceManager(object):
    return font


-##  PDFDevice
-##
-class PDFDevice(object):
-
-  debug = 0
-  
-  def __init__(self, rsrc):
-    self.rsrc = rsrc
-    self.ctm = None
-    return
-  
-  def __repr__(self):
-    return '<PDFDevice>'
-
-  def close(self):
-    return
-
-  def set_ctm(self, ctm):
-    self.ctm = ctm
-    return
-
-  def begin_tag(self, tag, props=None):
-    return
-  def end_tag(self):
-    return
-  def do_tag(self, tag, props=None):
-    return
-
-  def begin_page(self, page):
-    return
-  def end_page(self, page):
-    return
-  def begin_figure(self, name, bbox):
-    return
-  def end_figure(self, name):
-    return
-  
-  def render_string(self, textstate, textmatrix, seq):
-    raise NotImplementedError
-  def render_image(self, stream, size, matrix):
-    raise NotImplementedError
-
-
 ##  PDFContentParser
 ##
 class PDFContentParser(PSStackParser):
@ -565,11 +170,14 @@ class PDFContentParser(PSStackParser):
    self.add_results(*self.popall())
    return

+  KEYWORD_BI = PSKeywordTable.intern('BI')
+  KEYWORD_ID = PSKeywordTable.intern('ID')
+  KEYWORD_EI = PSKeywordTable.intern('EI')
  def do_keyword(self, pos, token):
-    if token == KEYWORD_BI:
+    if token is self.KEYWORD_BI:
      # inline image within a content stream
      self.start_type(pos, 'inline')
-    elif token == KEYWORD_ID:
+    elif token is self.KEYWORD_ID:
      try:
        (_, objs) = self.end_type('inline')
        if len(objs) % 2 != 0:
@ -578,7 +186,7 @@ class PDFContentParser(PSStackParser):
        (pos, data) = self.get_inline_data(pos+len('ID '))
        obj = PDFStream(d, data)
        self.push((pos, obj))
-        self.push((pos, KEYWORD_EI))
+        self.push((pos, self.KEYWORD_EI))
      except PSTypeError:
        if STRICT: raise
    else:
@ -975,7 +583,7 @@ class PDFPageInterpreter(object):
    if 1 <= self.debug:
      print >>stderr, 'Processing xobj: %r' % xobj
    subtype = xobj.dic.get('Subtype')
-    if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
+    if subtype is LITERAL_FORM and 'BBox' in xobj.dic:
      interpreter = self.dup()
      (x0,y0,x1,y1) = list_value(xobj.dic['BBox'])
      ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm)
@ -985,7 +593,7 @@ class PDFPageInterpreter(object):
      self.device.begin_figure(xobjid, bbox)
      interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm)
      self.device.end_figure(xobjid)
-    elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
+    elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
      (x0,y0) = apply_matrix(self.ctm, (0,0))
      (x1,y1) = apply_matrix(self.ctm, (1,1))
      self.device.begin_figure(xobjid, (x0,y0,x1,y1))
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -7,26 +7,22 @@
 import sys, re
 import md5, struct
 stderr = sys.stderr
-from utils import choplist, nunpack
-from arcfour import Arcfour
-from lzw import LZWDecoder
-from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
-     PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
-     literal_name, keyword_name, \
-     PSStackParser, STRICT
+from pdflib.utils import choplist, nunpack
+from pdflib.arcfour import Arcfour
+from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
+     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
+     STRICT
+from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
+     PDFStream, PDFObjRef, resolve1, decipher_all, \
+     int_value, float_value, num_value, str_value, list_value, dict_value, stream_value


-##  PDF Exceptions
+##  Exceptions
 ##
-class PDFException(PSException): pass
 class PDFSyntaxError(PDFException): pass
 class PDFNoValidXRef(PDFSyntaxError): pass
 class PDFEncryptionError(PDFException): pass
 class PDFPasswordIncorrect(PDFEncryptionError): pass
-class PDFTypeError(PDFException): pass
-class PDFValueError(PDFException): pass
-class PDFNotImplementedError(PSException): pass
-

 # some predefined literals and keywords.
 LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm')
@ -34,258 +30,10 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
 LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
-LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
-LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'),
-                         PSLiteralTable.intern('Fl'))
-LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'),
-                       PSLiteralTable.intern('LZW'))
-LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'),
-                           PSLiteralTable.intern('A85'))
-KEYWORD_R = PSKeywordTable.intern('R')
-KEYWORD_OBJ = PSKeywordTable.intern('obj')
-KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
-KEYWORD_STREAM = PSKeywordTable.intern('stream')
-KEYWORD_XREF = PSKeywordTable.intern('xref')
-KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
-KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
-PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
-
-class PDFObject(PSObject): pass
-
-
-##  PDFObjRef
-##
-class PDFObjRef(PDFObject):
-  
-  def __init__(self, doc, objid, _):
-    if objid == 0:
-      if STRICT:
-        raise PDFValueError('PDF object id cannot be 0.')
-    self.doc = doc
-    self.objid = objid
-    #self.genno = genno  # Never used.
-    return
-
-  def __repr__(self):
-    return '<PDFObjRef:%d>' % (self.objid)
-
-  def resolve(self):
-    return self.doc.getobj(self.objid)
-
-
-# resolve
-def resolve1(x):
-  '''
-  Resolve an object. If this is an array or dictionary,
-  it may still contains some indirect objects inside.
-  '''
-  while isinstance(x, PDFObjRef):
-    x = x.resolve()
-  return x
-
-def resolve_all(x):
-  '''
-  Recursively resolve X and all the internals.
-  Make sure there is no indirect reference within the nested object.
-  This procedure might be slow.
-  '''
-  while isinstance(x, PDFObjRef):
-    x = x.resolve()
-  if isinstance(x, list):
-    x = [ resolve_all(v) for v in x ]
-  elif isinstance(x, dict):
-    for (k,v) in x.iteritems():
-      x[k] = resolve_all(v)
-  return x
-
-def decipher_all(decipher, objid, genno, x):
-  '''
-  Recursively decipher X.
-  '''
-  if isinstance(x, str):
-    return decipher(objid, genno, x)
-  if isinstance(x, list):
-    x = [ decipher_all(decipher, objid, genno, v) for v in x ]
-  elif isinstance(x, dict):
-    for (k,v) in x.iteritems():
-      x[k] = decipher_all(decipher, objid, genno, v)
-  return x
-
-# Type cheking
-def int_value(x):
-  x = resolve1(x)
-  if not isinstance(x, int):
-    if STRICT:
-      raise PDFTypeError('Integer required: %r' % x)
-    return 0
-  return x
-
-def float_value(x):
-  x = resolve1(x)
-  if not isinstance(x, float):
-    if STRICT:
-      raise PDFTypeError('Float required: %r' % x)
-    return 0.0
-  return x
-
-def num_value(x):
-  x = resolve1(x)
-  if not (isinstance(x, int) or isinstance(x, float)):
-    if STRICT:
-      raise PDFTypeError('Int or Float required: %r' % x)
-    return 0
-  return x
-
-def str_value(x):
-  x = resolve1(x)
-  if not isinstance(x, str):
-    if STRICT:
-      raise PDFTypeError('String required: %r' % x)
-    return ''
-  return x
-
-def list_value(x):
-  x = resolve1(x)
-  if not (isinstance(x, list) or isinstance(x, tuple)):
-    if STRICT:
-      raise PDFTypeError('List required: %r' % x)
-    return []
-  return x
-
-def dict_value(x):
-  x = resolve1(x)
-  if not isinstance(x, dict):
-    if STRICT:
-      raise PDFTypeError('Dict required: %r' % x)
-    return {}
-  return x
-
-def stream_value(x):
-  x = resolve1(x)
-  if not isinstance(x, PDFStream):
-    if STRICT:
-      raise PDFTypeError('PDFStream required: %r' % x)
-    return PDFStream({}, '')
-  return x
-
-
-##  PDFStream type
-##
-class PDFStream(PDFObject):
-  
-  def __init__(self, dic, rawdata, decipher=None):
-    self.dic = dic
-    self.rawdata = rawdata
-    self.decipher = decipher
-    self.data = None
-    self.objid = None
-    self.genno = None
-    return
-
-  def set_objid(self, objid, genno):
-    self.objid = objid
-    self.genno = genno
-    return
-  
-  def __repr__(self):
-    return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
-
-  def decode(self):
-    assert self.data == None and self.rawdata != None
-    data = self.rawdata
-    if self.decipher:
-      # Handle encryption
-      data = self.decipher(self.objid, self.genno, data)
-    if 'Filter' not in self.dic:
-      self.data = data
-      self.rawdata = None
-      return
-    filters = self.dic['Filter']
-    if not isinstance(filters, list):
-      filters = [ filters ]
-    for f in filters:
-      if f in LITERALS_FLATE_DECODE:
-        import zlib
-        # will get errors if the document is encrypted.
-        data = zlib.decompress(data)
-      elif f in LITERALS_LZW_DECODE:
-        try:
-          from cStringIO import StringIO
-        except ImportError:
-          from StringIO import StringIO
-        data = ''.join(LZWDecoder(StringIO(data)).run())
-      elif f in LITERALS_ASCII85_DECODE:
-        import ascii85
-        data = ascii85.ascii85decode(data)
-      elif f == LITERAL_CRYPT:
-        raise PDFEncryptionError('/Crypt filter is unsupported')
-      else:
-        raise PDFNotImplementedError('Unsupported filter: %r' % f)
-      # apply predictors
-      params = self.dic.get('DecodeParms', {})
-      if 'Predictor' in params:
-        pred = int_value(params['Predictor'])
-        if pred:
-          if pred != 12:
-            raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
-          if 'Columns' not in params:
-            raise PDFValueError('Columns undefined for predictor=12')
-          columns = int_value(params['Columns'])
-          buf = ''
-          ent0 = '\x00' * columns
-          for i in xrange(0, len(data), columns+1):
-            pred = data[i]
-            ent1 = data[i+1:i+1+columns]
-            if pred == '\x02':
-              ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
-            buf += ent1
-            ent0 = ent1
-          data = buf
-    self.data = data
-    self.rawdata = None
-    return
-
-  def get_data(self):
-    if self.data == None:
-      self.decode()
-    return self.data
-
-  def get_rawdata(self):
-    return self.rawdata
-  
-
-##  PDFPage
-##
-class PDFPage(object):
-  
-  def __init__(self, doc, pageid, attrs):
-    self.doc = doc
-    self.pageid = pageid
-    self.attrs = dict_value(attrs)
-    self.lastmod = self.attrs.get('LastModified')
-    self.resources = resolve1(self.attrs['Resources'])
-    self.mediabox = resolve1(self.attrs['MediaBox'])
-    if 'CropBox' in self.attrs:
-      self.cropbox = resolve1(self.attrs['CropBox'])
-    else:
-      self.cropbox = self.mediabox
-    self.rotate = self.attrs.get('Rotate', 0)
-    self.annots = self.attrs.get('Annots')
-    self.beads = self.attrs.get('B')
-    if 'Contents' in self.attrs:
-      contents = resolve1(self.attrs['Contents'])
-    else:
-      contents = []
-    if not isinstance(contents, list):
-      contents = [ contents ]
-    self.contents = contents
-    return
-
-  def __repr__(self):
-    return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)


 ##  XRefs
+##

 ##  PDFXRef
 ##
@ -296,7 +44,7 @@ class PDFXRef(object):
    return

  def objids(self):
-    return self.offsets.keys()
+    return self.offsets.iterkeys()

  def load(self, parser):
    while 1:
@ -330,10 +78,11 @@ class PDFXRef(object):
    self.load_trailer(parser)
    return
  
+  KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
  def load_trailer(self, parser):
    try:
      (_,kwd) = parser.nexttoken()
-      assert kwd == KEYWORD_TRAILER
+      assert kwd is self.KEYWORD_TRAILER
      (_,dic) = parser.nextobject()
    except PSEOF:
      x = parser.pop(1)
@ -350,7 +99,7 @@ class PDFXRef(object):
      raise
    if use != 'n':
      if STRICT:
-        raise PDFValueError('Unused objid=%r' % objid)
+        raise PDFSyntaxError('Unused objid=%r' % objid)
    return (None, pos)


@ -367,14 +116,14 @@ class PDFXRefStream(object):
    return

  def objids(self):
-    return range(self.objid0, self.objid1+1)
+    return xrange(self.objid0, self.objid1)

  def load(self, parser):
    (_,objid) = parser.nexttoken() # ignored
    (_,genno) = parser.nexttoken() # ignored
    (_,kwd) = parser.nexttoken()
    (_,stream) = parser.nextobject()
-    if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
+    if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
      raise PDFNoValidXRef('Invalid PDF stream spec.')
    size = stream.dic['Size']
    (start, nobjs) = stream.dic.get('Index', (0,size))
@ -402,6 +151,37 @@ class PDFXRefStream(object):
      return (objid, index)


+##  PDFPage
+##
+class PDFPage(object):
+  '''
+  A single page, described by its attribute dictionary.
+
+  The constructor keeps doc and pageid as given, passes attrs through
+  dict_value() and copies the entries it knows about onto the instance.
+  /Resources and /MediaBox are looked up unconditionally, so a
+  dictionary lacking either raises KeyError.
+  '''
+
+  def __init__(self, doc, pageid, attrs):
+    self.doc = doc
+    self.pageid = pageid
+    self.attrs = dict_value(attrs)
+    page = self.attrs
+    self.lastmod = resolve1(page.get('LastModified'))
+    self.resources = resolve1(page['Resources'])
+    self.mediabox = resolve1(page['MediaBox'])
+    # Without an explicit /CropBox the media box is used instead.
+    self.cropbox = self.mediabox
+    if 'CropBox' in page:
+      self.cropbox = resolve1(page['CropBox'])
+    self.rotate = page.get('Rotate', 0)
+    self.annots = page.get('Annots')
+    self.beads = page.get('B')
+    # /Contents may be absent, a single object or a list;
+    # self.contents is always a list.
+    contents = []
+    if 'Contents' in page:
+      contents = resolve1(page['Contents'])
+    if not isinstance(contents, list):
+      contents = [ contents ]
+    self.contents = contents
+    return
+
+  def __repr__(self):
+    shown = (self.resources, self.mediabox)
+    return '<PDFPage: Resources=%r, MediaBox=%r>' % shown
+
+
 ##  PDFDocument
 ##
 ##  A PDFDocument object represents a PDF document.
@ -463,15 +243,16 @@ class PDFDocument(object):
  def set_root(self, root):
    self.root = root
    self.catalog = dict_value(self.root)
-    if self.catalog.get('Type') != LITERAL_CATALOG:
+    if self.catalog.get('Type') is not LITERAL_CATALOG:
      if STRICT:
-        raise PDFValueError('Catalog not found!')
+        raise PDFSyntaxError('Catalog not found!')
    return
  
  # initialize(password='')
  #   Perform the initialization with a given password.
  #   This step is mandatory even if there's no password associated
  #   with the document.
+  PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
  def initialize(self, password=''):
    if not self.encryption:
      self.is_printable = self.is_modifiable = self.is_extractable = True
@ -494,7 +275,7 @@ class PDFDocument(object):
    self.is_modifiable = bool(P & 8)
    self.is_extractable = bool(P & 16)
    # Algorithm 3.2
-    password = (password+PASSWORD_PADDING)[:32] # 1
+    password = (password+self.PASSWORD_PADDING)[:32] # 1
    hash = md5.md5(password) # 2
    hash.update(O) # 3
    hash.update(struct.pack('<l', P)) # 4
@ -512,7 +293,7 @@ class PDFDocument(object):
      u1 = Arcfour(key).process(password)
    elif R == 3:
      # Algorithm 3.5
-      hash = md5.md5(PASSWORD_PADDING) # 2
+      hash = md5.md5(self.PASSWORD_PADDING) # 2
      hash.update(docid[0]) # 3
      x = Arcfour(key).process(hash.digest()[:16]) # 4
      for i in xrange(1,19+1):
@ -536,6 +317,7 @@ class PDFDocument(object):
    key = hash.digest()[:min(len(key),16)]
    return Arcfour(key).process(data)

+  KEYWORD_OBJ = PSKeywordTable.intern('obj')
  def getobj(self, objid):
    if not self.ready:
      raise PDFException('PDFDocument not initialized')
@ -554,11 +336,11 @@ class PDFDocument(object):
          pass
      else:
        if STRICT:
-          raise PDFValueError('Cannot locate objid=%r' % objid)
+          raise PDFSyntaxError('Cannot locate objid=%r' % objid)
        return None
      if strmid:
        stream = stream_value(self.getobj(strmid))
-        if stream.dic['Type'] != LITERAL_OBJSTM:
+        if stream.dic['Type'] is not LITERAL_OBJSTM:
          if STRICT:
            raise PDFSyntaxError('Not a stream object: %r' % stream)
        try:
@ -589,7 +371,7 @@ class PDFDocument(object):
        (_,genno) = self.parser.nexttoken() # genno
        #assert objid1 == objid, (objid, objid1)
        (_,kwd) = self.parser.nexttoken()
-        if kwd != KEYWORD_OBJ:
+        if kwd is not self.KEYWORD_OBJ:
          raise PDFSyntaxError('Invalid object spec: offset=%r' % index)
        (_,obj) = self.parser.nextobject()
        if isinstance(obj, PDFStream):
@ -611,13 +393,13 @@ class PDFDocument(object):
      for (k,v) in parent.iteritems():
        if k in self.INHERITABLE_ATTRS and k not in tree:
          tree[k] = v
-      if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
+      if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
        if 1 <= self.debug:
          print >>stderr, 'Pages: Kids=%r' % tree['Kids']
        for c in tree['Kids']:
          for x in search(c, tree):
            yield x
-      elif tree.get('Type') == LITERAL_PAGE:
+      elif tree.get('Type') is LITERAL_PAGE:
        if 1 <= self.debug:
          print >>stderr, 'Page: %r' % tree
        yield (obj.objid, tree)
@ -683,15 +465,20 @@ class PDFParser(PSStackParser):
  def __repr__(self):
    return '<PDFParser>'

+  KEYWORD_R = PSKeywordTable.intern('R')
+  KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
+  KEYWORD_STREAM = PSKeywordTable.intern('stream')
+  KEYWORD_XREF = PSKeywordTable.intern('xref')
+  KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
  def do_keyword(self, pos, token):
-    if token in (KEYWORD_XREF, KEYWORD_STARTXREF):
+    if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF):
      self.add_results(*self.pop(1))
      return
-    if token == KEYWORD_ENDOBJ:
+    if token is self.KEYWORD_ENDOBJ:
      self.add_results(*self.pop(4))
      return
    
-    if token == KEYWORD_R:
+    if token is self.KEYWORD_R:
      # reference to indirect object
      try:
        ((_,objid), (_,genno)) = self.pop(2)
@ -702,7 +489,7 @@ class PDFParser(PSStackParser):
        pass
      return
      
-    if token == KEYWORD_STREAM:
+    if token is self.KEYWORD_STREAM:
      # stream object
      ((_,dic),) = self.pop(1)
      dic = dict_value(dic)
@ -710,7 +497,7 @@ class PDFParser(PSStackParser):
        objlen = int_value(dic['Length'])
      except KeyError:
        if STRICT:
-          raise PDFValueError('/Length is undefined: %r' % dic)
+          raise PDFSyntaxError('/Length is undefined: %r' % dic)
        objlen = 0
      self.seek(pos)
      try:
@ -785,7 +572,7 @@ class PDFParser(PSStackParser):
          xref = PDFXRefStream()
          xref.load(self)
        else:
-          if token != KEYWORD_XREF:
+          if token is not self.KEYWORD_XREF:
            raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % 
                                 (pos, token))
          self.nextline()
@ -835,6 +622,7 @@ class PDFParser(PSStackParser):
      yield xref
    return

+
 ##  PDFObjStrmParser
 ##
 class PDFObjStrmParser(PDFParser):
--- a/pdflib/pdftypes.py
+++ b/pdflib/pdftypes.py
@ -0,0 +1,222 @@
+#!/usr/bin/env python
+import sys, zlib
+stderr = sys.stderr
+from pdflib.lzw import LZWDecoder
+from pdflib.psparser import PSException, PSObject, \
+     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
+     literal_name, keyword_name, STRICT
+
+# Interned names of the stream filters.  Each tuple holds the names
+# that are treated as the same filter.
+LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
+LITERALS_FLATE_DECODE = (
+  PSLiteralTable.intern('FlateDecode'),
+  PSLiteralTable.intern('Fl'),
+  )
+LITERALS_LZW_DECODE = (
+  PSLiteralTable.intern('LZWDecode'),
+  PSLiteralTable.intern('LZW'),
+  )
+LITERALS_ASCII85_DECODE = (
+  PSLiteralTable.intern('ASCII85Decode'),
+  PSLiteralTable.intern('A85'),
+  )
+
+
+##  PDF Objects
+##
+class PDFObject(PSObject):
+  # Base class of the PDF object types defined in this module.
+  pass
+
+# Exceptions.
+class PDFException(PSException):
+  pass
+class PDFTypeError(PDFException):
+  pass
+class PDFValueError(PDFException):
+  pass
+# NOTE(review): derives from PSException, not PDFException, so an
+# "except PDFException" clause does not catch it -- confirm this is intended.
+class PDFNotImplementedError(PSException):
+  pass
+
+
+##  PDFObjRef
+##
+class PDFObjRef(PDFObject):
+  '''
+  An indirect reference to the object with id objid in document doc.
+  '''
+
+  def __init__(self, doc, objid, _):
+    # The third argument (the generation number) is accepted but never stored.
+    # An object id of 0 is rejected only when STRICT is set.
+    if STRICT and objid == 0:
+      raise PDFValueError('PDF object id cannot be 0.')
+    self.doc = doc
+    self.objid = objid
+    return
+
+  def __repr__(self):
+    return '<PDFObjRef:%d>' % self.objid
+
+  def resolve(self):
+    '''Fetches the referenced object through doc.getobj().'''
+    target = self.doc.getobj(self.objid)
+    return target
+
+
+# resolve
+def resolve1(x):
+  '''
+  Follows PDFObjRef references until a non-reference object is reached.
+  Only the outermost object is resolved: an array or dictionary
+  may still contain indirect objects inside.
+  '''
+  obj = x
+  while isinstance(obj, PDFObjRef):
+    obj = obj.resolve()
+  return obj
+
+def resolve_all(x):
+  '''
+  Resolves X and, recursively, everything nested inside it, so that
+  no indirect reference is left within the result.
+  A list is rebuilt as a new list; a dict is updated in place.
+  This procedure might be slow.
+  '''
+  obj = x
+  while isinstance(obj, PDFObjRef):
+    obj = obj.resolve()
+  if isinstance(obj, list):
+    return [ resolve_all(item) for item in obj ]
+  if isinstance(obj, dict):
+    for key in obj.keys():
+      obj[key] = resolve_all(obj[key])
+  return obj
+
+def decipher_all(decipher, objid, genno, x):
+  '''
+  Applies decipher(objid, genno, s) to every str found in X, recursing
+  into lists and dicts.  A list is rebuilt as a new list; a dict is
+  updated in place; any other object is returned unchanged.
+  '''
+  if isinstance(x, str):
+    return decipher(objid, genno, x)
+  if isinstance(x, list):
+    return [ decipher_all(decipher, objid, genno, item) for item in x ]
+  if isinstance(x, dict):
+    for key in x.keys():
+      x[key] = decipher_all(decipher, objid, genno, x[key])
+  return x
+
+# Type checking
+def int_value(x):
+  '''
+  Resolves x and returns it if it is an int.
+  Otherwise raises PDFTypeError when STRICT is set, or returns 0.
+  '''
+  x = resolve1(x)
+  if isinstance(x, int):
+    return x
+  if STRICT:
+    # (x,) keeps a tuple value from being unpacked as format arguments.
+    raise PDFTypeError('Integer required: %r' % (x,))
+  return 0
+
+def float_value(x):
+  '''
+  Resolves x and returns it if it is a float.
+  Otherwise raises PDFTypeError when STRICT is set, or returns 0.0.
+  '''
+  x = resolve1(x)
+  if isinstance(x, float):
+    return x
+  if STRICT:
+    # (x,) keeps a tuple value from being unpacked as format arguments.
+    raise PDFTypeError('Float required: %r' % (x,))
+  return 0.0
+
+def num_value(x):
+  '''
+  Resolves x and returns it if it is an int or a float.
+  Otherwise raises PDFTypeError when STRICT is set, or returns 0.
+  '''
+  x = resolve1(x)
+  if isinstance(x, (int, float)):
+    return x
+  if STRICT:
+    # (x,) keeps a tuple value from being unpacked as format arguments.
+    raise PDFTypeError('Int or Float required: %r' % (x,))
+  return 0
+
+def str_value(x):
+  '''
+  Resolves x and returns it if it is a str.
+  Otherwise raises PDFTypeError when STRICT is set, or returns ''.
+  '''
+  x = resolve1(x)
+  if isinstance(x, str):
+    return x
+  if STRICT:
+    # (x,) keeps a tuple value from being unpacked as format arguments.
+    raise PDFTypeError('String required: %r' % (x,))
+  return ''
+
+def list_value(x):
+  '''
+  Resolves x and returns it if it is a list or a tuple.
+  Otherwise raises PDFTypeError when STRICT is set, or returns [].
+  '''
+  value = resolve1(x)
+  if isinstance(value, (list, tuple)):
+    return value
+  if STRICT:
+    raise PDFTypeError('List required: %r' % value)
+  return []
+
+def dict_value(x):
+  '''
+  Resolves x and returns it if it is a dict.
+  Otherwise raises PDFTypeError when STRICT is set, or returns {}.
+  '''
+  x = resolve1(x)
+  if isinstance(x, dict):
+    return x
+  if STRICT:
+    # (x,) keeps a tuple value from being unpacked as format arguments.
+    raise PDFTypeError('Dict required: %r' % (x,))
+  return {}
+
+def stream_value(x):
+  '''
+  Resolves x and returns it if it is a PDFStream.
+  Otherwise raises PDFTypeError when STRICT is set, or returns
+  an empty PDFStream.
+  '''
+  x = resolve1(x)
+  if isinstance(x, PDFStream):
+    return x
+  if STRICT:
+    # (x,) keeps a tuple value from being unpacked as format arguments.
+    raise PDFTypeError('PDFStream required: %r' % (x,))
+  return PDFStream({}, '')
+
+
+##  PDFStream type
+##
+class PDFStream(PDFObject):
+  '''
+  A stream object: a dictionary (dic) paired with a data string.
+
+  The data is kept as given (rawdata) until get_data() is first
+  called.  decode() then deciphers it, applies the filters listed in
+  dic['Filter'], stores the result in self.data and drops rawdata.
+  '''
+
+  def __init__(self, dic, rawdata, decipher=None):
+    '''
+    dic:      the stream dictionary.
+    rawdata:  the undecoded stream data.
+    decipher: optional callable, invoked as decipher(objid, genno, data)
+              before any filter is applied.
+    '''
+    self.dic = dic
+    self.rawdata = rawdata
+    self.decipher = decipher
+    self.data = None
+    self.objid = None
+    self.genno = None
+    return
+
+  def set_objid(self, objid, genno):
+    '''Records the object id and generation number handed to decipher.'''
+    self.objid = objid
+    self.genno = genno
+    return
+
+  def __repr__(self):
+    # decode() sets rawdata to None, so describe whichever buffer is
+    # currently held rather than calling len() on None.
+    if self.rawdata is not None:
+      return '<PDFStream(%r): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
+    if self.data is not None:
+      return '<PDFStream(%r): len=%d, %r>' % (self.objid, len(self.data), self.dic)
+    return '<PDFStream(%r): %r>' % (self.objid, self.dic)
+
+  def decode(self):
+    '''
+    Deciphers and unfilters rawdata into self.data, then drops rawdata.
+    Must be called at most once, while rawdata is still present.
+
+    Raises PDFNotImplementedError for an unsupported filter or
+    predictor, and PDFValueError when predictor 12 has no /Columns.
+    '''
+    assert self.data is None and self.rawdata is not None
+    data = self.rawdata
+    if self.decipher:
+      # Handle encryption
+      data = self.decipher(self.objid, self.genno, data)
+    if 'Filter' not in self.dic:
+      self.data = data
+      self.rawdata = None
+      return
+    filters = self.dic['Filter']
+    if not isinstance(filters, list):
+      filters = [ filters ]
+    for f in filters:
+      if f in LITERALS_FLATE_DECODE:
+        # will get errors if the document is encrypted.
+        data = zlib.decompress(data)
+      elif f in LITERALS_LZW_DECODE:
+        try:
+          from cStringIO import StringIO
+        except ImportError:
+          from StringIO import StringIO
+        data = ''.join(LZWDecoder(StringIO(data)).run())
+      elif f in LITERALS_ASCII85_DECODE:
+        import ascii85
+        data = ascii85.ascii85decode(data)
+      elif f == LITERAL_CRYPT:
+        # PDFEncryptionError is not defined in this module; raising it
+        # here would only produce a NameError.
+        raise PDFNotImplementedError('/Crypt filter is unsupported')
+      else:
+        raise PDFNotImplementedError('Unsupported filter: %r' % f)
+      # apply predictors
+      params = self.dic.get('DecodeParms', {})
+      if 'Predictor' in params:
+        pred = int_value(params['Predictor'])
+        if pred:
+          if pred != 12:
+            raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
+          if 'Columns' not in params:
+            raise PDFValueError('Columns undefined for predictor=12')
+          columns = int_value(params['Columns'])
+          # Each row is one tag byte followed by `columns` data bytes.
+          # A row tagged '\x02' is stored as differences from the
+          # previous row; any other row is copied through unchanged.
+          rows = []
+          ent0 = '\x00' * columns
+          for i in xrange(0, len(data), columns+1):
+            tag = data[i]
+            ent1 = data[i+1:i+1+columns]
+            if tag == '\x02':
+              ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
+            rows.append(ent1)
+            ent0 = ent1
+          data = ''.join(rows)
+    self.data = data
+    self.rawdata = None
+    return
+
+  def get_data(self):
+    '''Returns the decoded data, decoding it on first use.'''
+    if self.data is None:
+      self.decode()
+    return self.data
+
+  def get_rawdata(self):
+    '''Returns the undecoded data, or None once decode() has run.'''
+    return self.rawdata
--- a/pdflib/psparser.py
+++ b/pdflib/psparser.py
@ -1,7 +1,8 @@
 #!/usr/bin/env python
 import sys, re
 stderr = sys.stderr
-from utils import choplist
+
+from pdflib.utils import choplist

 STRICT = 0

--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@ -4,6 +4,8 @@ from struct import unpack

 ##  Matrix operations
 ##
+MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0)
+
 def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
  '''Multiplies two matrices.'''
  return (a0*a1+c0*b1,    b0*a1+d0*b1,