warning removal.

code cleanup. cmap bug fixed. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@168 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-01-01 03:09:26 +00:00 · 2010-01-01 03:09:26 +00:00 · 98c8367339
parent 7093bdbdfa
commit 98c8367339
10 changed files with 208 additions and 165 deletions
--- a/34
+++ b/34
@ -19,14 +19,8 @@ clean:
 	-$(RM) -r build dist
 	-cd $(PACKAGE) && $(MAKE) clean
 	-cd tools && $(MAKE) clean
-	-cd samples && $(MAKE) clean

-distclean: clean cmap_clean
-
-test:
-	cd samples && $(MAKE) test
-check:
-	cd $(PACKAGE) && make check
+distclean: clean test_clean cmap_clean

 commit: distclean
 	$(SVN) commit
@ -39,13 +33,23 @@ WEBDIR=$$HOME/Site/unixuser.org/python/$(PACKAGE)
 publish:
 	$(CP) docs/*.html $(WEBDIR)

+test:
+	cd samples && $(MAKE) test
+test_clean:
+	-cd samples && $(MAKE) clean
+
 CONV_CMAP=$(PYTHON) tools/conv_cmap.py
-CMAPDIR=pdfminer/cmap
-CMAPRSRC=cmaprsrc
-cmap: cmaprsrc
-	$(CONV_CMAP) $(CMAPDIR) Adobe-CNS1 $(CMAPRSRC)/cid2code_Adobe_CNS1.txt cp950 big5
-	$(CONV_CMAP) $(CMAPDIR) Adobe-GB1 $(CMAPRSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
-	$(CONV_CMAP) $(CMAPDIR) Adobe-Japan1 $(CMAPRSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
-	$(CONV_CMAP) $(CMAPDIR) Adobe-Korea1 $(CMAPRSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
+CMAPSRC=cmaprsrc
+CMAPDST=pdfminer/cmap
+cmap: $(CMAPDST)/TO_UNICODE_Adobe_CNS1.py $(CMAPDST)/TO_UNICODE_Adobe_GB1.py \
+	$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py $(CMAPDST)/TO_UNICODE_Adobe_Korea1.py
 cmap_clean:
-	cd $(CMAPDIR) && make cmap_clean
+	cd $(CMAPDST) && make cmap_clean
+$(CMAPDST)/TO_UNICODE_Adobe_CNS1.py:
+	$(CONV_CMAP) $(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt cp950 big5
+$(CMAPDST)/TO_UNICODE_Adobe_GB1.py:
+	$(CONV_CMAP) $(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt cp936 gb2312
+$(CMAPDST)/TO_UNICODE_Adobe_Japan1.py:
+	$(CONV_CMAP) $(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt cp932 euc-jp
+$(CMAPDST)/TO_UNICODE_Adobe_Korea1.py:
+	$(CONV_CMAP) $(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt cp949 euc-kr
--- a/docs/index.html
+++ b/docs/index.html
@ -19,7 +19,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Dec 20 01:25:02 JST 2009
+Last Modified: Fri Jan  1 12:04:47 JST 2010
 <!-- hhmts end -->
 </div>

@ -27,7 +27,7 @@ Last Modified: Sun Dec 20 01:25:02 JST 2009
 <li> <a href="#intro">What's It?</a>
 <li> <a href="#source">Download</a>
 <li> <a href="#install">Install</a> 
-&nbsp; <small>(<a href="#cmap">for non-ASCII languages</a>)</small>
+&nbsp; <small>(<a href="#cmap">for East Asian languages</a>)</small>
 <li> <a href="#usage">How to Use</a>
 &nbsp; <small>(<a href="#pdf2txt">pdf2txt.py</a>, <a href="#dumppdf">dumppdf.py</a>)</small>
 <li> <a href="#todos">TODOs</a>
@ -54,7 +54,7 @@ PDF parser that can be used for other purposes instead of text analysis.
 <ul>
 <li> Written entirely in Python. (for version 2.4 or newer)
 <li> PDF-1.7 specification support. (well, almost)
-<li> Non-ASCII languages and vertical writing scripts support.
+<li> East Asian languages and vertical writing scripts support.
 <li> Various font types (Type1, TrueType, Type3, and CID) support.
 <li> Basic encryption (RC4) support.
 <li> PDF to HTML conversion (with a sample converter web app).
@ -125,8 +125,8 @@ W o r l d

 <p>
 <a name="cmap"></a>
-<h3>For non-ASCII languages</h3>
-In order to handle non-ASCII languages (e.g. Japanese),
+<h3>For East Asian languages</h3>
+In order to handle East Asian languages (Chinese or Japanese, etc.),
 you need to install additional data called <code>CMap</code>,
 which was originally distributed by Adobe. CMap is now included
 in the pdfminer package, but not installed by default.
@ -163,9 +163,6 @@ direction (horizontal or vertical) for each text portion.
 You need to provide a password for protected PDF documents when their access is restricted.
 You cannot extract any text from a PDF document which does not have extraction permission.
 <p>
-For non-ASCII languages, you can specify the output encoding 
-(such as UTF-8).
-<p>
 <strong>Note:</strong> Not all characters in a PDF can be safely converted to Unicode.

 <p>
@ -194,7 +191,7 @@ Page numbers are starting from one.
 By default, it extracts texts from all the pages.
 <p>
 <dt> <code>-c <em>codec</em></code> 
-<dd> Specifies the output codec for non-ASCII texts.
+<dd> Specifies the output codec.
 <p>
 <dt> <code>-t <em>type</em></code> 
 <dd> Specifies the output format. The following formats are currently supported.
@ -344,6 +341,9 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
+<li> 2009/12/24: RunLengthDecode filter added. Thanks to Troy Bollinger.
+<li> 2009/12/20: Experimental polygon shape extraction added. Thanks to Yusuf Dewaswala for reporting.
 <li> 2009/12/19: CMap resources are now part of the package. Thanks to Adobe for open-sourcing them.
 <li> 2009/11/29: Password encryption bug fixed. Thanks to Yannick Gingras.
 <li> 2009/10/31: SGML output format is changed and renamed as XML.
--- a/pdfminer/Makefile
+++ b/pdfminer/Makefile
@ -1,12 +1,7 @@
 # Makefile for pdfminer

-PYCHECKER=pychecker --limit=0
-
 all:

 clean:
 	-rm *.pyc *.pyo
 	cd cmap && make clean
-
-check:
-	$(PYCHECKER) *.py
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -90,14 +90,14 @@ class UnicodeMap(object):

    debug = 0

-    def __init__(self, cid2unicode=None):
-        self.cid2unicode = cid2unicode or {}
+    def __init__(self, cid2unichr=None):
+        self.cid2unichr = cid2unichr or {}
        return

-    def get_unicode(self, cid):
+    def get_unichr(self, cid):
        if self.debug:
-            print >>sys.stderr, 'get_unicode: %r, %r' % (self, cid)
-        return self.cid2unicode.get(cid)
+            print >>sys.stderr, 'get_unichr: %r, %r' % (self, cid)
+        return self.cid2unichr[cid]


 ##  FileCMap
@ -151,16 +151,16 @@ class FileUnicodeMap(UnicodeMap):
        self.attrs[k] = v
        return

-    def add_cid2unicode(self, cid, code):
+    def add_cid2unichr(self, cid, code):
        assert isinstance(cid, int)
        if isinstance(code, PSLiteral):
            # Interpret as an Adobe glyph name.
-            self.cid2unicode[cid] = name2unicode(code.name)
+            self.cid2unichr[cid] = unichr(name2unicode(code.name))
        elif isinstance(code, str):
            # Interpret as UTF-16BE.
-            self.cid2unicode[cid] = unpack('>H', code)[0]
+            self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
        elif isinstance(code, int):
-            self.cid2unicode[cid] = code
+            self.cid2unichr[cid] = unichr(code)
        else:
            raise TypeError(code)
        return
@ -189,10 +189,10 @@ class PyUnicodeMap(UnicodeMap):
    
    def __init__(self, name, module, vertical):
        if vertical:
-            cid2unicode = module.CID2UNICODE_V
+            cid2unichr = module.CID2UNICHR_V
        else:
-            cid2unicode = module.CID2UNICODE_H
-        UnicodeMap.__init__(self, cid2unicode)
+            cid2unichr = module.CID2UNICHR_H
+        UnicodeMap.__init__(self, cid2unichr)
        self.name = name
        return

@ -333,7 +333,7 @@ class CMapParser(PSStackParser):
                #assert s1 <= e1
                if isinstance(code, list):
                    for i in xrange(e1-s1+1):
-                        self.cmap.add_cid2unicode(s1+i, code[i])
+                        self.cmap.add_cid2unichr(s1+i, code[i])
                else:
                    var = code[-4:]
                    base = nunpack(var)
@ -341,7 +341,7 @@ class CMapParser(PSStackParser):
                    vlen = len(var)
                    for i in xrange(e1-s1+1):
                        x = prefix+pack('>L',base+i)[-vlen:]
-                        self.cmap.add_cid2unicode(s1+i, x)
+                        self.cmap.add_cid2unichr(s1+i, x)
            return

        if name == 'beginbfchar':
@ -351,7 +351,7 @@ class CMapParser(PSStackParser):
            objs = [ obj for (_,obj) in self.popall() ]
            for (cid,code) in choplist(2, objs):
                if isinstance(cid, str) and isinstance(code, str):
-                    self.cmap.add_cid2unicode(nunpack(cid), code)
+                    self.cmap.add_cid2unichr(nunpack(cid), code)
            return

        if name == 'beginnotdefrange':
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@ -405,9 +405,10 @@ class PDFSimpleFont(PDFFont):

    def to_unichr(self, cid):
        if self.unicode_map:
-            code = self.unicode_map.get_unicode(cid)
-            if code is not None:
-                return unichr(code)
+            try:
+                return self.unicode_map.get_unichr(cid)
+            except KeyError:
+                pass
        try:
            return self.encoding[cid]
        except KeyError:
@ -571,12 +572,11 @@ class PDFCIDFont(PDFFont):
        return self.disps.get(cid, self.default_disp)

    def to_unichr(self, cid):
-        if not self.unicode_map:
+        try:
+            if not self.unicode_map: raise KeyError(cid)
+            return self.unicode_map.get_unichr(cid)
+        except KeyError:
            raise PDFUnicodeNotDefined(self.cidcoding, cid)
-        code = self.unicode_map.get_unicode(cid)
-        if code is not None:
-            return unichr(code)
-        raise PDFUnicodeNotDefined(self.cidcoding, cid)


 # main
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@ -766,7 +766,9 @@ class PDFTextExtractionNotAllowed(PDFInterpreterError): pass

 def process_pdf(rsrc, device, fp, pagenos=None, maxpages=0, password=''):
    doc = PDFDocument()
-    parser = PDFParser(doc, fp)
+    parser = PDFParser(fp)
+    parser.set_document(doc)
+    doc.set_parser(parser)
    doc.initialize(password)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@ -1,9 +1,12 @@
 #!/usr/bin/env python
 import sys
 import re
-import md5
 import struct
 from sys import stderr
+try:
+    import hashlib as md5
+except ImportError:
+    import md5
 try:
    from cStringIO import StringIO
 except ImportError:
@ -19,7 +22,7 @@ from pdftypes import int_value, float_value, num_value
 from pdftypes import str_value, list_value, dict_value, stream_value
 from arcfour import Arcfour
 from utils import choplist, nunpack
-from utils import decode_text
+from utils import decode_text, ObjIdRange


 ##  Exceptions
@ -39,57 +42,29 @@ LITERAL_CATALOG = LIT('Catalog')

 ##  XRefs
 ##
-class XRefObjRange(object):
-    def __init__(self, start, nobjs):
-        self.start = start
-        self.nobjs = nobjs
-        return
-
-    def __repr__(self):
-        return '<XRefObjRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
-
-    def get_start_id(self):
-        return self.start
-
-    def get_end_id(self):
-        return self.start + self.nobjs - 1
-
-    def get_nobjs(self):
-        return self.nobjs
-
 class PDFBaseXRef(object):
-    def __init__(self):
-        self.objid_ranges = None
-        return

-    def objids(self):
-        if self.objid_ranges:
-            for objid_range in self.objid_ranges:
-                for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
-                    yield objid
-        return
+    def get_trailer(self):
+        raise NotImplementedError
+
+    def get_pos(self, objid):
+        raise KeyError(objid)


 ##  PDFXRef
 ##
 class PDFXRef(PDFBaseXRef):
+    
    def __init__(self):
-        PDFBaseXRef.__init__(self)
-        self.offsets = None
+        self.offsets = {}
        self.trailer = {}
        return

-    def __repr__(self):
-        return '<PDFXRef: objs=%d>' % len(self.offsets)
-
    def load(self, parser, debug=0):
-        self.offsets = {}
-        self.objid_ranges = []
        while 1:
            try:
                (pos, line) = parser.nextline()
-                if not line.strip():
-                    continue
+                if not line.strip(): continue
            except PSEOF:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted?')
            if not line:
@ -104,8 +79,6 @@ class PDFXRef(PDFBaseXRef):
                (start, nobjs) = map(long, f)
            except ValueError:
                raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
-            self.newoffsets = {}
-            self.objid_ranges.append(XRefObjRange(start, nobjs))
            for objid in xrange(start, start+nobjs):
                try:
                    (_, line) = parser.nextline()
@ -133,10 +106,33 @@ class PDFXRef(PDFBaseXRef):
            if not x:
                raise PDFNoValidXRef('Unexpected EOF - file corrupted')
            (_,dic) = x[0]
-        self.trailer.update( dict_value(dic))
+        self.trailer.update(dict_value(dic))
        return

-    def getpos(self, objid):
+    PDFOBJ_CUE = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
+    def load_fallback(self, parser, debug=0):
+        parser.seek(0)
+        while 1:
+            try:
+                (pos, line) = parser.nextline()
+            except PSEOF:
+                break
+            if line.startswith('trailer'):
+                parser.seek(pos)
+                self.load_trailer(parser)
+                if 1 <= debug:
+                    print >>stderr, 'trailer: %r' % self.get_trailer()
+                break
+            m = self.PDFOBJ_CUE.match(line)
+            if not m: continue
+            (objid, genno) = m.groups()
+            self.offsets[int(objid)] = (0, pos)
+        return
+
+    def get_trailer(self):
+        return self.trailer
+
+    def get_pos(self, objid):
        try:
            (genno, pos) = self.offsets[objid]
        except KeyError:
@ -149,10 +145,10 @@ class PDFXRef(PDFBaseXRef):
 class PDFXRefStream(PDFBaseXRef):

    def __init__(self):
-        PDFBaseXRef.__init__(self)
        self.data = None
        self.entlen = None
        self.fl1 = self.fl2 = self.fl3 = None
+        self.objid_ranges = []
        return

    def __repr__(self):
@ -169,17 +165,22 @@ class PDFXRefStream(PDFBaseXRef):
        index_array = stream.dic.get('Index', (0,size))
        if len(index_array) % 2 != 0:
            raise PDFSyntaxError('Invalid index number')
-        self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ]
+        self.objid_ranges.extend( ObjIdRange(start, nobjs) 
+                                  for (start,nobjs) in choplist(2, index_array) )
        (self.fl1, self.fl2, self.fl3) = stream.dic['W']
        self.data = stream.get_data()
        self.entlen = self.fl1+self.fl2+self.fl3
        self.trailer = stream.dic
        if debug:
            print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
-                             (', '.join(map(repr, self.objid_ranges), self.fl1, self.fl2, self.fl3)))
+                             (', '.join(map(repr, self.objid_ranges),
+                                        self.fl1, self.fl2, self.fl3)))
        return

-    def getpos(self, objid):
+    def get_trailer(self):
+        return self.trailer
+
+    def get_pos(self, objid):
        offset = 0
        found = False
        for objid_range in self.objid_ranges:
@ -207,14 +208,35 @@ class PDFXRefStream(PDFBaseXRef):

 ##  PDFPage
 ##
-##  A PDFPage object is nothing more than a bunch of keys and values
-##  that describe the properties of the page and point to its contents,
-##  and has nothing to do with a real graphical entity. For a real graphical
-##  object, look at layout.LTPage.
-##
 class PDFPage(object):

+    """An object that holds the information about a page.
+
+    A PDFPage object is merely a convenience class that has a set
+    of keys and values, which describe the properties of a page
+    and point to its contents.
+
+    Attributes:
+      doc: a PDFDocument object.
+      pageid: any Python object that can uniquely identify the page.
+      attrs: a dictionary of page attributes.
+      contents: a list of PDFStream objects that represents the page content.
+      lastmod: the last modified time of the page.
+      resources: a list of resources used by the page.
+      mediabox: the physical size of the page.
+      cropbox: the crop rectangle of the page.
+      rotate: the page rotation (in degree).
+      annots: the page annotations.
+      beads: a chain that represents natural reading order.
+    """
+
    def __init__(self, doc, pageid, attrs):
+        """Initialize a page object.
+        
+        doc: a PDFDocument object.
+        pageid: any Python object that can uniquely identify the page.
+        attrs: a dictionary of page attributes.
+        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
@ -243,13 +265,23 @@ class PDFPage(object):

 ##  PDFDocument
 ##
-##  A PDFDocument object represents a PDF document.
-##  Since a PDF file is usually pretty big, normally it is not loaded
-##  at once. Rather it is parsed dynamically as processing goes.
-##  A PDF parser is associated with the document.
-##
 class PDFDocument(object):

+    """PDFDocument object represents a PDF document.
+
+    Since a PDF file can be very big, normally it is not loaded at
+    once. Each PDF document has a PDF parser object associated,
+    and the data stream is parsed dynamically as processing goes.
+
+    Typical usage:
+      doc = PDFDocument()
+      parser = PDFParser(fp)
+      parser.set_document(doc)
+      doc.set_parser(parser)
+      doc.initialize(password)
+    
+    """
+
    debug = 0

    def __init__(self):
@ -261,24 +293,23 @@ class PDFDocument(object):
        self.parser = None
        self.encryption = None
        self.decipher = None
-        self.ready = False
+        self._initialized = False
        return

-    # set_parser(parser)
-    #   Associates the document with an (already initialized) parser object.
    def set_parser(self, parser):
+        "Set the document to use a given PDFParser object."
        if self.parser: return
        self.parser = parser
        # The document is set to be temporarily ready during collecting
        # all the basic information about the document, e.g.
        # the header, the encryption information, and the access rights
        # for the document.
-        self.ready = True
+        self._initialized = True
        # Retrieve the information of each header that was appended
        # (maybe multiple times) at the end of the document.
        self.xrefs = parser.read_xref()
        for xref in self.xrefs:
-            trailer = xref.trailer
+            trailer = xref.get_trailer()
            if not trailer: continue
            # If there's an encryption info, remember it.
            if 'Encrypt' in trailer:
@ -293,7 +324,7 @@ class PDFDocument(object):
        # The document is set to be non-ready again, until all the
        # proper initialization (asking the password key and
        # verifying the access permission, so on) is finished.
-        self.ready = False
+        self._initialized = False
        return

    # set_root(root)
@ -315,7 +346,7 @@ class PDFDocument(object):
    def initialize(self, password=''):
        if not self.encryption:
            self.is_printable = self.is_modifiable = self.is_extractable = True
-            self.ready = True
+            self._initialized = True
            return
        (docid, param) = self.encryption
        if literal_name(param['Filter']) != 'Standard':
@ -367,7 +398,7 @@ class PDFDocument(object):
            raise PDFPasswordIncorrect
        self.decrypt_key = key
        self.decipher = self.decrypt_rc4  # XXX may be AES
-        self.ready = True
+        self._initialized = True
        return

    def decrypt_rc4(self, objid, genno, data):
@ -378,7 +409,7 @@ class PDFDocument(object):

    KEYWORD_OBJ = KWD('obj')
    def getobj(self, objid):
-        if not self.ready:
+        if not self._initialized:
            raise PDFException('PDFDocument not initialized')
        #assert self.xrefs
        if 2 <= self.debug:
@ -389,7 +420,7 @@ class PDFDocument(object):
        else:
            for xref in self.xrefs:
                try:
-                    (strmid, index) = xref.getpos(objid)
+                    (strmid, index) = xref.get_pos(objid)
                    break
                except KeyError:
                    pass
@ -411,7 +442,7 @@ class PDFDocument(object):
                if strmid in self.parsed_objs:
                    objs = self.parsed_objs[strmid]
                else:
-                    parser = PDFObjStrmParser(self, stream.get_data())
+                    parser = PDFObjStrmParser(stream.get_data())
                    objs = []
                    try:
                        while 1:
@ -458,7 +489,7 @@ class PDFDocument(object):

    INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
    def get_pages(self):
-        if not self.ready:
+        if not self._initialized:
            raise PDFException('PDFDocument is not initialized')
        #assert self.xrefs
        def search(obj, parent):
@ -529,14 +560,15 @@ class PDFDocument(object):
 ##
 class PDFParser(PSStackParser):

-    def __init__(self, doc, fp):
+    def __init__(self, fp):
        PSStackParser.__init__(self, fp)
-        self.doc = doc
-        self.doc.set_parser(self)
+        self.doc = None
        return

-    def __repr__(self):
-        return '<PDFParser>'
+    def set_document(self, doc):
+        "Associates the parser with a PDFDocument object."
+        self.doc = doc
+        return

    KEYWORD_R = KWD('R')
    KEYWORD_ENDOBJ = KWD('endobj')
@ -647,7 +679,7 @@ class PDFParser(PSStackParser):
            xref = PDFXRef()
            xref.load(self, debug=self.debug)
        xrefs.append(xref)
-        trailer = xref.trailer
+        trailer = xref.get_trailer()
        if 1 <= self.debug:
            print >>stderr, 'trailer: %r' % trailer
        if 'XRefStm' in trailer:
@ -669,26 +701,8 @@ class PDFParser(PSStackParser):
            # fallback
            if 1 <= self.debug:
                print >>stderr, 'no xref, fallback'
-            self.seek(0)
-            pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
-            offsets = {}
            xref = PDFXRef()
-            while 1:
-                try:
-                    (pos, line) = self.nextline()
-                except PSEOF:
-                    break
-                if line.startswith('trailer'):
-                    xref.offsets = offsets
-                    self.seek(pos)
-                    xref.load_trailer(self)
-                    if 1 <= self.debug:
-                        print >>stderr, 'trailer: %r' % xref.trailer
-                    continue
-                m = pat.match(line)
-                if not m: continue
-                (objid, genno) = m.groups()
-                offsets[int(objid)] = (0, pos)
+            xref.load_fallback(self)
            xrefs.append(xref)
        return xrefs

@ -697,8 +711,8 @@ class PDFParser(PSStackParser):
 ##
 class PDFObjStrmParser(PSStackParser):

-    def __init__(self, doc, data):
-        PDFParser.__init__(self, doc, StringIO(data))
+    def __init__(self, data):
+        PSStackParser.__init__(self, StringIO(data))
        return

    def flush(self):
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@ -135,3 +135,27 @@ def enc(x, codec='ascii'):
    '''Encodes a string for SGML/XML/HTML'''
    x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
    return x.encode(codec, 'xmlcharrefreplace')
+
+
+##  ObjIdRange
+##
+class ObjIdRange(object):
+
+    "A utility class to represent a range of object IDs."
+    
+    def __init__(self, start, nobjs):
+        self.start = start
+        self.nobjs = nobjs
+        return
+
+    def __repr__(self):
+        return '<ObjIdRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
+
+    def get_start_id(self):
+        return self.start
+
+    def get_end_id(self):
+        return self.start + self.nobjs - 1
+
+    def get_nobjs(self):
+        return self.nobjs
--- a/tools/conv_cmap.py
+++ b/tools/conv_cmap.py
@ -12,7 +12,7 @@ def process_cid2code(fp, check_codecs=[]):
        else:
            return (name+'-H', name+'-V')

-    def get_unicode(codes):
+    def get_unichr(codes):
        # determine the "most popular" candidate.
        d = {}
        for code in codes:
@ -26,7 +26,7 @@ def process_cid2code(fp, check_codecs=[]):
                except UnicodeError:
                    pass
        chars = sorted(d.keys(), key=lambda char:d[char], reverse=True)
-        return ord(chars[0])
+        return chars[0]

    def put(dmap, code, cid, force=False):
        for b in code[:-1]:
@ -45,8 +45,8 @@ def process_cid2code(fp, check_codecs=[]):
    names = []
    code2cid = {} # {'cmapname': ...}
    is_vertical = {}
-    cid2unicode_h = {} # {cid: unicode}
-    cid2unicode_v = {} # {cid: unicode}
+    cid2unichr_h = {} # {cid: unichr}
+    cid2unichr_v = {} # {cid: unichr}
    
    for line in fp:
        line = line.strip()
@ -95,21 +95,21 @@ def process_cid2code(fp, check_codecs=[]):
                    put(hmap, code, cid, True)
                if name.endswith('-UTF8'):
                    if hcodes:
-                        cid2unicode_h[cid] = get_unicode(hcodes)
+                        cid2unichr_h[cid] = get_unichr(hcodes)
                    if vcodes:
-                        cid2unicode_v[cid] = get_unicode(vcodes)
+                        cid2unichr_v[cid] = get_unichr(vcodes)
            else:
                for code in hcodes:
                    put(hmap, code, cid)
                    put(vmap, code, cid)
                if name.endswith('-UTF8') and hcodes:
-                    code = get_unicode(hcodes)
-                    if cid not in cid2unicode_h:
-                        cid2unicode_h[cid] = code
-                    if cid not in cid2unicode_v:
-                        cid2unicode_v[cid] = code
+                    code = get_unichr(hcodes)
+                    if cid not in cid2unichr_h:
+                        cid2unichr_h[cid] = code
+                    if cid not in cid2unichr_v:
+                        cid2unichr_v[cid] = code

-    return (code2cid, is_vertical, cid2unicode_h, cid2unicode_v)
+    return (code2cid, is_vertical, cid2unichr_h, cid2unichr_v)

 # main
 def main(argv):
@ -128,7 +128,7 @@ def main(argv):

    print >>sys.stderr, 'reading %r...' % src
    fp = file(src)
-    (code2cid, is_vertical, cid2unicode_h, cid2unicode_v) = process_cid2code(fp, check_codecs)
+    (code2cid, is_vertical, cid2unichr_h, cid2unichr_v) = process_cid2code(fp, check_codecs)
    fp.close()

    for (name, cmap) in code2cid.iteritems():
@ -146,8 +146,8 @@ def main(argv):
    fp = file(os.path.join(outdir, fname), 'w')
    print >>fp, '#!/usr/bin/env python'
    print >>fp, '#', fname
-    print >>fp, 'CID2UNICODE_H = %r' % cid2unicode_h
-    print >>fp, 'CID2UNICODE_V = %r' % cid2unicode_v
+    print >>fp, 'CID2UNICHR_H = %r' % cid2unichr_h
+    print >>fp, 'CID2UNICHR_V = %r' % cid2unichr_v
    fp.close()

    return 0
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -99,7 +99,9 @@ def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    doc = PDFDocument()
    fp = file(fname, 'rb')
-    parser = PDFParser(doc, fp)
+    parser = PDFParser(fp)
+    parser.set_document(doc)
+    doc.set_parser(parser)
    doc.initialize(password)
    pages = dict( (page.pageid, pageno) for (pageno,page) in enumerate(doc.get_pages()) )
    for (level,title,dest,a,se) in doc.get_outlines():
@ -119,7 +121,9 @@ def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    doc = PDFDocument()
    fp = file(fname, 'rb')
-    parser = PDFParser(doc, fp)
+    parser = PDFParser(fp)
+    parser.set_document(doc)
+    doc.set_parser(parser)
    doc.initialize(password)
    if objids:
        for objid in objids: