several bugfixes.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@179 1aa58f4a-7d42-0410-adbc-911cccaed67c
2010-02-07 03:14:00 +00:00 · 2010-02-07 03:14:00 +00:00 · 538a605ac0
parent 63033599ce
commit 538a605ac0
6 changed files with 36 additions and 5 deletions
--- a/1
+++ b/1
@ -2,6 +2,7 @@ TODO
 Makefile
 README.txt
 setup.py
+docs/index.html
 pdfminer/Makefile
 pdfminer/__init__.py
 pdfminer/arcfour.py
--- a/docs/index.html
+++ b/docs/index.html
@ -19,7 +19,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Jan 31 11:11:26 JST 2010
+Last Modified: Sun Feb  7 12:13:27 JST 2010
 <!-- hhmts end -->
 </div>

@ -347,6 +347,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2010/02/07: Several bugfixes. Thanks to Hiroshi Manabe.
 <li> 2010/01/31: JPEG image extraction supported. Page rotation bug fixed. 
 <li> 2010/01/04: Python 2.6 warning removal. More doctest conversion.
 <li> 2010/01/01: CMap bug fix. Thanks to Winfried Plappert.
@ -399,7 +400,7 @@ no stream header is displayed for the ease of saving it to a file.
 (This is so-called MIT/X License)
 <p>
 <small>
-Copyright (c) 2004-2009  Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
+Copyright (c) 2004-2010  Yusuke Shinyama &lt;yusuke at cs dot nyu dot edu&gt;
 <p>
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
--- a/pdfminer/cmapdb.py
+++ b/pdfminer/cmapdb.py
@ -81,7 +81,12 @@ class IdentityCMap(object):
        return self.vertical

    def decode(self, code):
-        return unpack('>%dH' % (len(code)/2), code)
+        n = len(code)/2
+        if n:
+            return unpack('>%dH' % n, code)
+        else:
+            return ()
+        
            

 ##  UnicodeMap
@ -363,3 +368,15 @@ class CMapParser(PSStackParser):

        self.push((pos, token))
        return
+
+# test
+def main(argv):
+    """Test driver: run CMapParser over each file named in argv[1:]."""
+    for fname in argv[1:]:
+        fp = file(fname, 'rb')
+        try:
+            CMapParser(FileUnicodeMap(), fp).run()
+        finally:
+            fp.close()
+
+if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@ -52,7 +52,7 @@ class PDFPageAggregator(PDFTextDevice):

    def render_image(self, name, stream):
        assert isinstance(self.cur_item, LTFigure)
-        item = LTImage(name, stream['Filter'],
+        item = LTImage(name, stream.get('Filter'),
                       (stream['Width'], stream['Height']),
                       (self.cur_item.x0, self.cur_item.y0,
                        self.cur_item.x1, self.cur_item.y1),
--- a/pdfminer/pdfparser.py
+++ b/pdfminer/pdfparser.py
@ -47,6 +47,9 @@ class PDFBaseXRef(object):
    def get_trailer(self):
        raise NotImplementedError

+    def get_objids(self):
+        return []
+
    def get_pos(self, objid):
        raise KeyError(objid)

@ -132,6 +135,9 @@ class PDFXRef(PDFBaseXRef):
    def get_trailer(self):
        return self.trailer

+    def get_objids(self):
+        return self.offsets.iterkeys()
+
    def get_pos(self, objid):
        try:
            (genno, pos) = self.offsets[objid]
@ -180,6 +186,12 @@ class PDFXRefStream(PDFBaseXRef):
    def get_trailer(self):
        return self.trailer

+    def get_objids(self):
+        """Yield each object id in self.objid_ranges, start id through end id."""
+        for objid_range in self.objid_ranges:
+            for x in xrange(objid_range.get_start_id(), objid_range.get_end_id()+1):
+                yield x
+
    def get_pos(self, objid):
        offset = 0
        found = False
--- a/tools/dumppdf.py
+++ b/tools/dumppdf.py
@ -86,7 +86,7 @@ def dumptrailers(out, doc):
 def dumpallobjs(out, doc, codec=None):
    out.write('<pdf>')
    for xref in doc.xrefs:
-        for objid in xref.objids():
+        for objid in xref.get_objids():
            try:
                obj = doc.getobj(objid)
                if obj is None: continue