From cb020514816e53b315a49cea2e445b33a5e8c7b9 Mon Sep 17 00:00:00 2001
From: "yusuke.shinyama.dummy"
 <yusuke.shinyama.dummy@1aa58f4a-7d42-0410-adbc-911cccaed67c>
Date: Thu, 3 Jul 2008 15:51:44 +0000
Subject: [PATCH] several bugfixes: tolerate malformed PDFs, fix TL leading sign and encryption-key XOR, add pdf2txt -m option.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@41 1aa58f4a-7d42-0410-adbc-911cccaed67c
---
 README.html         |  6 +++---
 pdflib/pdfinterp.py |  2 +-
 pdflib/pdfparser.py | 11 ++++++-----
 tools/pdf2txt.py    |  9 ++++++---
 4 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/README.html b/README.html
index 40957e2..8772b56 100644
--- a/README.html
+++ b/README.html
@@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }
 <h1>PDFMiner</h1>
 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Jun 29 23:57:07 JST 2008
+Last Modified: Tue Jul  1 00:02:48 JST 2008
 <!-- hhmts end -->
 </div>
 
@@ -54,8 +54,8 @@ http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080629.tar.gz
 
 <P>
 <strong>Svn repository:</strong><br>
-<a href="http://code.google.com/p/pdfminerr/source/browse">
-http://code.google.com/p/pdfminerr/source/browse
+<a href="http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer">
+http://code.google.com/p/pdfminerr/source/browse/trunk/pdfminer
 </a>
 
 <P>
diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py
index 01fde03..d045769 100644
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -856,7 +856,7 @@ class PDFPageInterpreter:
     return
   # setleading
   def do_TL(self, leading):
-    self.textstate.leading = leading
+    self.textstate.leading = -leading
     return
   # selectfont
   def do_Tf(self, fontid, fontsize):
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py
index 494b6db..8a4be0f 100755
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -362,7 +362,7 @@ class PDFXRefStream(object):
     (_,genno) = parser.nexttoken() # ignored
     (_,kwd) = parser.nexttoken()
     (_,stream) = parser.nextobject()
-    if stream.dic['Type'] != LITERAL_XREF:
+    if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF:
       raise PDFNoValidXRef('invalid stream spec.')
     size = stream.dic['Size']
     (start, nobjs) = stream.dic.get('Index', (0,size))
@@ -450,7 +450,7 @@ class PDFDocument:
   def set_root(self, root):
     self.root = root
     self.catalog = dict_value(self.root)
-    if self.catalog['Type'] != LITERAL_CATALOG:
+    if self.catalog.get('Type') != LITERAL_CATALOG:
       if STRICT:
         raise PDFValueError('Catalog not found!')
     self.outline = self.catalog.get('Outline')
@@ -504,7 +504,7 @@ class PDFDocument:
       hash.update(docid[0]) # 3
       x = Arcfour(key).process(hash.digest()[:16]) # 4
       for i in xrange(1,19+1):
-        k = ''.join( chr(c ^ i) for c in key )
+        k = ''.join( chr(ord(c) ^ i) for c in key )
         x = Arcfour(k).process(x)
       u1 = x+x # 32bytes total
     if R == 2:
@@ -599,16 +599,17 @@ class PDFDocument:
       for (k,v) in parent.iteritems():
         if k in self.INHERITABLE_ATTRS and k not in tree:
           tree[k] = v
-      if tree['Type'] == LITERAL_PAGES:
+      if tree.get('Type') == LITERAL_PAGES and 'Kids' in tree:
         if 1 <= debug:
           print >>stderr, 'Pages: Kids=%r' % tree['Kids']
         for c in tree['Kids']:
           for x in search(c, tree):
             yield x
-      elif tree['Type'] == LITERAL_PAGE:
+      elif tree.get('Type') == LITERAL_PAGE:
         if 1 <= debug:
           print >>stderr, 'Page: %r' % tree
         yield tree
+    if 'Pages' not in self.catalog: return
     for (i,tree) in enumerate(search(self.catalog['Pages'], self.catalog)):
       yield PDFPage(self, i, tree)
     return 
diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py
index a802806..33d652b 100755
--- a/tools/pdf2txt.py
+++ b/tools/pdf2txt.py
@@ -205,7 +205,7 @@ class TextConverter(PDFDevice):
 # pdf2txt
 class TextExtractionNotAllowed(RuntimeError): pass
 
-def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=10, html=False, password='', debug=0):
+def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=0, html=False, password='', debug=0):
   device = TextConverter(rsrc, debug=debug)
   doc = PDFDocument(debug=debug)
   fp = file(fname, 'rb')
@@ -238,7 +238,7 @@ def main(argv):
     print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0]
     return 100
   try:
-    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:')
+    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:')
   except getopt.GetoptError:
     return usage()
   if not args: return usage()
@@ -247,6 +247,7 @@ def main(argv):
   cdbcmapdir = 'CDBCMap'
   codec = 'ascii'
   pages = set()
+  maxpages = 0
   html = False
   password = ''
   outfp = stdout
@@ -255,6 +256,7 @@ def main(argv):
     elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
     elif k == '-P': password = v
     elif k == '-c': codec = v
+    elif k == '-m': maxpages = int(v)
     elif k == '-C': cmapdir = v
     elif k == '-D': cdbcmapdir = v
     elif k == '-H': html = True
@@ -263,7 +265,8 @@ def main(argv):
   CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
   rsrc = PDFResourceManager(debug=debug)
   for fname in args:
-    pdf2txt(outfp, rsrc, fname, pages, codec, html=html, password=password, debug=debug)
+    pdf2txt(outfp, rsrc, fname, pages, codec, 
+            maxpages=maxpages, html=html, password=password, debug=debug)
   return
 
 if __name__ == '__main__': sys.exit(main(sys.argv))