fix dumppdf and pdf2txt

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@31 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-06-21 17:58:28 +00:00 · 2008-06-21 17:58:28 +00:00 · 3a1ee1992e
parent b86ed8be3c
commit 3a1ee1992e
3 changed files with 21 additions and 14 deletions
--- a/dumppdf.py
+++ b/dumppdf.py
@@ -73,21 +73,20 @@ def dumpxml(out, obj, codec=None):
 # dumptrailers
 def dumptrailers(out, doc):
  for xref in doc.xrefs:
-    out.write('<trailer objid="%d-%d">\n' %
+    out.write('<trailer>\n')
-              (xref.objid0, xref.objid1-1))
    dumpxml(out, xref.trailer)
    out.write('\n</trailer>\n\n')
  return
 # dumpallobjs
-def dumpallobjs(out, doc):
+def dumpallobjs(out, doc, codec=None):
  out.write('<pdf>')
  for xref in doc.xrefs:
-    for objid in xrange(xref.objid0, xref.objid1+1):
+    for objid in xref.objids():
      try:
        obj = doc.getobj(objid)
        out.write('<object id="%d">\n' % objid)
-        dumpxml(out, obj)
+        dumpxml(out, obj, codec=codec)
        out.write('\n</object>\n\n')
      except:
        pass
@@ -116,7 +115,7 @@ def dumppdf(outfp, fname, objids, pageids, password='',
      if page.pageid in pageids:
        dumpxml(outfp, page.attrs)
  if dumpall:
-    dumpallobjs(outfp, doc)
+    dumpallobjs(outfp, doc, codec=codec)
  if (not objids) and (not pageids) and (not dumpall):
    dumptrailers(outfp, doc)
  fp.close()
--- a/pdf2txt.py
+++ b/pdf2txt.py
@@ -61,7 +61,7 @@ class TextItem:
    self.matrix = matrix
    self.font = font
    (a,b,c,d,tx,ty) = self.matrix
-    (self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
+    (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
    self.width = abs(self.width)
    self.origin = (tx,ty)
    self.direction = 0
@@ -69,18 +69,20 @@ class TextItem:
      self.direction = 1
      (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
      (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
-      self.bbox = (tx, ty+descent, self.width, self.fontsize)
+      self.bbox = (tx, ty+descent, self.width, self.height)
    else:
      self.direction = 2
      mindisp = min( d for (d,_) in text )
      (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
-      self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width)
+      self.bbox = (tx-mindisp, ty+self.width, self.height, self.width)
    self.text = ''.join( c for (_,c) in text )
+    (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
+    self.fontsize = max(w,h)
    return
  def __repr__(self):
-    return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' %
+    return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
-            (self.matrix, self.font, self.fontsize, self.width, self.text))
+            (self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
  def dump(self, outfp, codec):
    def e(x):
--- a/pdfparser.py
+++ b/pdfparser.py
@@ -283,6 +283,9 @@ class PDFXRef(object):
    self.offsets = None
    return
+  def objids(self):
+    return self.offsets.keys()
  def load(self, parser):
    while 1:
      try:
@@ -332,7 +335,7 @@ class PDFXRef(object):
    try:
      (genno, pos, use) = self.offsets[objid]
    except KeyError:
-      raise PDFValueError('object not found: %r' % objid)
+      raise
    if use != 'n':
      if STRICT:
        raise PDFValueError('unused objid=%r' % objid)
@@ -351,6 +354,9 @@ class PDFXRefStream(object):
    self.fl1 = self.fl2 = self.fl3 = None
    return
+  def objids(self):
+    return range(self.objid0, self.objid1+1)
  def load(self, parser):
    (_,objid) = parser.nexttoken() # ignored
    (_,genno) = parser.nexttoken() # ignored
@@ -370,7 +376,7 @@ class PDFXRefStream(object):
  def getpos(self, objid):
    if objid < self.objid0 or self.objid1 <= objid:
-      raise IndexError(objid)
+      raise KeyError(objid)
    i = self.entlen * (objid-self.objid0)
    ent = self.data[i:i+self.entlen]
    f1 = nunpack(ent[:self.fl1], 1)
@@ -532,7 +538,7 @@ class PDFDocument:
        try:
          (strmid, index) = xref.getpos(objid)
          break
-        except IndexError:
+        except KeyError:
          pass
      else:
        if STRICT: