fix dumppdf and pdf2txt

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@31 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-06-21 17:58:28 +00:00
parent b86ed8be3c
commit 3a1ee1992e
3 changed files with 21 additions and 14 deletions

View File

@ -73,21 +73,20 @@ def dumpxml(out, obj, codec=None):
# dumptrailers
def dumptrailers(out, doc):
for xref in doc.xrefs:
out.write('<trailer objid="%d-%d">\n' %
(xref.objid0, xref.objid1-1))
out.write('<trailer>\n')
dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n')
return
# dumpallobjs
def dumpallobjs(out, doc):
def dumpallobjs(out, doc, codec=None):
out.write('<pdf>')
for xref in doc.xrefs:
for objid in xrange(xref.objid0, xref.objid1+1):
for objid in xref.objids():
try:
obj = doc.getobj(objid)
out.write('<object id="%d">\n' % objid)
dumpxml(out, obj)
dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n')
except:
pass
@ -116,7 +115,7 @@ def dumppdf(outfp, fname, objids, pageids, password='',
if page.pageid in pageids:
dumpxml(outfp, page.attrs)
if dumpall:
dumpallobjs(outfp, doc)
dumpallobjs(outfp, doc, codec=codec)
if (not objids) and (not pageids) and (not dumpall):
dumptrailers(outfp, doc)
fp.close()

View File

@ -61,7 +61,7 @@ class TextItem:
self.matrix = matrix
self.font = font
(a,b,c,d,tx,ty) = self.matrix
(self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
self.width = abs(self.width)
self.origin = (tx,ty)
self.direction = 0
@ -69,18 +69,20 @@ class TextItem:
self.direction = 1
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
self.bbox = (tx, ty+descent, self.width, self.fontsize)
self.bbox = (tx, ty+descent, self.width, self.height)
else:
self.direction = 2
mindisp = min( d for (d,_) in text )
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width)
self.bbox = (tx-mindisp, ty+self.width, self.height, self.width)
self.text = ''.join( c for (_,c) in text )
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
self.fontsize = max(w,h)
return
def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' %
(self.matrix, self.font, self.fontsize, self.width, self.text))
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
def dump(self, outfp, codec):
def e(x):

View File

@ -283,6 +283,9 @@ class PDFXRef(object):
self.offsets = None
return
def objids(self):
return self.offsets.keys()
def load(self, parser):
while 1:
try:
@ -332,7 +335,7 @@ class PDFXRef(object):
try:
(genno, pos, use) = self.offsets[objid]
except KeyError:
raise PDFValueError('object not found: %r' % objid)
raise
if use != 'n':
if STRICT:
raise PDFValueError('unused objid=%r' % objid)
@ -351,6 +354,9 @@ class PDFXRefStream(object):
self.fl1 = self.fl2 = self.fl3 = None
return
def objids(self):
return range(self.objid0, self.objid1+1)
def load(self, parser):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
@ -370,7 +376,7 @@ class PDFXRefStream(object):
def getpos(self, objid):
if objid < self.objid0 or self.objid1 <= objid:
raise IndexError(objid)
raise KeyError(objid)
i = self.entlen * (objid-self.objid0)
ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
@ -532,7 +538,7 @@ class PDFDocument:
try:
(strmid, index) = xref.getpos(objid)
break
except IndexError:
except KeyError:
pass
else:
if STRICT: