fix dumppdf and pdf2txt

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@31 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-06-21 17:58:28 +00:00
parent b86ed8be3c
commit 3a1ee1992e
3 changed files with 21 additions and 14 deletions

View File

@ -73,21 +73,20 @@ def dumpxml(out, obj, codec=None):
# dumptrailers # dumptrailers
def dumptrailers(out, doc): def dumptrailers(out, doc):
for xref in doc.xrefs: for xref in doc.xrefs:
out.write('<trailer objid="%d-%d">\n' % out.write('<trailer>\n')
(xref.objid0, xref.objid1-1))
dumpxml(out, xref.trailer) dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n') out.write('\n</trailer>\n\n')
return return
# dumpallobjs # dumpallobjs
def dumpallobjs(out, doc): def dumpallobjs(out, doc, codec=None):
out.write('<pdf>') out.write('<pdf>')
for xref in doc.xrefs: for xref in doc.xrefs:
for objid in xrange(xref.objid0, xref.objid1+1): for objid in xref.objids():
try: try:
obj = doc.getobj(objid) obj = doc.getobj(objid)
out.write('<object id="%d">\n' % objid) out.write('<object id="%d">\n' % objid)
dumpxml(out, obj) dumpxml(out, obj, codec=codec)
out.write('\n</object>\n\n') out.write('\n</object>\n\n')
except: except:
pass pass
@ -116,7 +115,7 @@ def dumppdf(outfp, fname, objids, pageids, password='',
if page.pageid in pageids: if page.pageid in pageids:
dumpxml(outfp, page.attrs) dumpxml(outfp, page.attrs)
if dumpall: if dumpall:
dumpallobjs(outfp, doc) dumpallobjs(outfp, doc, codec=codec)
if (not objids) and (not pageids) and (not dumpall): if (not objids) and (not pageids) and (not dumpall):
dumptrailers(outfp, doc) dumptrailers(outfp, doc)
fp.close() fp.close()

View File

@ -61,7 +61,7 @@ class TextItem:
self.matrix = matrix self.matrix = matrix
self.font = font self.font = font
(a,b,c,d,tx,ty) = self.matrix (a,b,c,d,tx,ty) = self.matrix
(self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
self.width = abs(self.width) self.width = abs(self.width)
self.origin = (tx,ty) self.origin = (tx,ty)
self.direction = 0 self.direction = 0
@ -69,18 +69,20 @@ class TextItem:
self.direction = 1 self.direction = 1
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001)) (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001)) (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
self.bbox = (tx, ty+descent, self.width, self.fontsize) self.bbox = (tx, ty+descent, self.width, self.height)
else: else:
self.direction = 2 self.direction = 2
mindisp = min( d for (d,_) in text ) mindisp = min( d for (d,_) in text )
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0)) (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width) self.bbox = (tx-mindisp, ty+self.width, self.height, self.width)
self.text = ''.join( c for (_,c) in text ) self.text = ''.join( c for (_,c) in text )
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
self.fontsize = max(w,h)
return return
def __repr__(self): def __repr__(self):
return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' % return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
(self.matrix, self.font, self.fontsize, self.width, self.text)) (self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
def dump(self, outfp, codec): def dump(self, outfp, codec):
def e(x): def e(x):

View File

@ -283,6 +283,9 @@ class PDFXRef(object):
self.offsets = None self.offsets = None
return return
def objids(self):
return self.offsets.keys()
def load(self, parser): def load(self, parser):
while 1: while 1:
try: try:
@ -332,7 +335,7 @@ class PDFXRef(object):
try: try:
(genno, pos, use) = self.offsets[objid] (genno, pos, use) = self.offsets[objid]
except KeyError: except KeyError:
raise PDFValueError('object not found: %r' % objid) raise
if use != 'n': if use != 'n':
if STRICT: if STRICT:
raise PDFValueError('unused objid=%r' % objid) raise PDFValueError('unused objid=%r' % objid)
@ -351,6 +354,9 @@ class PDFXRefStream(object):
self.fl1 = self.fl2 = self.fl3 = None self.fl1 = self.fl2 = self.fl3 = None
return return
def objids(self):
return range(self.objid0, self.objid1+1)
def load(self, parser): def load(self, parser):
(_,objid) = parser.nexttoken() # ignored (_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored
@ -370,7 +376,7 @@ class PDFXRefStream(object):
def getpos(self, objid): def getpos(self, objid):
if objid < self.objid0 or self.objid1 <= objid: if objid < self.objid0 or self.objid1 <= objid:
raise IndexError(objid) raise KeyError(objid)
i = self.entlen * (objid-self.objid0) i = self.entlen * (objid-self.objid0)
ent = self.data[i:i+self.entlen] ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1) f1 = nunpack(ent[:self.fl1], 1)
@ -532,7 +538,7 @@ class PDFDocument:
try: try:
(strmid, index) = xref.getpos(objid) (strmid, index) = xref.getpos(objid)
break break
except IndexError: except KeyError:
pass pass
else: else:
if STRICT: if STRICT: