fix dumppdf and pdf2txt
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@31 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
b86ed8be3c
commit
3a1ee1992e
11
dumppdf.py
11
dumppdf.py
|
@ -73,21 +73,20 @@ def dumpxml(out, obj, codec=None):
|
|||
# dumptrailers
|
||||
def dumptrailers(out, doc):
|
||||
for xref in doc.xrefs:
|
||||
out.write('<trailer objid="%d-%d">\n' %
|
||||
(xref.objid0, xref.objid1-1))
|
||||
out.write('<trailer>\n')
|
||||
dumpxml(out, xref.trailer)
|
||||
out.write('\n</trailer>\n\n')
|
||||
return
|
||||
|
||||
# dumpallobjs
|
||||
def dumpallobjs(out, doc):
|
||||
def dumpallobjs(out, doc, codec=None):
|
||||
out.write('<pdf>')
|
||||
for xref in doc.xrefs:
|
||||
for objid in xrange(xref.objid0, xref.objid1+1):
|
||||
for objid in xref.objids():
|
||||
try:
|
||||
obj = doc.getobj(objid)
|
||||
out.write('<object id="%d">\n' % objid)
|
||||
dumpxml(out, obj)
|
||||
dumpxml(out, obj, codec=codec)
|
||||
out.write('\n</object>\n\n')
|
||||
except:
|
||||
pass
|
||||
|
@ -116,7 +115,7 @@ def dumppdf(outfp, fname, objids, pageids, password='',
|
|||
if page.pageid in pageids:
|
||||
dumpxml(outfp, page.attrs)
|
||||
if dumpall:
|
||||
dumpallobjs(outfp, doc)
|
||||
dumpallobjs(outfp, doc, codec=codec)
|
||||
if (not objids) and (not pageids) and (not dumpall):
|
||||
dumptrailers(outfp, doc)
|
||||
fp.close()
|
||||
|
|
12
pdf2txt.py
12
pdf2txt.py
|
@ -61,7 +61,7 @@ class TextItem:
|
|||
self.matrix = matrix
|
||||
self.font = font
|
||||
(a,b,c,d,tx,ty) = self.matrix
|
||||
(self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
||||
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
||||
self.width = abs(self.width)
|
||||
self.origin = (tx,ty)
|
||||
self.direction = 0
|
||||
|
@ -69,18 +69,20 @@ class TextItem:
|
|||
self.direction = 1
|
||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
||||
self.bbox = (tx, ty+descent, self.width, self.fontsize)
|
||||
self.bbox = (tx, ty+descent, self.width, self.height)
|
||||
else:
|
||||
self.direction = 2
|
||||
mindisp = min( d for (d,_) in text )
|
||||
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
|
||||
self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width)
|
||||
self.bbox = (tx-mindisp, ty+self.width, self.height, self.width)
|
||||
self.text = ''.join( c for (_,c) in text )
|
||||
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
|
||||
self.fontsize = max(w,h)
|
||||
return
|
||||
|
||||
def __repr__(self):
|
||||
return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' %
|
||||
(self.matrix, self.font, self.fontsize, self.width, self.text))
|
||||
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
|
||||
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
|
||||
|
||||
def dump(self, outfp, codec):
|
||||
def e(x):
|
||||
|
|
12
pdfparser.py
12
pdfparser.py
|
@ -283,6 +283,9 @@ class PDFXRef(object):
|
|||
self.offsets = None
|
||||
return
|
||||
|
||||
def objids(self):
|
||||
return self.offsets.keys()
|
||||
|
||||
def load(self, parser):
|
||||
while 1:
|
||||
try:
|
||||
|
@ -332,7 +335,7 @@ class PDFXRef(object):
|
|||
try:
|
||||
(genno, pos, use) = self.offsets[objid]
|
||||
except KeyError:
|
||||
raise PDFValueError('object not found: %r' % objid)
|
||||
raise
|
||||
if use != 'n':
|
||||
if STRICT:
|
||||
raise PDFValueError('unused objid=%r' % objid)
|
||||
|
@ -351,6 +354,9 @@ class PDFXRefStream(object):
|
|||
self.fl1 = self.fl2 = self.fl3 = None
|
||||
return
|
||||
|
||||
def objids(self):
|
||||
return range(self.objid0, self.objid1+1)
|
||||
|
||||
def load(self, parser):
|
||||
(_,objid) = parser.nexttoken() # ignored
|
||||
(_,genno) = parser.nexttoken() # ignored
|
||||
|
@ -370,7 +376,7 @@ class PDFXRefStream(object):
|
|||
|
||||
def getpos(self, objid):
|
||||
if objid < self.objid0 or self.objid1 <= objid:
|
||||
raise IndexError(objid)
|
||||
raise KeyError(objid)
|
||||
i = self.entlen * (objid-self.objid0)
|
||||
ent = self.data[i:i+self.entlen]
|
||||
f1 = nunpack(ent[:self.fl1], 1)
|
||||
|
@ -532,7 +538,7 @@ class PDFDocument:
|
|||
try:
|
||||
(strmid, index) = xref.getpos(objid)
|
||||
break
|
||||
except IndexError:
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
if STRICT:
|
||||
|
|
Loading…
Reference in New Issue