fix dumppdf and pdf2txt
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@31 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
b86ed8be3c
commit
3a1ee1992e
11
dumppdf.py
11
dumppdf.py
|
@ -73,21 +73,20 @@ def dumpxml(out, obj, codec=None):
|
||||||
# dumptrailers
|
# dumptrailers
|
||||||
def dumptrailers(out, doc):
|
def dumptrailers(out, doc):
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
out.write('<trailer objid="%d-%d">\n' %
|
out.write('<trailer>\n')
|
||||||
(xref.objid0, xref.objid1-1))
|
|
||||||
dumpxml(out, xref.trailer)
|
dumpxml(out, xref.trailer)
|
||||||
out.write('\n</trailer>\n\n')
|
out.write('\n</trailer>\n\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
# dumpallobjs
|
# dumpallobjs
|
||||||
def dumpallobjs(out, doc):
|
def dumpallobjs(out, doc, codec=None):
|
||||||
out.write('<pdf>')
|
out.write('<pdf>')
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
for objid in xrange(xref.objid0, xref.objid1+1):
|
for objid in xref.objids():
|
||||||
try:
|
try:
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
out.write('<object id="%d">\n' % objid)
|
out.write('<object id="%d">\n' % objid)
|
||||||
dumpxml(out, obj)
|
dumpxml(out, obj, codec=codec)
|
||||||
out.write('\n</object>\n\n')
|
out.write('\n</object>\n\n')
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
@ -116,7 +115,7 @@ def dumppdf(outfp, fname, objids, pageids, password='',
|
||||||
if page.pageid in pageids:
|
if page.pageid in pageids:
|
||||||
dumpxml(outfp, page.attrs)
|
dumpxml(outfp, page.attrs)
|
||||||
if dumpall:
|
if dumpall:
|
||||||
dumpallobjs(outfp, doc)
|
dumpallobjs(outfp, doc, codec=codec)
|
||||||
if (not objids) and (not pageids) and (not dumpall):
|
if (not objids) and (not pageids) and (not dumpall):
|
||||||
dumptrailers(outfp, doc)
|
dumptrailers(outfp, doc)
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
12
pdf2txt.py
12
pdf2txt.py
|
@ -61,7 +61,7 @@ class TextItem:
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.font = font
|
self.font = font
|
||||||
(a,b,c,d,tx,ty) = self.matrix
|
(a,b,c,d,tx,ty) = self.matrix
|
||||||
(self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
||||||
self.width = abs(self.width)
|
self.width = abs(self.width)
|
||||||
self.origin = (tx,ty)
|
self.origin = (tx,ty)
|
||||||
self.direction = 0
|
self.direction = 0
|
||||||
|
@ -69,18 +69,20 @@ class TextItem:
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
||||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
||||||
self.bbox = (tx, ty+descent, self.width, self.fontsize)
|
self.bbox = (tx, ty+descent, self.width, self.height)
|
||||||
else:
|
else:
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
mindisp = min( d for (d,_) in text )
|
mindisp = min( d for (d,_) in text )
|
||||||
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
|
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
|
||||||
self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width)
|
self.bbox = (tx-mindisp, ty+self.width, self.height, self.width)
|
||||||
self.text = ''.join( c for (_,c) in text )
|
self.text = ''.join( c for (_,c) in text )
|
||||||
|
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
|
||||||
|
self.fontsize = max(w,h)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' %
|
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
|
||||||
(self.matrix, self.font, self.fontsize, self.width, self.text))
|
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
|
||||||
|
|
||||||
def dump(self, outfp, codec):
|
def dump(self, outfp, codec):
|
||||||
def e(x):
|
def e(x):
|
||||||
|
|
12
pdfparser.py
12
pdfparser.py
|
@ -283,6 +283,9 @@ class PDFXRef(object):
|
||||||
self.offsets = None
|
self.offsets = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def objids(self):
|
||||||
|
return self.offsets.keys()
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
|
@ -332,7 +335,7 @@ class PDFXRef(object):
|
||||||
try:
|
try:
|
||||||
(genno, pos, use) = self.offsets[objid]
|
(genno, pos, use) = self.offsets[objid]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise PDFValueError('object not found: %r' % objid)
|
raise
|
||||||
if use != 'n':
|
if use != 'n':
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('unused objid=%r' % objid)
|
raise PDFValueError('unused objid=%r' % objid)
|
||||||
|
@ -351,6 +354,9 @@ class PDFXRefStream(object):
|
||||||
self.fl1 = self.fl2 = self.fl3 = None
|
self.fl1 = self.fl2 = self.fl3 = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def objids(self):
|
||||||
|
return range(self.objid0, self.objid1+1)
|
||||||
|
|
||||||
def load(self, parser):
|
def load(self, parser):
|
||||||
(_,objid) = parser.nexttoken() # ignored
|
(_,objid) = parser.nexttoken() # ignored
|
||||||
(_,genno) = parser.nexttoken() # ignored
|
(_,genno) = parser.nexttoken() # ignored
|
||||||
|
@ -370,7 +376,7 @@ class PDFXRefStream(object):
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
if objid < self.objid0 or self.objid1 <= objid:
|
if objid < self.objid0 or self.objid1 <= objid:
|
||||||
raise IndexError(objid)
|
raise KeyError(objid)
|
||||||
i = self.entlen * (objid-self.objid0)
|
i = self.entlen * (objid-self.objid0)
|
||||||
ent = self.data[i:i+self.entlen]
|
ent = self.data[i:i+self.entlen]
|
||||||
f1 = nunpack(ent[:self.fl1], 1)
|
f1 = nunpack(ent[:self.fl1], 1)
|
||||||
|
@ -532,7 +538,7 @@ class PDFDocument:
|
||||||
try:
|
try:
|
||||||
(strmid, index) = xref.getpos(objid)
|
(strmid, index) = xref.getpos(objid)
|
||||||
break
|
break
|
||||||
except IndexError:
|
except KeyError:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
|
|
Loading…
Reference in New Issue