diff --git a/dumppdf.py b/dumppdf.py index 58def03..6e644f5 100755 --- a/dumppdf.py +++ b/dumppdf.py @@ -73,21 +73,20 @@ def dumpxml(out, obj, codec=None): # dumptrailers def dumptrailers(out, doc): for xref in doc.xrefs: - out.write('\n' % - (xref.objid0, xref.objid1-1)) + out.write('\n') dumpxml(out, xref.trailer) out.write('\n\n\n') return # dumpallobjs -def dumpallobjs(out, doc): +def dumpallobjs(out, doc, codec=None): out.write('') for xref in doc.xrefs: - for objid in xrange(xref.objid0, xref.objid1+1): + for objid in xref.objids(): try: obj = doc.getobj(objid) out.write('\n' % objid) - dumpxml(out, obj) + dumpxml(out, obj, codec=codec) out.write('\n\n\n') except: pass @@ -116,7 +115,7 @@ def dumppdf(outfp, fname, objids, pageids, password='', if page.pageid in pageids: dumpxml(outfp, page.attrs) if dumpall: - dumpallobjs(outfp, doc) + dumpallobjs(outfp, doc, codec=codec) if (not objids) and (not pageids) and (not dumpall): dumptrailers(outfp, doc) fp.close() diff --git a/pdf2txt.py b/pdf2txt.py index 1872d87..efe9949 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -61,7 +61,7 @@ class TextItem: self.matrix = matrix self.font = font (a,b,c,d,tx,ty) = self.matrix - (self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) + (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) self.width = abs(self.width) self.origin = (tx,ty) self.direction = 0 @@ -69,18 +69,20 @@ class TextItem: self.direction = 1 (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001)) (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001)) - self.bbox = (tx, ty+descent, self.width, self.fontsize) + self.bbox = (tx, ty+descent, self.width, self.height) else: self.direction = 2 mindisp = min( d for (d,_) in text ) (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0)) - self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width) + self.bbox = (tx-mindisp, ty+self.width, self.height, self.width) self.text = ''.join( c for (_,c) in text ) + (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize)) + self.fontsize = max(w,h) return def __repr__(self): - return ('' % - (self.matrix, self.font, self.fontsize, self.width, self.text)) + return ('' % + (self.matrix, self.font, self.fontsize, self.width, self.height, self.text)) def dump(self, outfp, codec): def e(x): diff --git a/pdfparser.py b/pdfparser.py index e576ab8..d73d5de 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -283,6 +283,9 @@ class PDFXRef(object): self.offsets = None return + def objids(self): + return self.offsets.keys() + def load(self, parser): while 1: try: @@ -332,7 +335,7 @@ class PDFXRef(object): try: (genno, pos, use) = self.offsets[objid] except KeyError: - raise PDFValueError('object not found: %r' % objid) + raise if use != 'n': if STRICT: raise PDFValueError('unused objid=%r' % objid) @@ -351,6 +354,9 @@ class PDFXRefStream(object): self.fl1 = self.fl2 = self.fl3 = None return + def objids(self): + return range(self.objid0, self.objid1+1) + def load(self, parser): (_,objid) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored @@ -370,7 +376,7 @@ class PDFXRefStream(object): def getpos(self, objid): if objid < self.objid0 or self.objid1 <= objid: - raise IndexError(objid) + raise KeyError(objid) i = self.entlen * (objid-self.objid0) ent = self.data[i:i+self.entlen] f1 = nunpack(ent[:self.fl1], 1) @@ -532,7 +538,7 @@ class PDFDocument: try: (strmid, index) = xref.getpos(objid) break - except IndexError: + except KeyError: pass else: if STRICT: