diff --git a/dumppdf.py b/dumppdf.py index 6e644f5..c86cf3a 100755 --- a/dumppdf.py +++ b/dumppdf.py @@ -143,8 +143,8 @@ def main(argv): outfp = stdout for (k, v) in opts: if k == '-d': debug += 1 - elif k == '-i': objids.append(int(v)) - elif k == '-p': pageids.add(int(v)) + elif k == '-i': objids.extend( int(x) for x in v.split(',') ) + elif k == '-p': pageids.update( int(x) for x in v.split(',') ) elif k == '-P': password = v elif k == '-a': dumpall = True elif k == '-r': codec = 'raw' diff --git a/pdf2txt.py b/pdf2txt.py index d4832bb..129108a 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -178,7 +178,8 @@ class TextConverter(PDFDevice): (wmode, x*scale, (offset-y)*scale, item.fontsize*scale)) outfp.write(enc(item.text, codec)) outfp.write('\n') - outfp.write('\n') + outfp.write('\n' % codec) + outfp.write('\n') for page in self.pages: (x0,y0,x1,y1) = page.bbox offset += y1 diff --git a/pdfinterp.py b/pdfinterp.py index 15bc330..01fde03 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -593,23 +593,22 @@ class PDFPageInterpreter: def __init__(self): self.font = None self.fontsize = 0 - self.reset() - return - def __repr__(self): - return ('' % - (self.font, self.fontsize, self.matrix, - self.charspace, self.wordspace, self.scaling, self.leading, - self.render, self.rise)) - def reset(self): self.charspace = 0 self.wordspace = 0 self.scaling = 100 self.leading = 0 self.render = 0 self.rise = 0 - # + self.reset() + return + def __repr__(self): + return ('' % + (self.font, self.fontsize, self.charspace, self.wordspace, + self.scaling, self.leading, self.render, self.rise, + self.matrix, self.linematrix)) + def reset(self): self.matrix = MATRIX_IDENTITY self.linematrix = (0, 0) return @@ -881,15 +880,17 @@ class PDFPageInterpreter: # text-move def do_Td(self, tx, ty): (a,b,c,d,e,f) = self.textstate.matrix - self.textstate.matrix = (a,b,c,d,e+tx,f+ty) + self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f) self.textstate.linematrix = (0, 0) + #print >>stderr, 'Td(%r,%r): %r' % (tx,ty,self.textstate) return # text-move def do_TD(self, tx, ty): (a,b,c,d,e,f) = self.textstate.matrix - self.textstate.matrix = (a,b,c,d,e+tx,f+ty) - self.textstate.leading = -ty + self.textstate.matrix = (a,b,c,d,tx*a+ty*c+e,tx*b+ty*d+f) + self.textstate.leading = ty self.textstate.linematrix = (0, 0) + #print >>stderr, 'TD(%r,%r): %r' % (tx,ty,self.textstate) return # textmatrix def do_Tm(self, a,b,c,d,e,f): @@ -899,12 +900,13 @@ class PDFPageInterpreter: # nextline def do_T_a(self): (a,b,c,d,e,f) = self.textstate.matrix - self.textstate.matrix = (a,b,c,d,e,f+self.textstate.leading) + self.textstate.matrix = (a,b,c,d,self.textstate.leading*c+e,self.textstate.leading*d+f) self.textstate.linematrix = (0, 0) return # show-pos def do_TJ(self, seq): + #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) textstate = self.textstate font = textstate.font (a,b,c,d,e,f) = textstate.matrix