From 71be16febe06e6ec8939302e0a2403513c13ed84 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Thu, 25 Dec 2008 15:09:54 +0000 Subject: [PATCH] wordspace handling improved. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@55 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/page.py | 97 ++++++++++++++++++++++++++++++--------------- pdflib/pdf2txt.py | 20 +++++++--- pdflib/pdfinterp.py | 27 +++++++++---- pdflib/pdfparser.py | 2 +- samples/simple1.pdf | 8 +++- tools/dumppdf.py | 2 +- 6 files changed, 108 insertions(+), 48 deletions(-) diff --git a/pdflib/page.py b/pdflib/page.py index bc4e63a..d33dd0c 100644 --- a/pdflib/page.py +++ b/pdflib/page.py @@ -3,7 +3,7 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdfinterp import PDFDevice, PDFUnicodeNotDefined, \ - mult_matrix, apply_matrix + mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix ## PageItem @@ -37,47 +37,73 @@ class FigureItem(PageItem): ## class TextItem(object): - def __init__(self, matrix, font, fontsize, width, text): + SPACE_WIDTH = 0.6 + + def __init__(self, matrix, font, fontsize, charspace, scaling, text): self.matrix = matrix self.font = font - (a,b,c,d,tx,ty) = self.matrix + (_,_,_,_,tx,ty) = self.matrix self.origin = (tx,ty) self.direction = 0 + self.text = '' if not self.font.is_vertical(): + spwidth = int(font.char_width(32) * self.SPACE_WIDTH) # space width self.direction = 1 - (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize)) - self.width = abs(self.width) - (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001)) - (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001)) + (_,descent) = apply_matrix_norm(self.matrix, (0,font.descent * fontsize * .001)) ty += descent - self.bbox = (tx, ty, tx+self.width, ty+self.height) + w = 0 + dx = 0 + prev = ' ' + for t in text: + if isinstance(t, tuple): + if prev != ' ' and spwidth < dx: + self.text += ' ' + (_,char) = t + self.text += char + prev = char + dx = 0 + w += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01 + else: + dx -= t + w += t * fontsize * .001 * scaling * .01 + self.adv = (w, 0) + (w,h) = apply_matrix_norm(self.matrix, (w,fontsize)) + self.bbox = (tx, ty, tx+w, ty+h) else: self.direction = 2 - (self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width)) - self.width = abs(self.width) - (disp,_) = text[0] - (_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001)) - tx -= self.width/2 + disp = 0 + h = 0 + for t in text: + if isinstance(t, tuple): + (disp,char) = t + (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) + self.text += char + h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01 + break + for t in text: + if isinstance(t, tuple): + (_,char) = t + self.text += char + h += (font.char_width(ord(char)) * fontsize * .001 + charspace) * scaling * .01 + self.adv = (0, h) + (w,h) = apply_matrix_norm(self.matrix, (fontsize,h)) + tx -= w/2 ty += disp - self.bbox = (tx, ty+self.height, tx+self.width, ty) - self.text = ''.join( c for (_,c) in text ) - (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize)) - self.fontsize = max(w,h) + self.bbox = (tx, ty+h, tx+w, ty) + self.fontsize = max(apply_matrix_norm(self.matrix, (fontsize,fontsize))) return def __repr__(self): - return ('' % - (self.matrix, self.font, self.fontsize, self.width, self.height, self.text)) + return ('' % + (self.matrix, self.font, self.fontsize, self.bbox, self.text)) -## TextConverter +## PageAggregator ## -class TextConverter(PDFDevice): +class PageAggregator(PDFDevice): - def __init__(self, rsrc, outfp, codec='utf-8', debug=0): + def __init__(self, rsrc, debug=0): PDFDevice.__init__(self, rsrc, debug=debug) - self.outfp = outfp - self.codec = codec self.pageno = 0 self.stack = [] return @@ -109,14 +135,12 @@ class TextConverter(PDFDevice): print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) return None - def render_string(self, textstate, textmatrix, size, seq, ratio=0.6): + def render_string(self, textstate, textmatrix, seq): font = textstate.font - spwidth = int(-font.char_width(32) * ratio) # space width text = [] for x in seq: if isinstance(x, int) or isinstance(x, float): - if not font.is_vertical() and x <= spwidth: - text.append((0, ' ')) + text.append(x) else: chars = font.decode(x) for cid in chars: @@ -125,11 +149,20 @@ class TextConverter(PDFDevice): text.append((font.char_disp(cid), char)) except PDFUnicodeNotDefined, e: (cidcoding, cid) = e.args - s = self.handle_undefined_char(cidcoding, cid) - if s: - text.append(s) + unc = self.handle_undefined_char(cidcoding, cid) + if unc: + text.append(unc) + if cid == 32 and not font.is_multibyte(): + if text: + item = TextItem(mult_matrix(textmatrix, self.ctm), + font, textstate.fontsize, textstate.charspace, textstate.scaling, text) + self.cur_item.add(item) + (dx,dy) = item.adv + dx += textstate.wordspace * textstate.scaling * .01 + textmatrix = translate_matrix(textmatrix, (dx, dy)) + text = [] if text: item = TextItem(mult_matrix(textmatrix, self.ctm), - font, textstate.fontsize, size, text) + font, textstate.fontsize, textstate.charspace, textstate.scaling, text) self.cur_item.add(item) return diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index e648805..f557a21 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -2,11 +2,11 @@ import sys stdout = sys.stdout stderr = sys.stderr -from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect -from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \ +from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect +from pdfinterp import PDFDevice, PDFResourceManager, \ PDFPageInterpreter, PDFUnicodeNotDefined -from pdflib.cmap import CMapDB -from pdflib.page import PageItem, FigureItem, TextItem, TextConverter +from cmap import CMapDB +from page import PageItem, FigureItem, TextItem, PageAggregator def enc(x, codec): @@ -18,6 +18,16 @@ def encprops(props, codec): return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) ) +## TextConverter +class TextConverter(PageAggregator): + + def __init__(self, rsrc, outfp, codec='ascii', debug=0): + PageAggregator.__init__(self, rsrc, debug=debug) + self.outfp = outfp + self.codec = codec + return + + ## SGMLConverter ## class SGMLConverter(TextConverter): @@ -156,7 +166,7 @@ class TagExtractor(PDFDevice): # pdf2txt class TextExtractionNotAllowed(RuntimeError): pass -def convert(rsrc, device, fname, pagenos, maxpages=0, password='', debug=0): +def convert(rsrc, device, fname, pagenos=None, maxpages=0, password='', debug=0): doc = PDFDocument(debug=debug) fp = file(fname, 'rb') parser = PDFParser(doc, fp, debug=debug) diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 05dd9cf..7bf3cfc 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -73,10 +73,16 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): a0*c1+c0*d1, b0*c1+d0*d1, a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) +def translate_matrix((a,b,c,d,e,f), (x,y)): + return (a,b,c,d,e+x,f+y) + def apply_matrix((a,b,c,d,e,f), (x,y)): '''Applies a matrix to coordinates.''' return (a*x+c*y+e, b*x+d*y+f) +def apply_matrix_norm((a,b,c,d,e,f), (x,y)): + return (a*x+c*y, b*x+d*y) + ## Fonts ## @@ -103,6 +109,9 @@ class PDFFont(object): def is_vertical(self): return False + def is_multibyte(self): + return False + def decode(self, bytes): return map(ord, bytes) @@ -372,6 +381,9 @@ class PDFCIDFont(PDFFont): def is_vertical(self): return self.vertical + + def is_multibyte(self): + return True def decode(self, bytes): return self.cmap.decode(bytes) @@ -498,7 +510,7 @@ class PDFDevice(object): def end_figure(self, name): return - def render_string(self, textstate, textmatrix, size, seq): + def render_string(self, textstate, textmatrix, seq): raise NotImplementedError def render_image(self, stream, size, matrix): raise NotImplementedError @@ -928,15 +940,16 @@ class PDFPageInterpreter(object): def do_TJ(self, seq): #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) textstate = self.textstate + matrix = translate_matrix(textstate.matrix, textstate.linematrix) + self.device.render_string(textstate, matrix, seq) font = textstate.font - (a,b,c,d,e,f) = textstate.matrix - (lx,ly) = textstate.linematrix s = ''.join( x for x in seq if isinstance(x, str) ) n = sum( x for x in seq if not isinstance(x, str) ) - w = ((font.string_width(s)-n)/1000.0 * textstate.fontsize + - len(s) * textstate.charspace + - s.count(' ')*textstate.wordspace) * textstate.scaling / 100.0 - self.device.render_string(textstate, (a,b,c,d,e+lx,f+ly), w, seq) + w = (font.string_width(s)-n)*.001 * textstate.fontsize + len(s) * textstate.charspace + if not font.is_multibyte(): + w += s.count(' ')*textstate.wordspace + w *= (textstate.scaling * .01) + (lx,ly) = textstate.linematrix if font.is_vertical(): ly += w else: diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 92e1651..49367c7 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -586,7 +586,7 @@ class PDFDocument(object): self.parser.seek(index) (_,objid1) = self.parser.nexttoken() # objid (_,genno) = self.parser.nexttoken() # genno - assert objid1 == objid, (objid, objid1) + #assert objid1 == objid, (objid, objid1) (_,kwd) = self.parser.nexttoken() if kwd != KEYWORD_OBJ: raise PDFSyntaxError('Invalid object spec: offset=%r' % index) diff --git a/samples/simple1.pdf b/samples/simple1.pdf index 7ebe4b2..fd0e4db 100644 --- a/samples/simple1.pdf +++ b/samples/simple1.pdf @@ -32,11 +32,15 @@ endobj >> endobj 5 0 obj -<< /Length 46 >> +<< /Length 86 >> stream BT /F1 24 Tf -1 0 0 1 100 700 TD +100 600 Td +0 Tw +( Hello World ) Tj +0 100 Td +100 Tw ( Hello World ) Tj ET endstream diff --git a/tools/dumppdf.py b/tools/dumppdf.py index a42a91a..0bb0a4a 100755 --- a/tools/dumppdf.py +++ b/tools/dumppdf.py @@ -89,7 +89,7 @@ def dumpallobjs(out, doc, codec=None): dumpxml(out, obj, codec=codec) out.write('\n\n\n') except: - pass + raise dumptrailers(out, doc) out.write('') return