From 585dd59b70113b4d02131b8b3e339a9696ba3088 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Thu, 23 Jul 2009 14:03:58 +0000 Subject: [PATCH] git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@124 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 218 ++++++++++++++++++------------------------ pdfminer/layout.py | 5 +- pdfminer/pdfdevice.py | 59 +++++++++++- pdfminer/pdfinterp.py | 20 +--- 4 files changed, 155 insertions(+), 147 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 9df3ace..24e37b1 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,131 +1,11 @@ #!/usr/bin/env python import sys -from pdfminer.pdfdevice import PDFDevice +from pdfminer.pdfdevice import PDFDevice, PDFTextDevice from pdfminer.pdffont import PDFUnicodeNotDefined from pdfminer.layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextItem, LTTextBox, LTTextLine -from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc +from pdfminer.utils import apply_matrix_pt, enc -## PDFPageAggregator -## -class PDFPageAggregator(PDFDevice): - - def __init__(self, rsrc, pageno=1, laparams=None): - PDFDevice.__init__(self, rsrc) - self.laparams = laparams - self.undefined_char = '?' - self.pageno = pageno - self.stack = [] - return - - def begin_page(self, page): - self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate) - return - - def end_page(self, _): - assert not self.stack - assert isinstance(self.cur_item, LTPage) - self.cur_item.fixate() - if self.laparams: - self.cur_item.analyze_layout(self.laparams) - self.pageno += 1 - return self.cur_item - - def begin_figure(self, name, bbox, matrix): - self.stack.append(self.cur_item) - self.cur_item = LTFigure(name, bbox, matrix) - return - - def end_figure(self, _): - fig = self.cur_item - self.cur_item.fixate() - self.cur_item = self.stack.pop() - self.cur_item.add(fig) - return - - def handle_undefined_char(self, cidcoding, cid): - if self.debug: - print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) - return self.undefined_char - - def paint_path(self, gstate, stroke, fill, evenodd, path): - shape = ''.join(x[0] for x in path) - if shape == 'ml': # horizontal/vertical line - (_,x0,y0) = path[0] - (_,x1,y1) = path[1] - (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) - (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) - if y0 == y1: - # horizontal ruler - self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) - elif x0 == x1: - # vertical ruler - self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1))) - elif shape == 'mlllh': - # rectangle - (_,x0,y0) = path[0] - (_,x1,y1) = path[1] - (_,x2,y2) = path[2] - (_,x3,y3) = path[3] - (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) - (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) - (x2,y2) = apply_matrix_pt(self.ctm, (x2,y2)) - (x3,y3) = apply_matrix_pt(self.ctm, (x3,y2)) - if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or - (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): - self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) - return - - def render_chars(self, textmatrix, textstate, chars): - if not chars: return (0, 0) - item = LTTextItem(textmatrix, textstate.font, textstate.fontsize, - textstate.charspace, textstate.scaling, chars) - self.cur_item.add(item) - return item.adv - - def render_string(self, textstate, textmatrix, seq): - font = textstate.font - textmatrix = mult_matrix(textmatrix, self.ctm) - scaling = textstate.scaling * .01 - dxscale = scaling / (font.hscale*1000) * .01 - wordspace = textstate.wordspace * scaling - chars = [] - for x in seq: - if isinstance(x, int) or isinstance(x, float): - (dx,dy) = self.render_chars(textmatrix, textstate, chars) - textmatrix = translate_matrix(textmatrix, (dx-x*dxscale, dy)) - chars = [] - else: - for cid in font.decode(x): - try: - char = font.to_unicode(cid) - except PDFUnicodeNotDefined, e: - (cidcoding, cid) = e.args - char = self.handle_undefined_char(cidcoding, cid) - chars.append((char, cid)) - if cid == 32 and textstate.wordspace and not font.is_multibyte(): - (dx,dy) = self.render_chars(textmatrix, textstate, chars) - textmatrix = translate_matrix(textmatrix, (dx+wordspace, dy)) - chars = [] - self.render_chars(textmatrix, textstate, chars) - return - - -## PDFConverter -## -class PDFConverter(PDFPageAggregator): - - def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None): - PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams) - self.outfp = outfp - self.codec = codec - return - - def write(self, text): - self.outfp.write(enc(text, self.codec)) - return - - ## TagExtractor ## class TagExtractor(PDFDevice): @@ -138,12 +18,12 @@ class TagExtractor(PDFDevice): self.tag = None return - def render_string(self, textstate, textmatrix, seq): + def render_string(self, textstate, seq): font = textstate.font text = '' - for x in seq: - if not isinstance(x, str): continue - chars = font.decode(x) + for obj in seq: + if not isinstance(obj, str): continue + chars = font.decode(obj) for cid in chars: try: char = font.to_unicode(cid) @@ -186,6 +66,92 @@ class TagExtractor(PDFDevice): return +## PDFPageAggregator +## +class PDFPageAggregator(PDFTextDevice): + + def __init__(self, rsrc, pageno=1, laparams=None): + PDFTextDevice.__init__(self, rsrc) + self.laparams = laparams + self.pageno = pageno + self.stack = [] + return + + def begin_page(self, page): + self.cur_item = LTPage(self.pageno, page.mediabox, page.rotate) + return + + def end_page(self, _): + assert not self.stack + assert isinstance(self.cur_item, LTPage) + self.cur_item.fixate() + if self.laparams: + self.cur_item.analyze_layout(self.laparams) + self.pageno += 1 + return self.cur_item + + def begin_figure(self, name, bbox, matrix): + self.stack.append(self.cur_item) + self.cur_item = LTFigure(name, bbox, matrix) + return + + def end_figure(self, _): + fig = self.cur_item + self.cur_item.fixate() + self.cur_item = self.stack.pop() + self.cur_item.add(fig) + return + + def paint_path(self, gstate, stroke, fill, evenodd, path): + shape = ''.join(x[0] for x in path) + if shape == 'ml': # horizontal/vertical line + (_,x0,y0) = path[0] + (_,x1,y1) = path[1] + (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) + (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) + if y0 == y1: + # horizontal ruler + self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) + elif x0 == x1: + # vertical ruler + self.cur_item.add(LTLine(gstate.linewidth, 'V', (x0,y0,x1,y1))) + elif shape == 'mlllh': + # rectangle + (_,x0,y0) = path[0] + (_,x1,y1) = path[1] + (_,x2,y2) = path[2] + (_,x3,y3) = path[3] + (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) + (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) + (x2,y2) = apply_matrix_pt(self.ctm, (x2,y2)) + (x3,y3) = apply_matrix_pt(self.ctm, (x3,y2)) + if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or + (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): + self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) + return + + def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): + if not chars: return (0, 0) + item = LTTextItem(matrix, font, fontsize, charspace, scaling, chars) + self.cur_item.add(item) + return item.adv + + +## PDFConverter +## +class PDFConverter(PDFPageAggregator): + + def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, laparams=None): + PDFPageAggregator.__init__(self, rsrc, pageno=pageno, laparams=laparams) + self.outfp = outfp + self.codec = codec + return + + def write(self, text): + self.outfp.write(enc(text, self.codec)) + return + + ## SGMLConverter ## class SGMLConverter(PDFConverter): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 375142c..b7d7549 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -270,7 +270,7 @@ class LTTextItem(LayoutItem, LTText): self.vertical = self.font.is_vertical() self.text = ''.join( char for (char,_) in chars ) adv = sum( font.char_width(cid) for (_,cid) in chars ) - adv = (adv * fontsize + len(chars)*charspace) * scaling * .01 + adv = (adv * fontsize + len(chars)*charspace) * scaling size = (font.get_ascent() - font.get_descent()) * fontsize if not self.vertical: # horizontal text @@ -410,6 +410,7 @@ def tsort(objs, f): go = dict( (obj,[]) for obj in objs ) for obj1 in objs: for obj2 in objs: + if obj1 is obj2: continue if f(obj1, obj2): # obj1 -> obj2 go[obj1].append(obj2) gi[obj2].append(obj1) @@ -478,7 +479,7 @@ class LTPage(LayoutContainer): elif obj1.voverlap(obj2): return obj1.x1 < obj2.x0 else: - return obj2.y1 < obj1.y1 and obj1.x1 < obj2.x0 + return obj1.x0 < obj2.x0 and obj2.y1 < obj1.y1 lines = ClusterSet.build(textobjs, laparams.char_margin, 0, (lambda id,objs: LTTextLine(id, objs, 'H', laparams.word_margin)), hline) diff --git a/pdfminer/pdfdevice.py b/pdfminer/pdfdevice.py index 3163624..497b4e8 100644 --- a/pdfminer/pdfdevice.py +++ b/pdfminer/pdfdevice.py @@ -1,5 +1,8 @@ #!/usr/bin/env python +from pdfminer.utils import mult_matrix, translate_matrix + + ## PDFDevice ## class PDFDevice(object): @@ -39,7 +42,59 @@ class PDFDevice(object): def paint_path(self, graphicstate, stroke, fill, evenodd, path): return - def render_string(self, textstate, textmatrix, seq): - return def render_image(self, stream, size): return + def render_string(self, textstate, seq): + return + + +## PDFTextDevice +## +class PDFTextDevice(PDFDevice): + + def handle_undefined_char(self, cidcoding, cid): + if self.debug: + print >>sys.stderr, 'undefined: %r, %r' % (cidcoding, cid) + return '?' + + def render_chars(self, matrix, font, fontsize, charspace, scaling, chars): + return (0, 0) + + def render_string(self, textstate, seq): + matrix = mult_matrix(textstate.matrix, self.ctm) + font = textstate.font + fontsize = textstate.fontsize + charspace = textstate.charspace + scaling = textstate.scaling * .01 + wordspace = textstate.wordspace * scaling + dxscale = scaling / (font.hscale*1000) * .01 + chars = [] + (x,y) = textstate.linematrix + for obj in seq: + if isinstance(obj, int) or isinstance(obj, float): + (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, + fontsize, charspace, scaling, chars) + x += dx-obj*dxscale + y += dy + chars = [] + else: + for cid in font.decode(obj): + try: + char = font.to_unicode(cid) + except PDFUnicodeNotDefined, e: + (cidcoding, cid) = e.args + char = self.handle_undefined_char(cidcoding, cid) + chars.append((char, cid)) + if cid == 32 and textstate.wordspace and not font.is_multibyte(): + (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, + fontsize, charspace, scaling, chars) + x += dx + wordspace + y += dy + chars = [] + if chars: + (dx,dy) = self.render_chars(translate_matrix(matrix, (x,y)), font, + fontsize, charspace, scaling, chars) + x += dx + y += dy + textstate.linematrix = (x,y) + return diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index 8d1dca0..967a62d 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -49,6 +49,8 @@ class PDFTextState(object): self.render = 0 self.rise = 0 self.reset() + # self.matrix is set + # self.linematrix is set return def __repr__(self): @@ -630,23 +632,7 @@ class PDFPageInterpreter(object): # show-pos def do_TJ(self, seq): #print >>stderr, 'TJ(%r): %r' % (seq,self.textstate) - textstate = self.textstate - textmatrix = translate_matrix(textstate.matrix, textstate.linematrix) - self.device.render_string(textstate, textmatrix, seq) - font = textstate.font - s = ''.join( x for x in seq if isinstance(x, str) ) - w = ((font.string_width(s) - sum( x for x in seq if not isinstance(x, str) )*.001) * textstate.fontsize + - len(s) * textstate.charspace) - (lx,ly) = textstate.linematrix - if font.is_vertical(): - # advance vertically - ly += w * (textstate.scaling * .01) - else: - # advance horizontally - if not font.is_multibyte(): - w += s.count(' ')*textstate.wordspace - lx += w * (textstate.scaling * .01) - textstate.linematrix = (lx,ly) + self.device.render_string(self.textstate, seq) return # show def do_Tj(self, s):