From fa678ccb98eb99d05c6fb8d213e73bcac9091317 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 16 May 2009 04:16:00 +0000 Subject: [PATCH] clustering bug fixed again git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@102 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/converter.py | 22 ++++++++++++--------- pdflib/layout.py | 48 +++++++++++++++++++++++++-------------------- pdflib/pdfdevice.py | 6 +++--- pdflib/pdfinterp.py | 29 ++++++++++++--------------- pdflib/utils.py | 6 +++--- 5 files changed, 58 insertions(+), 53 deletions(-) diff --git a/pdflib/converter.py b/pdflib/converter.py index 678dec6..aae324a 100644 --- a/pdflib/converter.py +++ b/pdflib/converter.py @@ -1,8 +1,9 @@ #!/usr/bin/env python +import sys from pdfdevice import PDFDevice from pdffont import PDFUnicodeNotDefined from layout import LayoutContainer, LTPage, LTText, LTLine, LTRect, LTFigure, LTTextBox -from utils import mult_matrix, translate_matrix, enc +from utils import mult_matrix, translate_matrix, apply_matrix_pt, enc ## PDFPageAggregator @@ -30,9 +31,9 @@ class PDFPageAggregator(PDFDevice): self.cur_item.group_text(self.cluster_margin) return self.cur_item - def begin_figure(self, name, bbox): + def begin_figure(self, name, bbox, matrix): self.stack.append(self.cur_item) - self.cur_item = LTFigure(name, bbox) + self.cur_item = LTFigure(name, bbox, matrix) return def end_figure(self, _): @@ -47,11 +48,13 @@ class PDFPageAggregator(PDFDevice): print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) return self.undefined_char - def paint_path(self, gstate, matrix, stroke, fill, evenodd, path): + def paint_path(self, gstate, stroke, fill, evenodd, path): shape = ''.join(x[0] for x in path) if shape == 'ml': # horizontal/vertical line (_,x0,y0) = path[0] (_,x1,y1) = path[1] + (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) + (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) if y0 == y1: # horizontal ruler self.cur_item.add(LTLine(gstate.linewidth, 'H', (x0,y0,x1,y1))) 
@@ -64,6 +67,10 @@ class PDFPageAggregator(PDFDevice): (_,x1,y1) = path[1] (_,x2,y2) = path[2] (_,x3,y3) = path[3] + (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0)) + (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1)) + (x2,y2) = apply_matrix_pt(self.ctm, (x2,y2)) + (x3,y3) = apply_matrix_pt(self.ctm, (x3,y3)) if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)): self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2))) @@ -130,9 +137,6 @@ class TagExtractor(PDFDevice): self.tag = None return - def render_image(self, stream, size, matrix): - return - def render_string(self, textstate, textmatrix, seq): font = textstate.font text = '' @@ -204,7 +208,7 @@ class SGMLConverter(PDFConverter): elif isinstance(item, LTRect): self.outfp.write('' % (item.linewidth, item.get_bbox())) elif isinstance(item, LTFigure): - self.outfp.write('
\n' % (item.id, item.get_bbox())) + self.outfp.write('
\n' % (item.id)) for child in item: render(child) self.outfp.write('
\n') @@ -268,7 +272,7 @@ class HTMLConverter(PDFConverter): self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LTLine) or isinstance(item, LTRect): self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) - elif isinstance(item, LayoutContainer): + elif isinstance(item, LTTextBox): self.write_rect('blue', 1, item.x0, self.yoffset-item.y1, item.width, item.height) for child in item: render(child) diff --git a/pdflib/layout.py b/pdflib/layout.py index 25d3673..2705e5b 100644 --- a/pdflib/layout.py +++ b/pdflib/layout.py @@ -150,7 +150,7 @@ class ClusterSet(object): # add(objs): groups text objects if necessary. def add(self, objs): - group = self.klass(objs, self.i) + group = self.klass(self.i, objs) self.i += 1 for obj in objs: if obj in self.clusters: @@ -164,7 +164,16 @@ class ClusterSet(object): r = set(self.clusters.itervalues()) for group in r: group.fixate() - return r + return list(r) + +def group_objs(objs, ratio, klass): + plane = Plane(objs) + cset = ClusterSet(klass) + for obj in objs: + margin = abs(obj.get_margin(ratio)) + neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) + cset.add(neighbors) + return cset.finish() ## LayoutItem @@ -256,16 +265,6 @@ class LayoutContainer(LayoutItem): self.weight = sum( obj.get_weight() for obj in self.objs ) return - def group_objs(self, ratio, klass): - plane = Plane(self.objs) - cset = ClusterSet(klass) - for obj in self.objs: - margin = abs(obj.get_margin(ratio)) - neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) - cset.add(neighbors) - self.objs = cset.finish() - return - def get_weight(self): return self.weight @@ -301,14 +300,6 @@ class LTRect(LayoutItem): return -## LTFigure -## -class LTFigure(LayoutContainer): - - def __repr__(self): - return ('
' % (self.id, self.get_bbox())) - - ## LTText ## class LTText(LayoutItem): @@ -361,6 +352,19 @@ class LTText(LayoutItem): return self.vertical +## LTFigure +## +class LTFigure(LayoutContainer): + + def __init__(self, id, bbox, matrix): + LayoutContainer.__init__(self, id, bbox) + self.matrix = matrix + return + + def __repr__(self): + return ('
' % (self.id, self.get_bbox(), self.matrix)) + + ## LTTextBox ## ## A set of text objects that are grouped within @@ -446,7 +450,9 @@ class LTPage(LayoutContainer): return def group_text(self, ratio): - self.group_objs(ratio, LTTextBox) + textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ] + otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ] + self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs if self.get_direction() == 'H': lines = reorder_vh(self.objs, +1) else: diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index bf341cc..3163624 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -32,14 +32,14 @@ class PDFDevice(object): return def end_page(self, page): return - def begin_figure(self, name, bbox): + def begin_figure(self, name, bbox, matrix): return def end_figure(self, name): return - def paint_path(self, graphicstate, matrix, stroke, fill, evenodd, path): + def paint_path(self, graphicstate, stroke, fill, evenodd, path): return def render_string(self, textstate, textmatrix, seq): return - def render_image(self, stream, size, matrix): + def render_image(self, stream, size): return diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 6bf0b03..611bd93 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -12,7 +12,7 @@ from psparser import PSException, PSTypeError, PSEOF, \ from pdftypes import PDFException, PDFStream, PDFObjRef, \ resolve1, int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value -from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY +from utils import choplist, mult_matrix, translate_matrix, MATRIX_IDENTITY from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfcolor import PDFColorSpace, PREDEFINED_COLORSPACE, \ @@ -424,7 +424,7 @@ class PDFPageInterpreter(object): # stroke 
def do_S(self): - self.device.paint_path(self.graphicstate, self.ctm, True, False, False, self.curpath) + self.device.paint_path(self.graphicstate, True, False, False, self.curpath) self.curpath = [] return # close-and-stroke @@ -434,24 +434,24 @@ class PDFPageInterpreter(object): return # fill def do_f(self): - self.device.paint_path(self.graphicstate, self.ctm, False, True, False, self.curpath) + self.device.paint_path(self.graphicstate, False, True, False, self.curpath) self.curpath = [] return # fill (obsolete) do_F = do_f # fill-even-odd def do_f_a(self): - self.device.paint_path(self.graphicstate, self.ctm, False, True, True, self.curpath) + self.device.paint_path(self.graphicstate, False, True, True, self.curpath) self.curpath = [] return # fill-and-stroke def do_B(self): - self.device.paint_path(self.graphicstate, self.ctm, True, True, False, self.curpath) + self.device.paint_path(self.graphicstate, True, True, False, self.curpath) self.curpath = [] return # fill-and-stroke-even-odd def do_B_a(self): - self.device.paint_path(self.graphicstate, self.ctm, True, True, True, self.curpath) + self.device.paint_path(self.graphicstate, True, True, True, self.curpath) self.curpath = [] return # close-fill-and-stroke @@ -686,20 +686,15 @@ class PDFPageInterpreter(object): subtype = xobj.dic.get('Subtype') if subtype is LITERAL_FORM and 'BBox' in xobj.dic: interpreter = self.dup() - (x0,y0,x1,y1) = list_value(xobj.dic['BBox']) - ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm) - (x0,y0) = apply_matrix(ctm, (x0,y0)) - (x1,y1) = apply_matrix(ctm, (x1,y1)) - bbox = (x0,y0,x1,y1) - self.device.begin_figure(xobjid, bbox) - interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm) + bbox = list_value(xobj.dic['BBox']) + matrix = list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)) + self.device.begin_figure(xobjid, bbox, matrix) + interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], 
ctm=mult_matrix(matrix, self.ctm)) self.device.end_figure(xobjid) elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: - (x0,y0) = apply_matrix(self.ctm, (0,0)) - (x1,y1) = apply_matrix(self.ctm, (1,1)) - self.device.begin_figure(xobjid, (x0,y0,x1,y1)) + self.device.begin_figure(xobjid, (0,0,1,1), MATRIX_IDENTITY) (w,h) = (xobj.dic['Width'], xobj.dic['Height']) - self.device.render_image(xobj, (w,h), self.ctm) + self.device.render_image(xobj, (w,h)) self.device.end_figure(xobjid) else: # unsupported xobject type. diff --git a/pdflib/utils.py b/pdflib/utils.py index c07dd10..ed5afc8 100644 --- a/pdflib/utils.py +++ b/pdflib/utils.py @@ -15,12 +15,12 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): def translate_matrix((a,b,c,d,e,f), (x,y)): return (a,b,c,d,e+x,f+y) -def apply_matrix((a,b,c,d,e,f), (x,y)): - '''Applies a matrix to coordinates.''' +def apply_matrix_pt((a,b,c,d,e,f), (x,y)): + '''Applies a matrix to a point.''' return (a*x+c*y+e, b*x+d*y+f) def apply_matrix_norm((a,b,c,d,e,f), (p,q)): - '''equiv to apply_matrix(M, (p,q)) - apply_matrix(M, (0,0))''' + '''equiv to apply_matrix_pt(M, (p,q)) - apply_matrix_pt(M, (0,0))''' return (a*p+c*q, b*p+d*q)