diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 48442f3..9480b22 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -27,28 +27,26 @@ class PDFLayoutAnalyzer(PDFTextDevice): (x0,y0) = apply_matrix_pt(ctm, (x0,y0)) (x1,y1) = apply_matrix_pt(ctm, (x1,y1)) mediabox = (0, 0, abs(x0-x1), abs(y0-y1)) - self.cur_item = LTPage(self.pageno, mediabox) + self.cur_item = LTPage(self.pageno, mediabox, laparams=self.laparams) return def end_page(self, page): assert not self.stack assert isinstance(self.cur_item, LTPage) - self.cur_item.fixate() - self.cur_item.analyze(self.laparams) + self.cur_item.finish() self.pageno += 1 self.receive_layout(self.cur_item) return def begin_figure(self, name, bbox, matrix): self.stack.append(self.cur_item) - self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm)) + self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm), laparams=self.laparams) return def end_figure(self, _): fig = self.cur_item assert isinstance(self.cur_item, LTFigure) - self.cur_item.fixate() - self.cur_item.analyze(self.laparams) + self.cur_item.finish() self.cur_item = self.stack.pop() self.cur_item.add(fig) return @@ -175,7 +173,7 @@ class TextConverter(PDFConverter): for child in item: render(child) elif isinstance(item, LTText): - self.write(item.get_text()) + self.write(item.text) if isinstance(item, LTTextBox): self.write('\n') if self.showpageno: @@ -190,17 +188,17 @@ class TextConverter(PDFConverter): class HTMLConverter(PDFConverter): RECT_COLORS = { - 'char': 'green', - 'figure': 'yellow', - 'textline': 'magenta', - 'polygon': 'black', + #'char': 'green', + #'figure': 'yellow', + #'textline': 'magenta', 'textbox': 'cyan', 'textgroup': 'red', + 'polygon': 'black', 'page': 'gray', } TEXT_COLORS = { + 'textbox': 'blue', 'char': 'black', - 'textbox': 'black', } def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, @@ -275,7 +273,7 @@ class HTMLConverter(PDFConverter): item.width*self.scale, item.height*self.scale)) return render(ltpage) - if self.debug and ltpage.layout: + if ltpage.layout: def show_layout(item): if isinstance(item, LTTextGroup): self.write_rect('textgroup', 1, item.x0, item.y1, item.width, item.height) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index ac3ac55..df81d80 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,7 +1,7 @@ #!/usr/bin/env python2 import sys from utils import apply_matrix_pt, get_bound, INF -from utils import bsearch, bbox2str, matrix2str +from utils import bsearch, bbox2str, matrix2str, Plane from pdffont import PDFUnicodeNotDefined @@ -55,8 +55,6 @@ class LTItem(object): (self.__class__.__name__, bbox2str(self.bbox))) def set_bbox(self, (x0,y0,x1,y1)): - if x1 < x0: (x0,x1) = (x1,x0) - if y1 < y0: (y0,y1) = (y1,y0) self.x0 = x0 self.y0 = y0 self.x1 = x1 @@ -65,7 +63,7 @@ class LTItem(object): self.height = y1-y0 self.bbox = (x0, y0, x1, y1) return - + def is_hoverlap(self, obj): assert isinstance(obj, LTItem) return obj.x0 <= self.x1 and self.x0 <= obj.x1 @@ -169,10 +167,7 @@ class LTText(object): def __repr__(self): return ('<%s %r>' % - (self.__class__.__name__, self.get_text())) - - def get_text(self): - return self.text + (self.__class__.__name__, self.text)) ## LTAnon @@ -222,9 +217,13 @@ class LTChar(LTItem, LTText): bur = (self.adv, ty+height) (a,b,c,d,e,f) = self.matrix self.upright = (0 < a*d*scaling and b*c <= 0) - bbox = (apply_matrix_pt(self.matrix, bll) + - apply_matrix_pt(self.matrix, bur)) - LTItem.__init__(self, bbox) + (x0,y0) = apply_matrix_pt(self.matrix, bll) + (x1,y1) = apply_matrix_pt(self.matrix, bur) + if x1 < x0: + (x0,x1) = (x1,x0) + if y1 < y0: + (y0,y1) = (y1,y0) + LTItem.__init__(self, (x0,y0,x1,y1)) if self.font.is_vertical(): self.size = self.width else: @@ -236,11 +235,12 @@ class LTChar(LTItem, LTText): return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' % (self.__class__.__name__, bbox2str(self.bbox), matrix2str(self.matrix), self.font, self.fontsize, - self.adv, self.get_text())) + self.adv, self.text)) else: return '' % self.text def is_compatible(self, obj): + """Returns True if two characters can coexist in the same line.""" return True @@ -248,81 +248,80 @@ class LTChar(LTItem, LTText): ## class LTContainer(LTItem): - def __init__(self, objs=None, bbox=(0,0,0,0)): + def __init__(self, bbox): LTItem.__init__(self, bbox) - if objs: - self._objs = objs[:] - else: - self._objs = [] + self._objs = [] return def __iter__(self): - return iter(self.get_objs()) + return iter(self._objs) def __len__(self): - return len(self.get_objs()) + return len(self._objs) def add(self, obj): self._objs.append(obj) return - def merge(self, container): - self._objs.extend(container._objs) + def extend(self, objs): + for obj in objs: + self.add(obj) return - def get_objs(self): - return self._objs - # fixate(): determines its boundery. - def fixate(self): - if not self.width and self._objs: - (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) - for obj in self._objs: - bx0 = min(bx0, obj.x0) - by0 = min(by0, obj.y0) - bx1 = max(bx1, obj.x1) - by1 = max(by1, obj.y1) - self.set_bbox((bx0, by0, bx1, by1)) +## LTExpandableContainer +## +class LTExpandableContainer(LTContainer): + + def __init__(self): + LTContainer.__init__(self, (+INF,+INF,-INF,-INF)) return + def add(self, obj): + LTContainer.add(self, obj) + self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0), + max(self.x1, obj.x1), max(self.y1, obj.y1))) + return + + def finish(self): + return self + ## LTTextLine ## -class LTTextLine(LTContainer, LTText): +class LTTextLine(LTExpandableContainer, LTText): - def __init__(self, laparams=None): - self.laparams = laparams - LTContainer.__init__(self) + def __init__(self, word_margin): + LTExpandableContainer.__init__(self) + self.word_margin = word_margin return def __repr__(self): return ('<%s %s %r>' % - (self.__class__.__name__, bbox2str(self.bbox), - self.get_text())) + (self.__class__.__name__, bbox2str(self.bbox), self.text)) - def get_text(self): - return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) ) + def finish(self): + LTContainer.add(self, LTAnon('\n')) + self.text = ''.join( obj.text for obj in self if isinstance(obj, LTText) ) + return LTExpandableContainer.finish(self) def find_neighbors(self, plane, ratio): raise NotImplementedError class LTTextLineHorizontal(LTTextLine): - def get_objs(self): - if self.laparams is None: - for obj in self._objs: - yield obj - return - word_margin = self.laparams.word_margin - x1 = INF - for obj in csort(self._objs, key=lambda obj: obj.x0): - if isinstance(obj, LTChar) and word_margin: - margin = word_margin * obj.width - if x1 < obj.x0-margin: - yield LTAnon(' ') - yield obj - x1 = obj.x1 - yield LTAnon('\n') + def __init__(self, word_margin): + LTTextLine.__init__(self, word_margin) + self._x1 = +INF + return + + def add(self, obj): + if isinstance(obj, LTChar) and self.word_margin: + margin = self.word_margin * obj.width + if self._x1 < obj.x0-margin: + LTContainer.add(self, LTAnon(' ')) + self._x1 = obj.x1 + LTTextLine.add(self, obj) return def find_neighbors(self, plane, ratio): @@ -332,23 +331,20 @@ class LTTextLineHorizontal(LTTextLine): class LTTextLineVertical(LTTextLine): - def get_objs(self): - if self.laparams is None: - for obj in self._objs: - yield obj - return - word_margin = self.laparams.word_margin - y0 = -INF - for obj in csort(self._objs, key=lambda obj: -obj.y1): - if isinstance(obj, LTChar) and word_margin: - margin = word_margin * obj.height - if obj.y1+margin < y0: - yield LTAnon(' ') - yield obj - y0 = obj.y0 - yield LTAnon('\n') + def __init__(self, word_margin): + LTTextLine.__init__(self, word_margin) + self._y0 = -INF return + def add(self, obj): + if isinstance(obj, LTChar) and self.word_margin: + margin = self.word_margin * obj.height + if obj.y1+margin < self._y0: + LTContainer.add(self, LTAnon(' ')) + self._y0 = obj.y0 + LTTextLine.add(self, obj) + return + def find_neighbors(self, plane, ratio): w = ratio*self.width objs = plane.find((self.x0-w, self.y0, self.x1+w, self.y1)) @@ -360,110 +356,84 @@ class LTTextLineVertical(LTTextLine): ## A set of text objects that are grouped within ## a certain rectangular area. ## -class LTTextBox(LTContainer): +class LTTextBox(LTExpandableContainer): - def __init__(self, objs): - LTContainer.__init__(self, objs=objs) + def __init__(self): + LTExpandableContainer.__init__(self) self.index = None return def __repr__(self): return ('<%s(%s) %s %r...>' % (self.__class__.__name__, self.index, - bbox2str(self.bbox), self.get_text()[:20])) + bbox2str(self.bbox), self.text[:20])) - def get_text(self): - return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) ) + def finish(self): + self.text = ''.join( obj.text for obj in self if isinstance(obj, LTTextLine) ) + return LTExpandableContainer.finish(self) class LTTextBoxHorizontal(LTTextBox): - def get_objs(self): - return csort(self._objs, key=lambda obj: -obj.y1) + def finish(self): + self._objs = csort(self._objs, key=lambda obj: -obj.y1) + return LTTextBox.finish(self) class LTTextBoxVertical(LTTextBox): - def get_objs(self): - return csort(self._objs, key=lambda obj: -obj.x1) + def finish(self): + self._objs = csort(self._objs, key=lambda obj: -obj.x1) + return LTTextBox.finish(self) ## LTTextGroup ## -class LTTextGroup(LTContainer): +class LTTextGroup(LTExpandableContainer): def __init__(self, objs): - LTContainer.__init__(self, objs=objs) - LTContainer.fixate(self) + LTExpandableContainer.__init__(self) + self.extend(objs) return class LTTextGroupLRTB(LTTextGroup): - def get_objs(self): + def finish(self): # reorder the objects from top-left to bottom-right. - return csort(self._objs, key=lambda obj: obj.x0+obj.x1-(obj.y0+obj.y1)) + self._objs = csort(self._objs, key=lambda obj: obj.x0+obj.x1-(obj.y0+obj.y1)) + return LTTextGroup.finish(self) class LTTextGroupTBRL(LTTextGroup): - def get_objs(self): + def finish(self): # reorder the objects from top-right to bottom-left. - return csort(self._objs, key=lambda obj: -(obj.x0+obj.x1)-(obj.y0+obj.y1)) + self._objs = csort(self._objs, key=lambda obj: -(obj.x0+obj.x1)-(obj.y0+obj.y1)) + return LTTextGroup.finish(self) -## Plane +## LTLayoutContainer ## -## A data structure for objects placed on a plane. -## Can efficiently find objects in a certain rectangular area. -## It maintains two parallel lists of objects, each of -## which is sorted by its x or y coordinate. -## -class Plane(object): +class LTLayoutContainer(LTContainer): - def __init__(self, objs): - self.xobjs = [] - self.yobjs = [] - self.idxs = dict( (obj,i) for (i,obj) in enumerate(objs) ) - for obj in objs: - self.place(obj) - self.xobjs.sort() - self.yobjs.sort() + def __init__(self, bbox, laparams=None): + LTContainer.__init__(self, bbox) + self.laparams = laparams + self.layout = None return - - # place(obj): place an object in a certain area. - def place(self, obj): - assert isinstance(obj, LTItem) - self.xobjs.append((obj.x0, obj)) - self.xobjs.append((obj.x1, obj)) - self.yobjs.append((obj.y0, obj)) - self.yobjs.append((obj.y1, obj)) - return - - # find(): finds objects that are in a certain area. - def find(self, (x0,y0,x1,y1)): - i0 = bsearch(self.xobjs, x0)[0] - i1 = bsearch(self.xobjs, x1)[1] - xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] ) - i0 = bsearch(self.yobjs, y0)[0] - i1 = bsearch(self.yobjs, y1)[1] - yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] ) - xobjs.intersection_update(yobjs) - return sorted(xobjs, key=lambda obj: self.idxs[obj]) - - -## LTAnalyzer -## -class LTAnalyzer(LTContainer): - - def analyze(self, laparams=None): + + def finish(self): """Perform the layout analysis.""" - if laparams is None: return + if self.laparams is None: return # textobjs is a list of LTChar objects, i.e. # it has all the individual characters in the page. - (textobjs, otherobjs) = self.get_textobjs(self._objs, laparams) + (textobjs, otherobjs) = self.get_textobjs(self._objs) if not textobjs: return - textlines = list(self.get_textlines(textobjs, laparams)) - assert sum( len(line._objs) for line in textlines ) == len(textobjs) - textboxes = list(self.get_textboxes(textlines, laparams)) - assert sum( len(box._objs) for box in textboxes ) == len(textlines) - top = self.group_textboxes(textboxes, laparams) + textlines = list(self.get_textlines(textobjs, + self.laparams.line_overlap, + self.laparams.char_margin, + self.laparams.word_margin)) + assert len(textobjs) <= sum( len(line._objs) for line in textlines ) + textboxes = list(self.get_textboxes(textlines, self.laparams.line_margin)) + assert len(textlines) == sum( len(box._objs) for box in textboxes ) + top = self.group_textboxes(textboxes) def assign_index(obj, i): if isinstance(obj, LTTextBox): obj.index = i @@ -476,9 +446,9 @@ class LTAnalyzer(LTContainer): textboxes.sort(key=lambda box:box.index) self._objs = textboxes + otherobjs self.layout = top - return + return self - def get_textobjs(self, objs, laparams): + def get_textobjs(self, objs): """Split all the objects in the page into text-related objects and others.""" textobjs = [] otherobjs = [] @@ -489,15 +459,15 @@ class LTAnalyzer(LTContainer): otherobjs.append(obj) return (textobjs, otherobjs) - def get_textlines(self, objs, laparams): + def get_textlines(self, objs, line_overlap, char_margin, word_margin): obj0 = None line = None for obj1 in objs: if obj0 is not None: k = 0 if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and - min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and - obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin): + min(obj0.height, obj1.height) * line_overlap < obj0.voverlap(obj1) and + obj0.hdistance(obj1) < max(obj0.width, obj1.width) * char_margin): # obj0 and obj1 is horizontally aligned: # # +------+ - - - @@ -510,8 +480,8 @@ class LTAnalyzer(LTContainer): # (char_margin) k |= 1 if (obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and - min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and - obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin): + min(obj0.width, obj1.width) * line_overlap < obj0.hoverlap(obj1) and + obj0.vdistance(obj1) < max(obj0.height, obj1.height) * char_margin): # obj0 and obj1 is vertically aligned: # # +------+ @@ -531,59 +501,59 @@ class LTAnalyzer(LTContainer): (k & 2 and isinstance(line, LTTextLineVertical)) ): line.add(obj1) elif line is not None: - line.fixate() - yield line + yield line.finish() line = None else: if k == 2: - line = LTTextLineVertical(laparams) + line = LTTextLineVertical(word_margin) line.add(obj0) line.add(obj1) elif k == 1: - line = LTTextLineHorizontal(laparams) + line = LTTextLineHorizontal(word_margin) line.add(obj0) line.add(obj1) else: - line = LTTextLineHorizontal(laparams) + line = LTTextLineHorizontal(word_margin) line.add(obj0) - line.fixate() - yield line + yield line.finish() line = None obj0 = obj1 if line is None: - line = LTTextLineHorizontal(laparams) + line = LTTextLineHorizontal(word_margin) line.add(obj0) - line.fixate() - yield line + yield line.finish() return - def get_textboxes(self, lines, laparams): + def get_textboxes(self, lines, line_margin): plane = Plane(lines) - groups = {} for line in lines: - neighbors = line.find_neighbors(plane, laparams.line_margin) + plane.add(line) + plane.finish() + boxes = {} + for line in lines: + neighbors = line.find_neighbors(plane, line_margin) assert line in neighbors, line - members = neighbors[:] + members = [] for obj1 in neighbors: - if obj1 in groups: - members.extend(groups.pop(obj1)) - members = list(uniq(members)) + members.append(obj1) + if obj1 in boxes: + members.extend(boxes.pop(obj1)) if isinstance(line, LTTextLineHorizontal): - group = LTTextBoxHorizontal(members) + box = LTTextBoxHorizontal() else: - group = LTTextBoxVertical(members) - for obj in members: - groups[obj] = group + box = LTTextBoxVertical() + for obj in uniq(members): + box.add(obj) + boxes[obj] = box done = set() for line in lines: - group = groups[line] - if group in done: continue - done.add(group) - group.fixate() - yield group + box = boxes[line] + if box in done: continue + done.add(box) + yield box.finish() return - def group_textboxes(self, textboxes, laparams): + def group_textboxes(self, boxes): def dist(obj1, obj2): """A distance function between two TextBoxes. @@ -599,43 +569,44 @@ class LTAnalyzer(LTContainer): return ((max(obj1.x1,obj2.x1) - min(obj1.x0,obj2.x0)) * (max(obj1.y1,obj2.y1) - min(obj1.y0,obj2.y0)) - (obj1.width*obj1.height + obj2.width*obj2.height)) - textboxes = textboxes[:] + boxes = boxes[:] # XXX this is slow when there're many textboxes. - while 2 <= len(textboxes): + while 2 <= len(boxes): mindist = INF minpair = None - textboxes = csort(textboxes, key=lambda obj: obj.width*obj.height) - for i in xrange(len(textboxes)): - for j in xrange(i+1, len(textboxes)): - (obj1, obj2) = (textboxes[i], textboxes[j]) + boxes = csort(boxes, key=lambda obj: obj.width*obj.height) + for i in xrange(len(boxes)): + for j in xrange(i+1, len(boxes)): + (obj1, obj2) = (boxes[i], boxes[j]) d = dist(obj1, obj2) if d < mindist: mindist = d minpair = (obj1, obj2) assert minpair (obj1, obj2) = minpair - textboxes.remove(obj1) - textboxes.remove(obj2) - if isinstance(obj1, LTTextBoxHorizontal): - group = LTTextGroupLRTB([obj1, obj2]) - else: + boxes.remove(obj1) + boxes.remove(obj2) + if (isinstance(obj1, LTTextBoxVertical) or + isinstance(obj1, LTTextGroupTBRL)): group = LTTextGroupTBRL([obj1, obj2]) - textboxes.append(group) - assert len(textboxes) == 1 - return textboxes.pop() + else: + group = LTTextGroupLRTB([obj1, obj2]) + boxes.append(group.finish()) + assert len(boxes) == 1 + return boxes.pop() ## LTFigure ## -class LTFigure(LTAnalyzer): +class LTFigure(LTLayoutContainer): - def __init__(self, name, bbox, matrix): + def __init__(self, name, bbox, matrix, laparams=None): self.name = name self.matrix = matrix (x,y,w,h) = bbox bbox = get_bound( apply_matrix_pt(matrix, (p,q)) for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) - LTAnalyzer.__init__(self, bbox=bbox) + LTLayoutContainer.__init__(self, bbox, laparams=laparams) return def __repr__(self): @@ -643,21 +614,19 @@ class LTFigure(LTAnalyzer): (self.__class__.__name__, self.name, bbox2str(self.bbox), matrix2str(self.matrix))) - def analyze(self, laparams=None): - if laparams is not None and laparams.all_texts: - LTAnalyzer.analyze(self, laparams=laparams) - return + def finish(self): + if self.laparams is None or not self.laparams.all_texts: return + return LTLayoutContainer.finish(self) ## LTPage ## -class LTPage(LTAnalyzer): +class LTPage(LTLayoutContainer): - def __init__(self, pageid, bbox, rotate=0): - LTAnalyzer.__init__(self, bbox=bbox) + def __init__(self, pageid, bbox, rotate=0, laparams=None): + LTLayoutContainer.__init__(self, bbox, laparams=laparams) self.pageid = pageid self.rotate = rotate - self.layout = None return def __repr__(self): diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 8652c71..627e701 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -182,6 +182,54 @@ class ObjIdRange(object): return self.nobjs +## Plane +## +## A data structure for objects placed on a plane. +## Can efficiently find objects in a certain rectangular area. +## It maintains two parallel lists of objects, each of +## which is sorted by its x or y coordinate. +## +class Plane(object): + + def __init__(self, objs): + self._idxs = {} + self._xobjs = [] + self._yobjs = [] + return + + def __repr__(self): + return ('' % list(self)) + + def __iter__(self): + return self._idxs.iterkeys() + + # add(obj): place an object in a certain area. + def add(self, obj): + self._idxs[obj] = len(self._idxs) + self._xobjs.append((obj.x0, obj)) + self._xobjs.append((obj.x1, obj)) + self._yobjs.append((obj.y0, obj)) + self._yobjs.append((obj.y1, obj)) + return + + # finish() + def finish(self): + self._xobjs.sort() + self._yobjs.sort() + return + + # find(): finds objects that are in a certain area. + def find(self, (x0,y0,x1,y1)): + i0 = bsearch(self._xobjs, x0)[0] + i1 = bsearch(self._xobjs, x1)[1] + xobjs = set( obj for (_,obj) in self._xobjs[i0:i1] ) + i0 = bsearch(self._yobjs, y0)[0] + i1 = bsearch(self._yobjs, y1)[1] + yobjs = set( obj for (_,obj) in self._yobjs[i0:i1] ) + xobjs.intersection_update(yobjs) + return sorted(xobjs, key=lambda obj: self._idxs[obj]) + + # create_bmp def create_bmp(data, bits, width, height): info = pack('