diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index b963443..c7ba6b5 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -329,11 +329,11 @@ class HTMLConverter(PDFConverter):
         return
 
     def receive_layout(self, ltpage):
-        def show_layout(item):
+        def show_group(item):
             if isinstance(item, LTTextGroup):
                 self.place_border('textgroup', 1, item)
                 for child in item:
-                    show_layout(child)
+                    show_group(child)
             return
         def render(item):
             if isinstance(item, LTPage):
@@ -345,8 +345,9 @@ class HTMLConverter(PDFConverter):
                 self.write('Page %s\n' % (item.pageid, item.pageid))
                 for child in item:
                     render(child)
-                if item.layout:
-                    show_layout(item.layout)
+                if item.groups is not None:
+                    for group in item.groups:
+                        show_group(group)
             elif isinstance(item, LTCurve):
                 self.place_border('curve', 1, item)
             elif isinstance(item, LTFigure):
@@ -419,14 +420,14 @@ class XMLConverter(PDFConverter):
         return
 
     def receive_layout(self, ltpage):
-        def show_layout(item):
+        def show_group(item):
             if isinstance(item, LTTextBox):
                 self.outfp.write('\n' %
                                  (item.index, bbox2str(item.bbox)))
             elif isinstance(item, LTTextGroup):
                 self.outfp.write('\n' % bbox2str(item.bbox))
                 for child in item:
-                    show_layout(child)
+                    show_group(child)
                 self.outfp.write('\n')
             return
         def render(item):
@@ -435,9 +436,10 @@ class XMLConverter(PDFConverter):
                                  (item.pageid, bbox2str(item.bbox), item.rotate))
                 for child in item:
                     render(child)
-                if item.layout:
+                if item.groups is not None:
                     self.outfp.write('\n')
-                    show_layout(item.layout)
+                    for group in item.groups:
+                        show_group(group)
                     self.outfp.write('\n')
                 self.outfp.write('\n')
             elif isinstance(item, LTLine):
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index db4a33c..4096330 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -4,6 +4,24 @@ from utils import INF, Plane, get_bound, uniq, csort, fsplit
 from utils import bbox2str, matrix2str, apply_matrix_pt
 
 
+## IndexAssigner
+##
+class IndexAssigner(object):
+
+    def __init__(self, index=0):
+        self.index = index
+        return
+
+    def run(self, obj):
+        if isinstance(obj, LTTextBox):
+            obj.index = self.index
+            self.index += 1
+        elif isinstance(obj, LTTextGroup):
+            for x in obj:
+                self.run(x)
+        return
+
+
 ## LAParams
 ##
 class LAParams(object):
@@ -438,7 +456,7 @@ class LTLayoutContainer(LTContainer):
 
     def __init__(self, bbox):
         LTContainer.__init__(self, bbox)
-        self.layout = None
+        self.groups = None
         return
 
     def analyze(self, laparams):
@@ -455,20 +473,14 @@ class LTLayoutContainer(LTContainer):
             obj.analyze(laparams)
         textboxes = list(self.get_textboxes(laparams, textlines))
         assert len(textlines) == sum( len(box._objs) for box in textboxes )
-        top = self.group_textboxes(laparams, textboxes)
-        top.analyze(laparams)
-        def assign_index(obj, i):
-            if isinstance(obj, LTTextBox):
-                obj.index = i
-                i += 1
-            elif isinstance(obj, LTTextGroup):
-                for x in obj:
-                    i = assign_index(x, i)
-            return i
-        assign_index(top, 0)
+        groups = self.group_textboxes(laparams, textboxes)
+        assigner = IndexAssigner()
+        for group in groups:
+            group.analyze(laparams)
+            assigner.run(group)
         textboxes.sort(key=lambda box:box.index)
         self._objs = textboxes + otherobjs + empties
-        self.layout = top
+        self.groups = groups
         return
 
     def get_textlines(self, laparams, objs):
@@ -614,7 +626,7 @@ class LTLayoutContainer(LTContainer):
             dists.sort()
             plane.add(group)
         assert len(plane) == 1
-        return list(plane)[0]
+        return list(plane)
 
 
 ## LTFigure