From 3305c07ba2f2c4e610161b30b1e7349fefdd8618 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 17 Oct 2010 05:13:39 +0000 Subject: [PATCH] layout analysis improved git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@245 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/cmapdb.py | 2 +- pdfminer/converter.py | 13 ++-- pdfminer/layout.py | 147 +++++++++++++++++++++--------------------- pdfminer/utils.py | 12 ++++ samples/Makefile | 3 + 5 files changed, 97 insertions(+), 80 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index e6120eb..358f691 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -121,7 +121,7 @@ class FileCMap(CMap): return '' % self.attrs.get('CMapName') def is_vertical(self): - return self.attrs.get('WMode', 0) + return self.attrs.get('WMode', 0) != 0 def set_attr(self, k, v): self.attrs[k] = v diff --git a/pdfminer/converter.py b/pdfminer/converter.py index cf63b35..e6224b2 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -170,11 +170,11 @@ class TextConverter(PDFConverter): def receive_layout(self, ltpage): def render(item): - if isinstance(item, LTText): - self.write(item.text) - elif isinstance(item, LTContainer): + if isinstance(item, LTContainer): for child in item: render(child) + elif isinstance(item, LTText): + self.write(item.get_text()) if isinstance(item, LTTextBox): self.write('\n') if self.showpageno: @@ -231,20 +231,21 @@ class HTMLConverter(PDFConverter): elif isinstance(item, LTChar): self.write_text(item.text, item.x0, item.y1, item.get_size()) if self.debug: - self.write_rect('red', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('green', 1, item.x0, item.y1, item.width, item.height) elif isinstance(item, LTPolygon): self.write_rect('black', 1, item.x0, item.y1, item.width, item.height) elif isinstance(item, LTTextLine): + self.write_rect('magenta', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) elif isinstance(item, LTTextBox): - self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('cyan', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) if self.debug: self.write_text(str(item.index+1), item.x0, item.y1, 20) elif isinstance(item, LTFigure): - self.write_rect('green', 1, item.x0, item.y1, item.width, item.height) + self.write_rect('yellow', 1, item.x0, item.y1, item.width, item.height) for child in item: render(child) elif isinstance(item, LTImage): diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 7ee60df..bc93ff0 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,24 +1,10 @@ #!/usr/bin/env python import sys -from sys import maxint as INF -from utils import apply_matrix_pt +from utils import apply_matrix_pt, get_bound, INF from utils import bsearch, bbox2str, matrix2str from pdffont import PDFUnicodeNotDefined - -## get_bounds -## -def get_bounds(pts): - """Compute a minimal rectangle that covers all the points.""" - (x0, y0, x1, y1) = (INF, INF, -INF, -INF) - for (x,y) in pts: - x0 = min(x0, x) - y0 = min(y0, y) - x1 = max(x1, x) - y1 = max(y1, y) - return (x0,y0,x1,y1) - def uniq(objs): done = set() for obj in objs: @@ -39,7 +25,7 @@ class LAParams(object): def __init__(self, writing_mode='lr-tb', line_overlap=0.5, - char_margin=3.0, + char_margin=2.0, line_margin=0.5, word_margin=0.1, all_texts=False): @@ -52,8 +38,8 @@ class LAParams(object): return def __repr__(self): - return ('' % - (self.writing_mode, self.char_margin, self.line_margin, self.word_margin, self.all_texts)) + return ('' % + (self.char_margin, self.line_margin, self.word_margin, self.all_texts)) ## LTItem @@ -65,7 +51,8 @@ class LTItem(object): return def __repr__(self): - return ('' % bbox2str(self.bbox)) + return ('<%s %s>' % + (self.__class__.__name__, bbox2str(self.bbox))) def set_bbox(self, (x0,y0,x1,y1)): if x1 < x0: (x0,x1) = (x1,x0) @@ -123,7 +110,7 @@ class LTPolygon(LTItem): def __init__(self, linewidth, pts): self.pts = pts self.linewidth = linewidth - LTItem.__init__(self, get_bounds(pts)) + LTItem.__init__(self, get_bound(pts)) return def get_pts(self): @@ -167,7 +154,9 @@ class LTImage(LTItem): def __repr__(self): (w,h) = self.srcsize - return '' % (self.name, w, h) + return ('<%s(%s) %s %dx%d>' % + (self.__class__.__name__, self.name, + bbox2str(self.bbox), w, h)) ## LTText @@ -179,10 +168,11 @@ class LTText(object): return def __repr__(self): - return '' % self.text + return ('<%s %r>' % + (self.__class__.__name__, self.get_text())) - def is_upright(self): - return True + def get_text(self): + return self.text ## LTAnon @@ -239,20 +229,18 @@ class LTChar(LTItem, LTText): def __repr__(self): if self.debug: - return ('' % - (matrix2str(self.matrix), self.font, self.fontsize, - bbox2str(self.bbox), self.adv, self.text)) + return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' % + (self.__class__.__name__, bbox2str(self.bbox), + matrix2str(self.matrix), self.font, self.fontsize, + self.adv, self.get_text())) else: return '' % self.text def get_size(self): return max(self.width, self.height) - def is_vertical(self): - return self.font.is_vertical() - - def is_upright(self): - return self.upright + def is_compatible(self, obj): + return True ## LTContainer @@ -267,9 +255,6 @@ class LTContainer(LTItem): self._objs = [] return - def __repr__(self): - return ('' % bbox2str(self.bbox)) - def __iter__(self): return iter(self.get_objs()) @@ -302,15 +287,17 @@ class LTContainer(LTItem): ## LTTextLine ## -class LTTextLine(LTContainer): +class LTTextLine(LTContainer, LTText): - def __init__(self, word_margin=0): - self.word_margin = word_margin + def __init__(self, laparams=None): + self.laparams = laparams LTContainer.__init__(self) return def __repr__(self): - return ('' % bbox2str(self.bbox)) + return ('<%s %s %r>' % + (self.__class__.__name__, bbox2str(self.bbox), + self.get_text())) def get_text(self): return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) ) @@ -320,14 +307,16 @@ class LTTextLine(LTContainer): class LTTextLineHorizontal(LTTextLine): - def __repr__(self): - return ('' % bbox2str(self.bbox)) - def get_objs(self): + if self.laparams is None: + for obj in self._objs: + yield obj + return + word_margin = self.laparams.word_margin x1 = INF for obj in csort(self._objs, key=lambda obj: obj.x0): - if isinstance(obj, LTChar) and self.word_margin: - margin = self.word_margin * obj.width + if isinstance(obj, LTChar) and word_margin: + margin = word_margin * obj.width if x1 < obj.x0-margin: yield LTAnon(' ') yield obj @@ -342,14 +331,16 @@ class LTTextLineHorizontal(LTTextLine): class LTTextLineVertical(LTTextLine): - def __repr__(self): - return ('' % bbox2str(self.bbox)) - def get_objs(self): + if self.laparams is None: + for obj in self._objs: + yield obj + return + word_margin = self.laparams.word_margin y0 = -INF for obj in csort(self._objs, key=lambda obj: -obj.y1): - if isinstance(obj, LTChar) and self.word_margin: - margin = self.word_margin * obj.height + if isinstance(obj, LTChar) and word_margin: + margin = word_margin * obj.height if obj.y1+margin < y0: yield LTAnon(' ') yield obj @@ -376,7 +367,9 @@ class LTTextBox(LTContainer): return def __repr__(self): - return ('' % (self.index, bbox2str(self.bbox), self.get_text()[:20])) + return ('<%s(%s) %s %r...>' % + (self.__class__.__name__, self.index, + bbox2str(self.bbox), self.get_text()[:20])) def get_text(self): return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) ) @@ -489,7 +482,7 @@ class LTAnalyzer(LTContainer): textobjs = [] otherobjs = [] for obj in objs: - if isinstance(obj, LTText) and obj.is_upright(): + if isinstance(obj, LTChar): textobjs.append(obj) else: otherobjs.append(obj) @@ -499,11 +492,9 @@ class LTAnalyzer(LTContainer): obj0 = None line = None for obj1 in objs: - if obj0 is None: - obj0 = obj1 - else: + if obj0 is not None: k = 0 - if (obj0.is_voverlap(obj1) and + if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin): # obj0 and obj1 is horizontally aligned: @@ -517,7 +508,7 @@ class LTAnalyzer(LTContainer): # |<--->| # (char_margin) k |= 1 - if (obj0.is_hoverlap(obj1) and + if (obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin): # obj0 and obj1 is vertically aligned: @@ -538,22 +529,29 @@ class LTAnalyzer(LTContainer): if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or (k & 2 and isinstance(line, LTTextLineVertical)) ): line.add(obj1) - elif line is None: - if k == 2: - line = LTTextLineVertical(laparams.word_margin) - else: - line = LTTextLineHorizontal(laparams.word_margin) - line.add(obj0) - line.add(obj1) - else: + elif line is not None: line.fixate() yield line line = None - obj0 = obj1 + else: + if k == 2: + line = LTTextLineVertical(laparams) + line.add(obj0) + line.add(obj1) + elif k == 1: + line = LTTextLineHorizontal(laparams) + line.add(obj0) + line.add(obj1) + else: + line = LTTextLineHorizontal(laparams) + line.add(obj0) + line.fixate() + yield line + line = None + obj0 = obj1 if line is None: - line = LTTextLineHorizontal(laparams.word_margin) - if obj0 is not None: - line.add(obj0) + line = LTTextLineHorizontal(laparams) + line.add(obj0) line.fixate() yield line return @@ -633,14 +631,15 @@ class LTFigure(LTAnalyzer): self.name = name self.matrix = matrix (x,y,w,h) = bbox - bbox = get_bounds( apply_matrix_pt(matrix, (p,q)) - for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) + bbox = get_bound( apply_matrix_pt(matrix, (p,q)) + for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) ) LTAnalyzer.__init__(self, bbox=bbox) return def __repr__(self): - return ('
' % - (self.name, bbox2str(self.bbox), matrix2str(self.matrix))) + return ('<%s(%s) %s matrix=%s>' % + (self.__class__.__name__, self.name, + bbox2str(self.bbox), matrix2str(self.matrix))) def analyze(self, laparams=None): if laparams is not None and laparams.all_texts: @@ -660,4 +659,6 @@ class LTPage(LTAnalyzer): return def __repr__(self): - return ('' % (self.pageid, bbox2str(self.bbox), self.rotate)) + return ('<%s(%r) %s rotate=%r>' % + (self.__class__.__name__, self.pageid, + bbox2str(self.bbox), self.rotate)) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 9ff18ab..88893c9 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from sys import maxint as INF from struct import pack, unpack @@ -28,6 +29,17 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)): ## Utility functions ## +# get_bound +def get_bound(pts): + '''Compute a minimal rectangle that covers all the points.''' + (x0, y0, x1, y1) = (INF, INF, -INF, -INF) + for (x,y) in pts: + x0 = min(x0, x) + y0 = min(y0, y) + x1 = max(x1, x) + y1 = max(y1, y) + return (x0,y0,x1,y1) + # pick def pick(seq, func, maxobj=None): '''Picks the object obj where func(obj) has the highest value.''' diff --git a/samples/Makefile b/samples/Makefile index e2c98fa..d652ed5 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -48,6 +48,9 @@ XMLS_NONFREE= \ nonfree/naacl06-shinyama.xml \ nonfree/nlp2004slides.xml +all: + $(MAKE) test CMP=cmp + test: htmls texts xmls clean: