layout analysis improved
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@245 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
bc1303e901
commit
3305c07ba2
|
@ -121,7 +121,7 @@ class FileCMap(CMap):
|
||||||
return '<CMap: %s>' % self.attrs.get('CMapName')
|
return '<CMap: %s>' % self.attrs.get('CMapName')
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_vertical(self):
|
||||||
return self.attrs.get('WMode', 0)
|
return self.attrs.get('WMode', 0) != 0
|
||||||
|
|
||||||
def set_attr(self, k, v):
|
def set_attr(self, k, v):
|
||||||
self.attrs[k] = v
|
self.attrs[k] = v
|
||||||
|
|
|
@ -170,11 +170,11 @@ class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def receive_layout(self, ltpage):
|
def receive_layout(self, ltpage):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTText):
|
if isinstance(item, LTContainer):
|
||||||
self.write(item.text)
|
|
||||||
elif isinstance(item, LTContainer):
|
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
|
elif isinstance(item, LTText):
|
||||||
|
self.write(item.get_text())
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write('\n')
|
self.write('\n')
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
|
@ -231,20 +231,21 @@ class HTMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.write_text(item.text, item.x0, item.y1, item.get_size())
|
self.write_text(item.text, item.x0, item.y1, item.get_size())
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_rect('red', 1, item.x0, item.y1, item.width, item.height)
|
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTPolygon):
|
elif isinstance(item, LTPolygon):
|
||||||
self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
|
self.write_rect('black', 1, item.x0, item.y1, item.width, item.height)
|
||||||
elif isinstance(item, LTTextLine):
|
elif isinstance(item, LTTextLine):
|
||||||
|
self.write_rect('magenta', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTTextBox):
|
elif isinstance(item, LTTextBox):
|
||||||
self.write_rect('blue', 1, item.x0, item.y1, item.width, item.height)
|
self.write_rect('cyan', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
if self.debug:
|
if self.debug:
|
||||||
self.write_text(str(item.index+1), item.x0, item.y1, 20)
|
self.write_text(str(item.index+1), item.x0, item.y1, 20)
|
||||||
elif isinstance(item, LTFigure):
|
elif isinstance(item, LTFigure):
|
||||||
self.write_rect('green', 1, item.x0, item.y1, item.width, item.height)
|
self.write_rect('yellow', 1, item.x0, item.y1, item.width, item.height)
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
|
|
|
@ -1,24 +1,10 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import sys
|
import sys
|
||||||
from sys import maxint as INF
|
from utils import apply_matrix_pt, get_bound, INF
|
||||||
from utils import apply_matrix_pt
|
|
||||||
from utils import bsearch, bbox2str, matrix2str
|
from utils import bsearch, bbox2str, matrix2str
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## get_bounds
|
|
||||||
##
|
|
||||||
def get_bounds(pts):
|
|
||||||
"""Compute a minimal rectangle that covers all the points."""
|
|
||||||
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
|
||||||
for (x,y) in pts:
|
|
||||||
x0 = min(x0, x)
|
|
||||||
y0 = min(y0, y)
|
|
||||||
x1 = max(x1, x)
|
|
||||||
y1 = max(y1, y)
|
|
||||||
return (x0,y0,x1,y1)
|
|
||||||
|
|
||||||
def uniq(objs):
|
def uniq(objs):
|
||||||
done = set()
|
done = set()
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
|
@ -39,7 +25,7 @@ class LAParams(object):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
writing_mode='lr-tb',
|
writing_mode='lr-tb',
|
||||||
line_overlap=0.5,
|
line_overlap=0.5,
|
||||||
char_margin=3.0,
|
char_margin=2.0,
|
||||||
line_margin=0.5,
|
line_margin=0.5,
|
||||||
word_margin=0.1,
|
word_margin=0.1,
|
||||||
all_texts=False):
|
all_texts=False):
|
||||||
|
@ -52,8 +38,8 @@ class LAParams(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<LAParams: writing_mode=%r, char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
|
return ('<LAParams: char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
|
||||||
(self.writing_mode, self.char_margin, self.line_margin, self.word_margin, self.all_texts))
|
(self.char_margin, self.line_margin, self.word_margin, self.all_texts))
|
||||||
|
|
||||||
|
|
||||||
## LTItem
|
## LTItem
|
||||||
|
@ -65,7 +51,8 @@ class LTItem(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<item bbox=%s>' % bbox2str(self.bbox))
|
return ('<%s %s>' %
|
||||||
|
(self.__class__.__name__, bbox2str(self.bbox)))
|
||||||
|
|
||||||
def set_bbox(self, (x0,y0,x1,y1)):
|
def set_bbox(self, (x0,y0,x1,y1)):
|
||||||
if x1 < x0: (x0,x1) = (x1,x0)
|
if x1 < x0: (x0,x1) = (x1,x0)
|
||||||
|
@ -123,7 +110,7 @@ class LTPolygon(LTItem):
|
||||||
def __init__(self, linewidth, pts):
|
def __init__(self, linewidth, pts):
|
||||||
self.pts = pts
|
self.pts = pts
|
||||||
self.linewidth = linewidth
|
self.linewidth = linewidth
|
||||||
LTItem.__init__(self, get_bounds(pts))
|
LTItem.__init__(self, get_bound(pts))
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_pts(self):
|
def get_pts(self):
|
||||||
|
@ -167,7 +154,9 @@ class LTImage(LTItem):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
(w,h) = self.srcsize
|
(w,h) = self.srcsize
|
||||||
return '<image %s %dx%d>' % (self.name, w, h)
|
return ('<%s(%s) %s %dx%d>' %
|
||||||
|
(self.__class__.__name__, self.name,
|
||||||
|
bbox2str(self.bbox), w, h))
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
## LTText
|
||||||
|
@ -179,10 +168,11 @@ class LTText(object):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<text %r>' % self.text
|
return ('<%s %r>' %
|
||||||
|
(self.__class__.__name__, self.get_text()))
|
||||||
|
|
||||||
def is_upright(self):
|
def get_text(self):
|
||||||
return True
|
return self.text
|
||||||
|
|
||||||
|
|
||||||
## LTAnon
|
## LTAnon
|
||||||
|
@ -239,20 +229,18 @@ class LTChar(LTItem, LTText):
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
return ('<char matrix=%s font=%r fontsize=%.1f bbox=%s adv=%s text=%r>' %
|
return ('<%s %s matrix=%s font=%r fontsize=%.1f adv=%s text=%r>' %
|
||||||
(matrix2str(self.matrix), self.font, self.fontsize,
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
bbox2str(self.bbox), self.adv, self.text))
|
matrix2str(self.matrix), self.font, self.fontsize,
|
||||||
|
self.adv, self.get_text()))
|
||||||
else:
|
else:
|
||||||
return '<char %r>' % self.text
|
return '<char %r>' % self.text
|
||||||
|
|
||||||
def get_size(self):
|
def get_size(self):
|
||||||
return max(self.width, self.height)
|
return max(self.width, self.height)
|
||||||
|
|
||||||
def is_vertical(self):
|
def is_compatible(self, obj):
|
||||||
return self.font.is_vertical()
|
return True
|
||||||
|
|
||||||
def is_upright(self):
|
|
||||||
return self.upright
|
|
||||||
|
|
||||||
|
|
||||||
## LTContainer
|
## LTContainer
|
||||||
|
@ -267,9 +255,6 @@ class LTContainer(LTItem):
|
||||||
self._objs = []
|
self._objs = []
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<container %s>' % bbox2str(self.bbox))
|
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
return iter(self.get_objs())
|
return iter(self.get_objs())
|
||||||
|
|
||||||
|
@ -302,15 +287,17 @@ class LTContainer(LTItem):
|
||||||
|
|
||||||
## LTTextLine
|
## LTTextLine
|
||||||
##
|
##
|
||||||
class LTTextLine(LTContainer):
|
class LTTextLine(LTContainer, LTText):
|
||||||
|
|
||||||
def __init__(self, word_margin=0):
|
def __init__(self, laparams=None):
|
||||||
self.word_margin = word_margin
|
self.laparams = laparams
|
||||||
LTContainer.__init__(self)
|
LTContainer.__init__(self)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textline %s>' % bbox2str(self.bbox))
|
return ('<%s %s %r>' %
|
||||||
|
(self.__class__.__name__, bbox2str(self.bbox),
|
||||||
|
self.get_text()))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) )
|
return ''.join( obj.text for obj in self.get_objs() if isinstance(obj, LTText) )
|
||||||
|
@ -320,14 +307,16 @@ class LTTextLine(LTContainer):
|
||||||
|
|
||||||
class LTTextLineHorizontal(LTTextLine):
|
class LTTextLineHorizontal(LTTextLine):
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<textline-h %s>' % bbox2str(self.bbox))
|
|
||||||
|
|
||||||
def get_objs(self):
|
def get_objs(self):
|
||||||
|
if self.laparams is None:
|
||||||
|
for obj in self._objs:
|
||||||
|
yield obj
|
||||||
|
return
|
||||||
|
word_margin = self.laparams.word_margin
|
||||||
x1 = INF
|
x1 = INF
|
||||||
for obj in csort(self._objs, key=lambda obj: obj.x0):
|
for obj in csort(self._objs, key=lambda obj: obj.x0):
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
if isinstance(obj, LTChar) and word_margin:
|
||||||
margin = self.word_margin * obj.width
|
margin = word_margin * obj.width
|
||||||
if x1 < obj.x0-margin:
|
if x1 < obj.x0-margin:
|
||||||
yield LTAnon(' ')
|
yield LTAnon(' ')
|
||||||
yield obj
|
yield obj
|
||||||
|
@ -342,14 +331,16 @@ class LTTextLineHorizontal(LTTextLine):
|
||||||
|
|
||||||
class LTTextLineVertical(LTTextLine):
|
class LTTextLineVertical(LTTextLine):
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return ('<textline-v %s>' % bbox2str(self.bbox))
|
|
||||||
|
|
||||||
def get_objs(self):
|
def get_objs(self):
|
||||||
|
if self.laparams is None:
|
||||||
|
for obj in self._objs:
|
||||||
|
yield obj
|
||||||
|
return
|
||||||
|
word_margin = self.laparams.word_margin
|
||||||
y0 = -INF
|
y0 = -INF
|
||||||
for obj in csort(self._objs, key=lambda obj: -obj.y1):
|
for obj in csort(self._objs, key=lambda obj: -obj.y1):
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
if isinstance(obj, LTChar) and word_margin:
|
||||||
margin = self.word_margin * obj.height
|
margin = word_margin * obj.height
|
||||||
if obj.y1+margin < y0:
|
if obj.y1+margin < y0:
|
||||||
yield LTAnon(' ')
|
yield LTAnon(' ')
|
||||||
yield obj
|
yield obj
|
||||||
|
@ -376,7 +367,9 @@ class LTTextBox(LTContainer):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<textbox(%s) %s %r...>' % (self.index, bbox2str(self.bbox), self.get_text()[:20]))
|
return ('<%s(%s) %s %r...>' %
|
||||||
|
(self.__class__.__name__, self.index,
|
||||||
|
bbox2str(self.bbox), self.get_text()[:20]))
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) )
|
return ''.join( obj.get_text() for obj in self.get_objs() if isinstance(obj, LTTextLine) )
|
||||||
|
@ -489,7 +482,7 @@ class LTAnalyzer(LTContainer):
|
||||||
textobjs = []
|
textobjs = []
|
||||||
otherobjs = []
|
otherobjs = []
|
||||||
for obj in objs:
|
for obj in objs:
|
||||||
if isinstance(obj, LTText) and obj.is_upright():
|
if isinstance(obj, LTChar):
|
||||||
textobjs.append(obj)
|
textobjs.append(obj)
|
||||||
else:
|
else:
|
||||||
otherobjs.append(obj)
|
otherobjs.append(obj)
|
||||||
|
@ -499,11 +492,9 @@ class LTAnalyzer(LTContainer):
|
||||||
obj0 = None
|
obj0 = None
|
||||||
line = None
|
line = None
|
||||||
for obj1 in objs:
|
for obj1 in objs:
|
||||||
if obj0 is None:
|
if obj0 is not None:
|
||||||
obj0 = obj1
|
|
||||||
else:
|
|
||||||
k = 0
|
k = 0
|
||||||
if (obj0.is_voverlap(obj1) and
|
if (obj0.is_compatible(obj1) and obj0.is_voverlap(obj1) and
|
||||||
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
|
min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1) and
|
||||||
obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin):
|
obj0.hdistance(obj1) < min(obj0.width, obj1.width) * laparams.char_margin):
|
||||||
# obj0 and obj1 is horizontally aligned:
|
# obj0 and obj1 is horizontally aligned:
|
||||||
|
@ -517,7 +508,7 @@ class LTAnalyzer(LTContainer):
|
||||||
# |<--->|
|
# |<--->|
|
||||||
# (char_margin)
|
# (char_margin)
|
||||||
k |= 1
|
k |= 1
|
||||||
if (obj0.is_hoverlap(obj1) and
|
if (obj0.is_compatible(obj1) and obj0.is_hoverlap(obj1) and
|
||||||
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
|
min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1) and
|
||||||
obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin):
|
obj0.vdistance(obj1) < min(obj0.height, obj1.height) * laparams.char_margin):
|
||||||
# obj0 and obj1 is vertically aligned:
|
# obj0 and obj1 is vertically aligned:
|
||||||
|
@ -538,22 +529,29 @@ class LTAnalyzer(LTContainer):
|
||||||
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
|
if ( (k & 1 and isinstance(line, LTTextLineHorizontal)) or
|
||||||
(k & 2 and isinstance(line, LTTextLineVertical)) ):
|
(k & 2 and isinstance(line, LTTextLineVertical)) ):
|
||||||
line.add(obj1)
|
line.add(obj1)
|
||||||
elif line is None:
|
elif line is not None:
|
||||||
if k == 2:
|
|
||||||
line = LTTextLineVertical(laparams.word_margin)
|
|
||||||
else:
|
|
||||||
line = LTTextLineHorizontal(laparams.word_margin)
|
|
||||||
line.add(obj0)
|
|
||||||
line.add(obj1)
|
|
||||||
else:
|
|
||||||
line.fixate()
|
line.fixate()
|
||||||
yield line
|
yield line
|
||||||
line = None
|
line = None
|
||||||
obj0 = obj1
|
else:
|
||||||
|
if k == 2:
|
||||||
|
line = LTTextLineVertical(laparams)
|
||||||
|
line.add(obj0)
|
||||||
|
line.add(obj1)
|
||||||
|
elif k == 1:
|
||||||
|
line = LTTextLineHorizontal(laparams)
|
||||||
|
line.add(obj0)
|
||||||
|
line.add(obj1)
|
||||||
|
else:
|
||||||
|
line = LTTextLineHorizontal(laparams)
|
||||||
|
line.add(obj0)
|
||||||
|
line.fixate()
|
||||||
|
yield line
|
||||||
|
line = None
|
||||||
|
obj0 = obj1
|
||||||
if line is None:
|
if line is None:
|
||||||
line = LTTextLineHorizontal(laparams.word_margin)
|
line = LTTextLineHorizontal(laparams)
|
||||||
if obj0 is not None:
|
line.add(obj0)
|
||||||
line.add(obj0)
|
|
||||||
line.fixate()
|
line.fixate()
|
||||||
yield line
|
yield line
|
||||||
return
|
return
|
||||||
|
@ -633,14 +631,15 @@ class LTFigure(LTAnalyzer):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
(x,y,w,h) = bbox
|
(x,y,w,h) = bbox
|
||||||
bbox = get_bounds( apply_matrix_pt(matrix, (p,q))
|
bbox = get_bound( apply_matrix_pt(matrix, (p,q))
|
||||||
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
for (p,q) in ((x,y), (x+w,y), (x,y+h), (x+w,y+h)) )
|
||||||
LTAnalyzer.__init__(self, bbox=bbox)
|
LTAnalyzer.__init__(self, bbox=bbox)
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<figure %r bbox=%s matrix=%s>' %
|
return ('<%s(%s) %s matrix=%s>' %
|
||||||
(self.name, bbox2str(self.bbox), matrix2str(self.matrix)))
|
(self.__class__.__name__, self.name,
|
||||||
|
bbox2str(self.bbox), matrix2str(self.matrix)))
|
||||||
|
|
||||||
def analyze(self, laparams=None):
|
def analyze(self, laparams=None):
|
||||||
if laparams is not None and laparams.all_texts:
|
if laparams is not None and laparams.all_texts:
|
||||||
|
@ -660,4 +659,6 @@ class LTPage(LTAnalyzer):
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<page(%r) bbox=%s rotate=%r>' % (self.pageid, bbox2str(self.bbox), self.rotate))
|
return ('<%s(%r) %s rotate=%r>' %
|
||||||
|
(self.__class__.__name__, self.pageid,
|
||||||
|
bbox2str(self.bbox), self.rotate))
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
from sys import maxint as INF
|
||||||
from struct import pack, unpack
|
from struct import pack, unpack
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,6 +29,17 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
## Utility functions
|
## Utility functions
|
||||||
##
|
##
|
||||||
|
|
||||||
|
# get_bound
|
||||||
|
def get_bound(pts):
|
||||||
|
'''Compute a minimal rectangle that covers all the points.'''
|
||||||
|
(x0, y0, x1, y1) = (INF, INF, -INF, -INF)
|
||||||
|
for (x,y) in pts:
|
||||||
|
x0 = min(x0, x)
|
||||||
|
y0 = min(y0, y)
|
||||||
|
x1 = max(x1, x)
|
||||||
|
y1 = max(y1, y)
|
||||||
|
return (x0,y0,x1,y1)
|
||||||
|
|
||||||
# pick
|
# pick
|
||||||
def pick(seq, func, maxobj=None):
|
def pick(seq, func, maxobj=None):
|
||||||
'''Picks the object obj where func(obj) has the highest value.'''
|
'''Picks the object obj where func(obj) has the highest value.'''
|
||||||
|
|
|
@ -48,6 +48,9 @@ XMLS_NONFREE= \
|
||||||
nonfree/naacl06-shinyama.xml \
|
nonfree/naacl06-shinyama.xml \
|
||||||
nonfree/nlp2004slides.xml
|
nonfree/nlp2004slides.xml
|
||||||
|
|
||||||
|
all:
|
||||||
|
$(MAKE) test CMP=cmp
|
||||||
|
|
||||||
test: htmls texts xmls
|
test: htmls texts xmls
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
|
|
Loading…
Reference in New Issue