Revert changes unrelated to this branch.
parent
b0b8818a41
commit
2049462f6f
|
@ -1,111 +0,0 @@
|
||||||
# Byte-compiled / optimized / DLL files
|
|
||||||
__pycache__/
|
|
||||||
*.py[cod]
|
|
||||||
|
|
||||||
# C extensions
|
|
||||||
*.so
|
|
||||||
|
|
||||||
# Distribution / packaging
|
|
||||||
.Python
|
|
||||||
env/
|
|
||||||
build/
|
|
||||||
develop-eggs/
|
|
||||||
dist/
|
|
||||||
downloads/
|
|
||||||
eggs/
|
|
||||||
lib/
|
|
||||||
lib64/
|
|
||||||
parts/
|
|
||||||
sdist/
|
|
||||||
var/
|
|
||||||
*.egg-info/
|
|
||||||
.installed.cfg
|
|
||||||
*.egg
|
|
||||||
|
|
||||||
# PyInstaller
|
|
||||||
# Usually these files are written by a python script from a template
|
|
||||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
||||||
*.manifest
|
|
||||||
*.spec
|
|
||||||
|
|
||||||
# Installer logs
|
|
||||||
pip-log.txt
|
|
||||||
pip-delete-this-directory.txt
|
|
||||||
|
|
||||||
# Unit test / coverage reports
|
|
||||||
htmlcov/
|
|
||||||
.tox/
|
|
||||||
.coverage
|
|
||||||
.cache
|
|
||||||
nosetests.xml
|
|
||||||
coverage.xml
|
|
||||||
|
|
||||||
# Translations
|
|
||||||
*.mo
|
|
||||||
*.pot
|
|
||||||
|
|
||||||
# Django stuff:
|
|
||||||
*.log
|
|
||||||
|
|
||||||
# Sphinx documentation
|
|
||||||
docs/_build/
|
|
||||||
|
|
||||||
# PyBuilder
|
|
||||||
target/
|
|
||||||
=======
|
|
||||||
# Intermediate documents
|
|
||||||
*.xps
|
|
||||||
|
|
||||||
# Password and Key Files
|
|
||||||
*.pem
|
|
||||||
*.p12
|
|
||||||
|
|
||||||
# Compiled source #
|
|
||||||
###################
|
|
||||||
*.pyc
|
|
||||||
*.com
|
|
||||||
*.class
|
|
||||||
*.dll
|
|
||||||
*.exe
|
|
||||||
*.o
|
|
||||||
*.so
|
|
||||||
|
|
||||||
# Mercurial Files?
|
|
||||||
*.i
|
|
||||||
*.d
|
|
||||||
*.mo
|
|
||||||
*.hg/
|
|
||||||
|
|
||||||
# Python data files #
|
|
||||||
*.shelf
|
|
||||||
*.shelve
|
|
||||||
|
|
||||||
# Don't track these files, they are output from scripts
|
|
||||||
|
|
||||||
# Subversion files
|
|
||||||
*.svn-base
|
|
||||||
all-wcprops
|
|
||||||
entries
|
|
||||||
|
|
||||||
# Logs and databases #
|
|
||||||
######################
|
|
||||||
bulkloader-log-*
|
|
||||||
*.log
|
|
||||||
*.sql
|
|
||||||
*.sql3
|
|
||||||
*.sql3-journal
|
|
||||||
*.sqlite
|
|
||||||
|
|
||||||
# OS generated files #
|
|
||||||
######################
|
|
||||||
.DS_Store?
|
|
||||||
ehthumbs.db
|
|
||||||
Icon?
|
|
||||||
Thumbs.db
|
|
||||||
|
|
||||||
# tmp files #
|
|
||||||
#############
|
|
||||||
~$*.doc
|
|
||||||
~WRL*.tmp
|
|
||||||
|
|
||||||
*.gz
|
|
|
@ -1,5 +1,4 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import logging
|
|
||||||
from .utils import INF
|
from .utils import INF
|
||||||
from .utils import Plane
|
from .utils import Plane
|
||||||
from .utils import get_bound
|
from .utils import get_bound
|
||||||
|
@ -356,9 +355,6 @@ class LTTextLine(LTTextContainer):
|
||||||
LTContainer.add(self, LTAnno('\n'))
|
LTContainer.add(self, LTAnno('\n'))
|
||||||
return
|
return
|
||||||
|
|
||||||
def group_line_neighbors(self, objs, ratio):
|
|
||||||
return objs
|
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@ -372,67 +368,21 @@ class LTTextLineHorizontal(LTTextLine):
|
||||||
|
|
||||||
def add(self, obj):
|
def add(self, obj):
|
||||||
if isinstance(obj, LTChar) and self.word_margin:
|
if isinstance(obj, LTChar) and self.word_margin:
|
||||||
# Add a space between words if separated by more than word_margin
|
|
||||||
# apart. Use the max of obj width and height so narrow letters
|
|
||||||
# (i, l) are treated like wider letters, reducing extra spaces
|
|
||||||
margin = self.word_margin * max(obj.width, obj.height)
|
margin = self.word_margin * max(obj.width, obj.height)
|
||||||
if self._x1 < obj.x0-margin:
|
if self._x1 < obj.x0-margin:
|
||||||
# But only do it if there is not already a space.
|
LTContainer.add(self, LTAnno(' '))
|
||||||
last_was_alpha = self._objs and \
|
|
||||||
isinstance(self._objs[-1], LTChar) and \
|
|
||||||
self._objs[-1]._text == ' '
|
|
||||||
if not last_was_alpha:
|
|
||||||
LTContainer.add(self, LTAnno(' '))
|
|
||||||
self._x1 = obj.x1
|
self._x1 = obj.x1
|
||||||
LTTextLine.add(self, obj)
|
LTTextLine.add(self, obj)
|
||||||
return
|
return
|
||||||
|
|
||||||
def is_neighbor(self, obj, d, same_line = False):
|
|
||||||
# Horizontal lines can only connect with horizontal lines
|
|
||||||
if not isinstance(obj, LTTextLineHorizontal):
|
|
||||||
return False
|
|
||||||
# Ensure they have similar heights
|
|
||||||
if abs(obj.height-self.height) >= d:
|
|
||||||
return False
|
|
||||||
# Ensure that they have similar start or stop x positions
|
|
||||||
if not (abs(obj.x0-self.x0) < d or
|
|
||||||
abs(obj.x1-self.x1) < d or
|
|
||||||
# Or that they intersect each other horizontally.
|
|
||||||
(obj.x0 < self.x0 and obj.x1 > self.x0) or
|
|
||||||
(obj.x0 > self.x0 and obj.x0 < self.x1)):
|
|
||||||
return False
|
|
||||||
if same_line and not (
|
|
||||||
# Ensure they have similar vertical positions (equal y0 or y1, or overlapping)
|
|
||||||
(obj.y0 == self.y0) or (obj.y1 == self.y1) or
|
|
||||||
(obj.y0 < self.y0 and obj.y0 > self.y1) or
|
|
||||||
(obj.y0 > self.y0 and obj.y1 < self.y0)):
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def group_line_neighbors(self, objs, ratio):
|
|
||||||
'''
|
|
||||||
Given a set of objects that may or may not be on the same line as this,
|
|
||||||
add the objects that are on the same line.
|
|
||||||
|
|
||||||
Return the objects that are not on the same line.
|
|
||||||
'''
|
|
||||||
d = ratio*self.height
|
|
||||||
other_lines = []
|
|
||||||
for o in objs:
|
|
||||||
if o == self:
|
|
||||||
other_lines.append(o)
|
|
||||||
elif self.is_neighbor(o, d, same_line=True):
|
|
||||||
[self.add(oc) for oc in o]
|
|
||||||
# Clear out the old line
|
|
||||||
o._objs = []
|
|
||||||
else:
|
|
||||||
other_lines.append(o)
|
|
||||||
return other_lines
|
|
||||||
|
|
||||||
def find_neighbors(self, plane, ratio):
|
def find_neighbors(self, plane, ratio):
|
||||||
d = ratio*self.height
|
d = ratio*self.height
|
||||||
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
|
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
|
||||||
return [o for o in objs if self.is_neighbor(o, d)]
|
return [obj for obj in objs
|
||||||
|
if (isinstance(obj, LTTextLineHorizontal) and
|
||||||
|
abs(obj.height-self.height) < d and
|
||||||
|
(abs(obj.x0-self.x0) < d or
|
||||||
|
abs(obj.x1-self.x1) < d))]
|
||||||
|
|
||||||
|
|
||||||
class LTTextLineVertical(LTTextLine):
|
class LTTextLineVertical(LTTextLine):
|
||||||
|
@ -563,9 +513,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
(min(obj0.height, obj1.height) * laparams.line_overlap <
|
(min(obj0.height, obj1.height) * laparams.line_overlap <
|
||||||
obj0.voverlap(obj1)) and
|
obj0.voverlap(obj1)) and
|
||||||
(obj0.hdistance(obj1) <
|
(obj0.hdistance(obj1) <
|
||||||
max(obj0.width, obj1.width) * laparams.char_margin) or
|
max(obj0.width, obj1.width) * laparams.char_margin))
|
||||||
# If the line is zero width, default to horizontal
|
|
||||||
(max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0))
|
|
||||||
|
|
||||||
# valign: obj0 and obj1 is vertically aligned.
|
# valign: obj0 and obj1 is vertically aligned.
|
||||||
#
|
#
|
||||||
|
@ -587,10 +535,7 @@ class LTLayoutContainer(LTContainer):
|
||||||
(min(obj0.width, obj1.width) * laparams.line_overlap <
|
(min(obj0.width, obj1.width) * laparams.line_overlap <
|
||||||
obj0.hoverlap(obj1)) and
|
obj0.hoverlap(obj1)) and
|
||||||
(obj0.vdistance(obj1) <
|
(obj0.vdistance(obj1) <
|
||||||
max(obj0.height, obj1.height) * laparams.char_margin) and
|
max(obj0.height, obj1.height) * laparams.char_margin))
|
||||||
# Don't start a vertical line if the previous letter is
|
|
||||||
# whitespace. Prevents double spaces being caught as vertical lines.
|
|
||||||
(line or obj0._text.strip()))
|
|
||||||
|
|
||||||
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
if ((halign and isinstance(line, LTTextLineHorizontal)) or
|
||||||
(valign and isinstance(line, LTTextLineVertical))):
|
(valign and isinstance(line, LTTextLineVertical))):
|
||||||
|
@ -619,25 +564,19 @@ class LTLayoutContainer(LTContainer):
|
||||||
yield line
|
yield line
|
||||||
return
|
return
|
||||||
|
|
||||||
# group_textlines: group neighbouring lines to textboxes.
|
# group_textlines: group neighboring lines to textboxes.
|
||||||
def group_textlines(self, laparams, lines):
|
def group_textlines(self, laparams, lines):
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(lines)
|
plane.extend(lines)
|
||||||
boxes = {}
|
boxes = {}
|
||||||
# for line in plane:
|
|
||||||
# print "line", ("".join([s._text for s in line])).encode('ascii', 'ignore')
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
neighbors = line.find_neighbors(plane, laparams.line_margin)
|
||||||
if line not in neighbors:
|
if line not in neighbors: continue
|
||||||
logging.error("Line cannot find itself: %s"%line)
|
|
||||||
continue
|
|
||||||
neighbors = line.group_line_neighbors(neighbors, laparams.line_margin)
|
|
||||||
members = []
|
members = []
|
||||||
for obj1 in neighbors:
|
for obj1 in neighbors:
|
||||||
members.append(obj1)
|
members.append(obj1)
|
||||||
if obj1 in boxes:
|
if obj1 in boxes:
|
||||||
members.extend(boxes.pop(obj1))
|
members.extend(boxes.pop(obj1))
|
||||||
# print "members: ", ["".join([o._text for o in line]) for line in members]
|
|
||||||
if isinstance(line, LTTextLineHorizontal):
|
if isinstance(line, LTTextLineHorizontal):
|
||||||
box = LTTextBoxHorizontal()
|
box = LTTextBoxHorizontal()
|
||||||
else:
|
else:
|
||||||
|
@ -692,19 +631,8 @@ class LTLayoutContainer(LTContainer):
|
||||||
(c,d,_,_) = t
|
(c,d,_,_) = t
|
||||||
return (c,d)
|
return (c,d)
|
||||||
|
|
||||||
# The algorithm below still takes O(n^2) :(
|
# XXX this still takes O(n^2) :(
|
||||||
# For now, if we have many boxes, split them into two and perform them
|
|
||||||
# separately. This will cause bugs, but will prevent hanging.
|
|
||||||
if len(boxes) > 100:
|
|
||||||
boxes = sorted(boxes, key=lambda obj: obj.y0)
|
|
||||||
# Divide in two, then perform grouping
|
|
||||||
# print "Making Recursive Call %d"%len(boxes)
|
|
||||||
bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2])
|
|
||||||
top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:])
|
|
||||||
boxes = bot_boxes + top_boxes
|
|
||||||
|
|
||||||
dists = []
|
dists = []
|
||||||
# Calculate the distance between each box
|
|
||||||
for i in xrange(len(boxes)):
|
for i in xrange(len(boxes)):
|
||||||
obj1 = boxes[i]
|
obj1 = boxes[i]
|
||||||
for j in xrange(i+1, len(boxes)):
|
for j in xrange(i+1, len(boxes)):
|
||||||
|
@ -714,20 +642,16 @@ class LTLayoutContainer(LTContainer):
|
||||||
dists = csort(dists, key=key_obj)
|
dists = csort(dists, key=key_obj)
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
# Start with the two closest objects
|
|
||||||
while dists:
|
while dists:
|
||||||
(c, d, obj1, obj2) = dists.pop(0)
|
(c, d, obj1, obj2) = dists.pop(0)
|
||||||
# If there are any objects in between, then skip these two
|
|
||||||
if c == 0 and isany(obj1, obj2):
|
if c == 0 and isany(obj1, obj2):
|
||||||
dists.append((1, d, obj1, obj2))
|
dists.append((1, d, obj1, obj2))
|
||||||
continue
|
continue
|
||||||
# Group these two closest objects
|
|
||||||
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
||||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
||||||
group = LTTextGroupTBRL([obj1, obj2])
|
group = LTTextGroupTBRL([obj1, obj2])
|
||||||
else:
|
else:
|
||||||
group = LTTextGroupLRTB([obj1, obj2])
|
group = LTTextGroupLRTB([obj1, obj2])
|
||||||
# Remove the two individual objects
|
|
||||||
plane.remove(obj1)
|
plane.remove(obj1)
|
||||||
plane.remove(obj2)
|
plane.remove(obj2)
|
||||||
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
||||||
|
@ -801,4 +725,4 @@ class LTPage(LTLayoutContainer):
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<%s(%r) %s rotate=%r>' %
|
return ('<%s(%r) %s rotate=%r>' %
|
||||||
(self.__class__.__name__, self.pageid,
|
(self.__class__.__name__, self.pageid,
|
||||||
bbox2str(self.bbox), self.rotate))
|
bbox2str(self.bbox), self.rotate))
|
Loading…
Reference in New Issue