Revert changes unrelated to this branch.

pull/55/head
speedplane 2016-06-13 23:42:21 -04:00
parent b0b8818a41
commit 2049462f6f
2 changed files with 12 additions and 199 deletions

111
.gitignore vendored
View File

@ -1,111 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
=======
# Intermediate documents
*.xps
# Password and Key Files
*.pem
*.p12
# Compiled source #
###################
*.pyc
*.com
*.class
*.dll
*.exe
*.o
*.so
# Mecurial Files?
*.i
*.d
*.mo
*.hg/
# Python data files #
*.shelf
*.shelve
# Don't track these files, they are output from scripts
# Subversion files
*.svn-base
all-wcprops
entries
# Logs and databases #
######################
bulkloader-log-*
*.log
*.sql
*.sql3
*.sql3-journal
*.sqlite
# OS generated files #
######################
.DS_Store?
ehthumbs.db
Icon?
Thumbs.db
# tmp files #
#############
~$*.doc
~WRL*.tmp
*.gz

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python
import logging
from .utils import INF from .utils import INF
from .utils import Plane from .utils import Plane
from .utils import get_bound from .utils import get_bound
@ -356,9 +355,6 @@ class LTTextLine(LTTextContainer):
LTContainer.add(self, LTAnno('\n')) LTContainer.add(self, LTAnno('\n'))
return return
def group_line_neighbors(self, objs, ratio):
return objs
def find_neighbors(self, plane, ratio): def find_neighbors(self, plane, ratio):
raise NotImplementedError raise NotImplementedError
@ -372,67 +368,21 @@ class LTTextLineHorizontal(LTTextLine):
def add(self, obj): def add(self, obj):
if isinstance(obj, LTChar) and self.word_margin: if isinstance(obj, LTChar) and self.word_margin:
# Add a space between words if separated by more than word_margin
# apart. Use the max of obj width and height so narrow letters
# (i, l) are treated like wider letters, reducing extra spaces
margin = self.word_margin * max(obj.width, obj.height) margin = self.word_margin * max(obj.width, obj.height)
if self._x1 < obj.x0-margin: if self._x1 < obj.x0-margin:
# But only do it if there is not already a space.
last_was_alpha = self._objs and \
isinstance(self._objs[-1], LTChar) and \
self._objs[-1]._text == ' '
if not last_was_alpha:
LTContainer.add(self, LTAnno(' ')) LTContainer.add(self, LTAnno(' '))
self._x1 = obj.x1 self._x1 = obj.x1
LTTextLine.add(self, obj) LTTextLine.add(self, obj)
return return
def is_neighbor(self, obj, d, same_line = False):
# Horizontal lines can only connect with horizontal lines
if not isinstance(obj, LTTextLineHorizontal):
return False
# Ensure they are vertically close
if abs(obj.height-self.height) >= d:
return False
# Ensure that they have similar start or stop x positions
if not (abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d or
# Or that they intersect eachother horizontally.
(obj.x0 < self.x0 and obj.x1 > self.x0) or
(obj.x0 > self.x0 and obj.x0 < self.x1)):
return False
if same_line and not (
# Ensure they have similar
(obj.y0 == self.y0) or (obj.y1 == self.y1) or
(obj.y0 < self.y0 and obj.y0 > self.y1) or
(obj.y0 > self.y0 and obj.y1 < self.y0)):
return False
return True
def group_line_neighbors(self, objs, ratio):
'''
Given a set of objects that may or may not be on the same line as this,
add the objects that are on the same line.
Return the objects that are not on the same line.
'''
d = ratio*self.height
other_lines = []
for o in objs:
if o == self:
other_lines.append(o)
elif self.is_neighbor(o, d, same_line=True):
[self.add(oc) for oc in o]
# Clear out the old line
o._objs = []
else:
other_lines.append(o)
return other_lines
def find_neighbors(self, plane, ratio): def find_neighbors(self, plane, ratio):
d = ratio*self.height d = ratio*self.height
objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d)) objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
return [o for o in objs if self.is_neighbor(o, d)] return [obj for obj in objs
if (isinstance(obj, LTTextLineHorizontal) and
abs(obj.height-self.height) < d and
(abs(obj.x0-self.x0) < d or
abs(obj.x1-self.x1) < d))]
class LTTextLineVertical(LTTextLine): class LTTextLineVertical(LTTextLine):
@ -563,9 +513,7 @@ class LTLayoutContainer(LTContainer):
(min(obj0.height, obj1.height) * laparams.line_overlap < (min(obj0.height, obj1.height) * laparams.line_overlap <
obj0.voverlap(obj1)) and obj0.voverlap(obj1)) and
(obj0.hdistance(obj1) < (obj0.hdistance(obj1) <
max(obj0.width, obj1.width) * laparams.char_margin) or max(obj0.width, obj1.width) * laparams.char_margin))
# If the line is zero width, default to horizontal
(max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0))
# valign: obj0 and obj1 is vertically aligned. # valign: obj0 and obj1 is vertically aligned.
# #
@ -587,10 +535,7 @@ class LTLayoutContainer(LTContainer):
(min(obj0.width, obj1.width) * laparams.line_overlap < (min(obj0.width, obj1.width) * laparams.line_overlap <
obj0.hoverlap(obj1)) and obj0.hoverlap(obj1)) and
(obj0.vdistance(obj1) < (obj0.vdistance(obj1) <
max(obj0.height, obj1.height) * laparams.char_margin) and max(obj0.height, obj1.height) * laparams.char_margin))
# Don't start a vertical line if the previous letter is
# whitspace. Prevents double spaces being caught as vert lines.
(line or obj0._text.strip()))
if ((halign and isinstance(line, LTTextLineHorizontal)) or if ((halign and isinstance(line, LTTextLineHorizontal)) or
(valign and isinstance(line, LTTextLineVertical))): (valign and isinstance(line, LTTextLineVertical))):
@ -619,25 +564,19 @@ class LTLayoutContainer(LTContainer):
yield line yield line
return return
# group_textlines: group neighbouring lines to textboxes. # group_textlines: group neighboring lines to textboxes.
def group_textlines(self, laparams, lines): def group_textlines(self, laparams, lines):
plane = Plane(self.bbox) plane = Plane(self.bbox)
plane.extend(lines) plane.extend(lines)
boxes = {} boxes = {}
# for line in plane:
# print "line", ("".join([s._text for s in line])).encode('ascii', 'ignore')
for line in lines: for line in lines:
neighbors = line.find_neighbors(plane, laparams.line_margin) neighbors = line.find_neighbors(plane, laparams.line_margin)
if line not in neighbors: if line not in neighbors: continue
logging.error("Line cannot find itself: %s"%line)
continue
neighbors = line.group_line_neighbors(neighbors, laparams.line_margin)
members = [] members = []
for obj1 in neighbors: for obj1 in neighbors:
members.append(obj1) members.append(obj1)
if obj1 in boxes: if obj1 in boxes:
members.extend(boxes.pop(obj1)) members.extend(boxes.pop(obj1))
# print "members: ", ["".join([o._text for o in line]) for line in members]
if isinstance(line, LTTextLineHorizontal): if isinstance(line, LTTextLineHorizontal):
box = LTTextBoxHorizontal() box = LTTextBoxHorizontal()
else: else:
@ -692,19 +631,8 @@ class LTLayoutContainer(LTContainer):
(c,d,_,_) = t (c,d,_,_) = t
return (c,d) return (c,d)
# The algorithm below still takes O(n^2) :( # XXX this still takes O(n^2) :(
# For now, if we have many boxes, split them into two and perform them
# separately. This will cause bugs, but will prevent hanging.
if len(boxes) > 100:
boxes = sorted(boxes, key=lambda obj: obj.y0)
# Divide in two, then perform grouping
# print "Making Recursive Call %d"%len(boxes)
bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2])
top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:])
boxes = bot_boxes + top_boxes
dists = [] dists = []
# Calculate the distance between each box
for i in xrange(len(boxes)): for i in xrange(len(boxes)):
obj1 = boxes[i] obj1 = boxes[i]
for j in xrange(i+1, len(boxes)): for j in xrange(i+1, len(boxes)):
@ -714,20 +642,16 @@ class LTLayoutContainer(LTContainer):
dists = csort(dists, key=key_obj) dists = csort(dists, key=key_obj)
plane = Plane(self.bbox) plane = Plane(self.bbox)
plane.extend(boxes) plane.extend(boxes)
# Start with the two closest objects
while dists: while dists:
(c, d, obj1, obj2) = dists.pop(0) (c, d, obj1, obj2) = dists.pop(0)
# If there are any objects in between, then skip these two
if c == 0 and isany(obj1, obj2): if c == 0 and isany(obj1, obj2):
dists.append((1, d, obj1, obj2)) dists.append((1, d, obj1, obj2))
continue continue
# Group these two closest objects
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
group = LTTextGroupTBRL([obj1, obj2]) group = LTTextGroupTBRL([obj1, obj2])
else: else:
group = LTTextGroupLRTB([obj1, obj2]) group = LTTextGroupLRTB([obj1, obj2])
# Remove the two individual objects
plane.remove(obj1) plane.remove(obj1)
plane.remove(obj2) plane.remove(obj2)
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists