commit
e03ecab856
|
@ -8,5 +8,6 @@ install:
|
||||||
- pip install six
|
- pip install six
|
||||||
- pip install pycryptodome
|
- pip install pycryptodome
|
||||||
- pip install chardet
|
- pip install chardet
|
||||||
|
- pip install sortedcontainers
|
||||||
script:
|
script:
|
||||||
nosetests --nologcapture
|
nosetests --nologcapture
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
|
from sortedcontainers import SortedListWithKey
|
||||||
|
|
||||||
from .utils import INF
|
from .utils import INF
|
||||||
from .utils import Plane
|
from .utils import Plane
|
||||||
from .utils import get_bound
|
from .utils import get_bound
|
||||||
from .utils import uniq
|
from .utils import uniq
|
||||||
from .utils import csort
|
|
||||||
from .utils import fsplit
|
from .utils import fsplit
|
||||||
from .utils import bbox2str
|
from .utils import bbox2str
|
||||||
from .utils import matrix2str
|
from .utils import matrix2str
|
||||||
|
@ -441,7 +441,7 @@ class LTTextBoxHorizontal(LTTextBox):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
LTTextBox.analyze(self, laparams)
|
LTTextBox.analyze(self, laparams)
|
||||||
self._objs = csort(self._objs, key=lambda obj: -obj.y1)
|
self._objs.sort(key=lambda obj: -obj.y1)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_writing_mode(self):
|
def get_writing_mode(self):
|
||||||
|
@ -452,7 +452,7 @@ class LTTextBoxVertical(LTTextBox):
|
||||||
|
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
LTTextBox.analyze(self, laparams)
|
LTTextBox.analyze(self, laparams)
|
||||||
self._objs = csort(self._objs, key=lambda obj: -obj.x1)
|
self._objs.sort(key=lambda obj: -obj.x1)
|
||||||
return
|
return
|
||||||
|
|
||||||
def get_writing_mode(self):
|
def get_writing_mode(self):
|
||||||
|
@ -474,7 +474,7 @@ class LTTextGroupLRTB(LTTextGroup):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
LTTextGroup.analyze(self, laparams)
|
LTTextGroup.analyze(self, laparams)
|
||||||
# reorder the objects from top-left to bottom-right.
|
# reorder the objects from top-left to bottom-right.
|
||||||
self._objs = csort(self._objs, key=lambda obj:
|
self._objs.sort(key=lambda obj:
|
||||||
(1-laparams.boxes_flow)*(obj.x0) -
|
(1-laparams.boxes_flow)*(obj.x0) -
|
||||||
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
(1+laparams.boxes_flow)*(obj.y0+obj.y1))
|
||||||
return
|
return
|
||||||
|
@ -485,7 +485,7 @@ class LTTextGroupTBRL(LTTextGroup):
|
||||||
def analyze(self, laparams):
|
def analyze(self, laparams):
|
||||||
LTTextGroup.analyze(self, laparams)
|
LTTextGroup.analyze(self, laparams)
|
||||||
# reorder the objects from top-right to bottom-left.
|
# reorder the objects from top-right to bottom-left.
|
||||||
self._objs = csort(self._objs, key=lambda obj:
|
self._objs.sort(key=lambda obj:
|
||||||
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
-(1+laparams.boxes_flow)*(obj.x0+obj.x1)
|
||||||
- (1-laparams.boxes_flow)*(obj.y1))
|
- (1-laparams.boxes_flow)*(obj.y1))
|
||||||
return
|
return
|
||||||
|
@ -639,21 +639,18 @@ class LTLayoutContainer(LTContainer):
|
||||||
(c,d,_,_) = t
|
(c,d,_,_) = t
|
||||||
return (c,d)
|
return (c,d)
|
||||||
|
|
||||||
# XXX this still takes O(n^2) :(
|
dists = SortedListWithKey(key=key_obj)
|
||||||
dists = []
|
|
||||||
for i in range(len(boxes)):
|
for i in range(len(boxes)):
|
||||||
obj1 = boxes[i]
|
obj1 = boxes[i]
|
||||||
for j in range(i+1, len(boxes)):
|
for j in range(i+1, len(boxes)):
|
||||||
obj2 = boxes[j]
|
obj2 = boxes[j]
|
||||||
dists.append((0, dist(obj1, obj2), obj1, obj2))
|
dists.add((0, dist(obj1, obj2), obj1, obj2))
|
||||||
# We could use dists.sort(), but it would randomize the test result.
|
|
||||||
dists = csort(dists, key=key_obj)
|
|
||||||
plane = Plane(self.bbox)
|
plane = Plane(self.bbox)
|
||||||
plane.extend(boxes)
|
plane.extend(boxes)
|
||||||
while dists:
|
while dists:
|
||||||
(c, d, obj1, obj2) = dists.pop(0)
|
(c, d, obj1, obj2) = dists.pop(0)
|
||||||
if c == 0 and isany(obj1, obj2):
|
if c == 0 and isany(obj1, obj2):
|
||||||
dists.append((1, d, obj1, obj2))
|
dists.add((1, d, obj1, obj2))
|
||||||
continue
|
continue
|
||||||
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
|
||||||
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
|
||||||
|
@ -662,11 +659,13 @@ class LTLayoutContainer(LTContainer):
|
||||||
group = LTTextGroupLRTB([obj1, obj2])
|
group = LTTextGroupLRTB([obj1, obj2])
|
||||||
plane.remove(obj1)
|
plane.remove(obj1)
|
||||||
plane.remove(obj2)
|
plane.remove(obj2)
|
||||||
dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
removed = [obj1, obj2]
|
||||||
if (obj1 in plane and obj2 in plane) ]
|
to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists
|
||||||
|
if (obj1 in removed or obj2 in removed) ]
|
||||||
|
for r in to_remove:
|
||||||
|
dists.remove(r)
|
||||||
for other in plane:
|
for other in plane:
|
||||||
dists.append((0, dist(group, other), group, other))
|
dists.add((0, dist(group, other), group, other))
|
||||||
dists = csort(dists, key=key_obj)
|
|
||||||
plane.add(group)
|
plane.add(group)
|
||||||
assert len(plane) == 1, str(len(plane))
|
assert len(plane) == 1, str(len(plane))
|
||||||
return list(plane)
|
return list(plane)
|
||||||
|
|
|
@ -145,13 +145,6 @@ def uniq(objs):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# csort
|
|
||||||
def csort(objs, key):
|
|
||||||
"""Order-preserving sorting function."""
|
|
||||||
idxs = dict((obj, i) for (i, obj) in enumerate(objs))
|
|
||||||
return sorted(objs, key=lambda obj: (key(obj), idxs[obj]))
|
|
||||||
|
|
||||||
|
|
||||||
# fsplit
|
# fsplit
|
||||||
def fsplit(pred, objs):
|
def fsplit(pred, objs):
|
||||||
"""Split a list into two classes according to the predicate."""
|
"""Split a list into two classes according to the predicate."""
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -3,7 +3,7 @@ import sys
|
||||||
|
|
||||||
import pdfminer as package
|
import pdfminer as package
|
||||||
|
|
||||||
requires = ['six', 'pycryptodome']
|
requires = ['six', 'pycryptodome', 'sortedcontainers']
|
||||||
if sys.version_info >= (3, 0):
|
if sys.version_info >= (3, 0):
|
||||||
requires.append('chardet')
|
requires.append('chardet')
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue