diff --git a/.gitignore b/.gitignore deleted file mode 100644 index c81d6cf..0000000 --- a/.gitignore +++ /dev/null @@ -1,111 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo -*.pot - -# Django stuff: -*.log - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ -======= -# Intermediate documents -*.xps - -# Password and Key Files -*.pem -*.p12 - -# Compiled source # -################### -*.pyc -*.com -*.class -*.dll -*.exe -*.o -*.so - -# Mecurial Files? -*.i -*.d -*.mo -*.hg/ - -# Python data files # -*.shelf -*.shelve - -# Don't track these files, they are output from scripts - -# Subversion files -*.svn-base -all-wcprops -entries - -# Logs and databases # -###################### -bulkloader-log-* -*.log -*.sql -*.sql3 -*.sql3-journal -*.sqlite - -# OS generated files # -###################### -.DS_Store? -ehthumbs.db -Icon? -Thumbs.db - -# tmp files # -############# -~$*.doc -~WRL*.tmp - -*.gz \ No newline at end of file diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 3d52859..9426ad3 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -import logging from .utils import INF from .utils import Plane from .utils import get_bound @@ -356,9 +355,6 @@ class LTTextLine(LTTextContainer): LTContainer.add(self, LTAnno('\n')) return - def group_line_neighbors(self, objs, ratio): - return objs - def find_neighbors(self, plane, ratio): raise NotImplementedError @@ -372,67 +368,21 @@ class LTTextLineHorizontal(LTTextLine): def add(self, obj): if isinstance(obj, LTChar) and self.word_margin: - # Add a space between words if separated by more than word_margin - # apart. Use the max of obj width and height so narrow letters - # (i, l) are treated like wider letters, reducing extra spaces margin = self.word_margin * max(obj.width, obj.height) if self._x1 < obj.x0-margin: - # But only do it if there is not already a space. - last_was_alpha = self._objs and \ - isinstance(self._objs[-1], LTChar) and \ - self._objs[-1]._text == ' ' - if not last_was_alpha: - LTContainer.add(self, LTAnno(' ')) + LTContainer.add(self, LTAnno(' ')) self._x1 = obj.x1 LTTextLine.add(self, obj) return - def is_neighbor(self, obj, d, same_line = False): - # Horizontal lines can only connect with horizontal lines - if not isinstance(obj, LTTextLineHorizontal): - return False - # Ensure they are vertically close - if abs(obj.height-self.height) >= d: - return False - # Ensure that they have similar start or stop x positions - if not (abs(obj.x0-self.x0) < d or - abs(obj.x1-self.x1) < d or - # Or that they intersect eachother horizontally. - (obj.x0 < self.x0 and obj.x1 > self.x0) or - (obj.x0 > self.x0 and obj.x0 < self.x1)): - return False - if same_line and not ( - # Ensure they have similar - (obj.y0 == self.y0) or (obj.y1 == self.y1) or - (obj.y0 < self.y0 and obj.y0 > self.y1) or - (obj.y0 > self.y0 and obj.y1 < self.y0)): - return False - return True - - def group_line_neighbors(self, objs, ratio): - ''' - Given a set of objects that may or may not be on the same line as this, - add the objects that are on the same line. - - Return the objects that are not on the same line. - ''' - d = ratio*self.height - other_lines = [] - for o in objs: - if o == self: - other_lines.append(o) - elif self.is_neighbor(o, d, same_line=True): - [self.add(oc) for oc in o] - # Clear out the old line - o._objs = [] - else: - other_lines.append(o) - return other_lines - def find_neighbors(self, plane, ratio): d = ratio*self.height objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d)) - return [o for o in objs if self.is_neighbor(o, d)] + return [obj for obj in objs + if (isinstance(obj, LTTextLineHorizontal) and + abs(obj.height-self.height) < d and + (abs(obj.x0-self.x0) < d or + abs(obj.x1-self.x1) < d))] class LTTextLineVertical(LTTextLine): @@ -563,9 +513,7 @@ class LTLayoutContainer(LTContainer): (min(obj0.height, obj1.height) * laparams.line_overlap < obj0.voverlap(obj1)) and (obj0.hdistance(obj1) < - max(obj0.width, obj1.width) * laparams.char_margin) or - # If the line is zero width, default to horizontal - (max(obj0.width, obj1.width) == 0 and obj1.x0 >= obj0.x0)) + max(obj0.width, obj1.width) * laparams.char_margin)) # valign: obj0 and obj1 is vertically aligned. # @@ -587,10 +535,7 @@ class LTLayoutContainer(LTContainer): (min(obj0.width, obj1.width) * laparams.line_overlap < obj0.hoverlap(obj1)) and (obj0.vdistance(obj1) < - max(obj0.height, obj1.height) * laparams.char_margin) and - # Don't start a vertical line if the previous letter is - # whitspace. Prevents double spaces being caught as vert lines. - (line or obj0._text.strip())) + max(obj0.height, obj1.height) * laparams.char_margin)) if ((halign and isinstance(line, LTTextLineHorizontal)) or (valign and isinstance(line, LTTextLineVertical))): @@ -619,25 +564,19 @@ class LTLayoutContainer(LTContainer): yield line return - # group_textlines: group neighbouring lines to textboxes. + # group_textlines: group neighboring lines to textboxes. def group_textlines(self, laparams, lines): plane = Plane(self.bbox) plane.extend(lines) boxes = {} - # for line in plane: - # print "line", ("".join([s._text for s in line])).encode('ascii', 'ignore') for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) - if line not in neighbors: - logging.error("Line cannot find itself: %s"%line) - continue - neighbors = line.group_line_neighbors(neighbors, laparams.line_margin) + if line not in neighbors: continue members = [] for obj1 in neighbors: members.append(obj1) if obj1 in boxes: members.extend(boxes.pop(obj1)) - # print "members: ", ["".join([o._text for o in line]) for line in members] if isinstance(line, LTTextLineHorizontal): box = LTTextBoxHorizontal() else: @@ -692,19 +631,8 @@ class LTLayoutContainer(LTContainer): (c,d,_,_) = t return (c,d) - # The algorithm below still takes O(n^2) :( - # For now, if we have many boxes, split them into two and perform them - # separately. This will cause bugs, but will prevent hanging. - if len(boxes) > 100: - boxes = sorted(boxes, key=lambda obj: obj.y0) - # Divide in two, then perform grouping - # print "Making Recursive Call %d"%len(boxes) - bot_boxes = self.group_textboxes(laparams, boxes[:len(boxes)/2]) - top_boxes = self.group_textboxes(laparams, boxes[len(boxes)/2:]) - boxes = bot_boxes + top_boxes - + # XXX this still takes O(n^2) :( dists = [] - # Calculate the distance between each box for i in xrange(len(boxes)): obj1 = boxes[i] for j in xrange(i+1, len(boxes)): @@ -714,20 +642,16 @@ class LTLayoutContainer(LTContainer): dists = csort(dists, key=key_obj) plane = Plane(self.bbox) plane.extend(boxes) - # Start with the two closest objects while dists: (c, d, obj1, obj2) = dists.pop(0) - # If there are any objects in between, then skip these two if c == 0 and isany(obj1, obj2): dists.append((1, d, obj1, obj2)) continue - # Group these two closest objects if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) - # Remove the two individual objects plane.remove(obj1) plane.remove(obj2) dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists @@ -801,4 +725,4 @@ class LTPage(LTLayoutContainer): def __repr__(self): return ('<%s(%r) %s rotate=%r>' % (self.__class__.__name__, self.pageid, - bbox2str(self.bbox), self.rotate)) + bbox2str(self.bbox), self.rotate)) \ No newline at end of file