From 0298e26acc4ec917782f2ba2314a464e84f7f63b Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 14 Nov 2009 11:29:40 +0000 Subject: [PATCH] speed-tweak.diff from Yannick Gingras git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@158 1aa58f4a-7d42-0410-adbc-911cccaed67c --- docs/index.html | 9 ++++---- pdfminer/layout.py | 38 ++++++++++++++++++++++----------- pdfminer/psparser.py | 51 +++++++++++++++++++++++++++----------------- pdfminer/utils.py | 10 +++++---- 4 files changed, 68 insertions(+), 40 deletions(-) diff --git a/docs/index.html b/docs/index.html index 1bf6fdb..02b5509 100644 --- a/docs/index.html +++ b/docs/index.html @@ -19,7 +19,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Nov 7 18:11:40 JST 2009 +Last Modified: Fri Nov 13 19:12:36 JST 2009
@@ -41,8 +41,9 @@ Last Modified: Sat Nov 7 18:11:40 JST 2009

What's It?

PDFMiner is a suite of programs that help -extracting and analyzing text data of PDF documents. -Unlike other PDF-related tools, it allows to obtain +extracting some meaningful information out of PDF documents. +Unlike other PDF-related tools, it focuses entirely on getting +and analyzing text data from PDFs. PDFMiner allows to obtain the exact location of texts in a page, as well as other extra information such as font information or ruled lines. It includes a PDF converter that can transform PDF files @@ -59,7 +60,7 @@ PDF parser that can be used for other purposes instead of text analysis.

  • PDF to HTML conversion (with a sample converter web app).
  • Outline (TOC) extraction.
  • Tagged contents extraction. -
  • Infer text running by using clustering technique. +
  • Reconstruct the original layout by grouping text chunks. diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 218856c..7a15ce7 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -57,14 +57,14 @@ class Plane(object): # find(): finds objects that are in a certain area. def find(self, (x0,y0,x1,y1)): - (i0,_) = bsearch(self.xobjs, x0) - (_,i1) = bsearch(self.xobjs, x1) - xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] ) - (i0,_) = bsearch(self.yobjs, y0) - (_,i1) = bsearch(self.yobjs, y1) - yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] ) - objs = xobjs.intersection(yobjs) - return objs + i0 = bsearch(self.xobjs, x0)[0] + i1 = bsearch(self.xobjs, x1)[1] + xobjs = set( [pair[1] for pair in self.xobjs[i0:i1]] ) + i0 = bsearch(self.yobjs, y0)[0] + i1 = bsearch(self.yobjs, y1)[1] + yobjs = [pair[1] for pair in self.yobjs[i0:i1]] + xobjs.intersection_update(yobjs) + return xobjs ## ClusterSet @@ -139,6 +139,13 @@ class LayoutItem(object): def get_bbox(self): return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1) + def is_hoverlap(self, obj): + assert isinstance(obj, LayoutItem) + if self.x1 <= obj.x0 or obj.x1 <= self.x0: + return False + else: + return True + def hoverlap(self, obj): assert isinstance(obj, LayoutItem) if self.x1 <= obj.x0 or obj.x1 <= self.x0: @@ -146,6 +153,13 @@ class LayoutItem(object): else: return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0)) + def is_voverlap(self, obj): + assert isinstance(obj, LayoutItem) + if self.y1 <= obj.y0 or obj.y1 <= self.y0: + return False + else: + return True + def voverlap(self, obj): assert isinstance(obj, LayoutItem) if self.y1 <= obj.y0 or obj.y1 <= self.y0: @@ -473,9 +487,9 @@ class LTPage(LayoutContainer): def vline(obj1, obj2): return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2) def vorder(obj1, obj2): - if obj1.voverlap(obj2): + if obj1.is_voverlap(obj2): return obj2.x1 < obj1.x0 - elif obj1.hoverlap(obj2): + elif obj1.is_hoverlap(obj2): return obj2.y1 < obj1.y0 
else: return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0 @@ -489,9 +503,9 @@ class LTPage(LayoutContainer): def hline(obj1, obj2): return obj1.height * laparams.line_overlap < obj1.voverlap(obj2) def horder(obj1, obj2): - if obj1.hoverlap(obj2): + if obj1.is_hoverlap(obj2): return obj2.y1 < obj1.y0 - elif obj1.voverlap(obj2): + elif obj1.is_voverlap(obj2): return obj1.x1 < obj2.x0 else: return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0 diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py index aa69192..a2d98cf 100644 --- a/pdfminer/psparser.py +++ b/pdfminer/psparser.py @@ -21,23 +21,28 @@ class PSValueError(PSException): pass ## PSObject ## -## Base class for all PS or PDF-related data types. -## -class PSObject(object): pass +class PSObject(object): + + """Base class for all PS or PDF-related data types.""" + + pass ## PSLiteral ## -## Postscript literals are used as identifiers, such as -## variable names, property names and dictionary keys. -## Literals are case sensitive and denoted by a preceding -## slash sign (e.g. "/Name") -## -## Note: Never create an instance of PSLiteral by hand. -## Always use PSLiteralTable.intern(). -## class PSLiteral(PSObject): + """A class that represents a PostScript literal. + + Postscript literals are used as identifiers, such as + variable names, property names and dictionary keys. + Literals are case sensitive and denoted by a preceding + slash sign (e.g. "/Name") + + Note: Do not create an instance of PSLiteral directly. + Always use PSLiteralTable.intern(). + """ + def __init__(self, name): self.name = name return @@ -48,11 +53,18 @@ class PSLiteral(PSObject): ## PSKeyword ## -## Note: Never create an instance of PSLiteral by hand. -## Always use PSKeywordTable.intern(). -## class PSKeyword(PSObject): + """A class that represents a PostScript keyword. + + PostScript keywords are a dozen of predefined words. + Commands and directives in PostScript are expressed by keywords. + They are also used to denote the content boundaries. 
+ + Note: Do not create an instance of PSKeyword directly. + Always use PSKeywordTable.intern(). + """ + def __init__(self, name): self.name = name return @@ -63,14 +75,13 @@ class PSKeyword(PSObject): ## PSSymbolTable ## -## A dictionary-like object that is used for -## storing PSLiteral/PSKeyword objects so that -## an object that has the same name can never be defined -## twice and it is always assured that the same name is -## referred to as the same PSLiteral/PSKeyword object. -## class PSSymbolTable(object): + """A utility class for storing PSLiteral/PSKeyword objects. + + Interned objects can be checked its identity with "is" operator. + """ + def __init__(self, klass): self.dic = {} self.klass = klass diff --git a/pdfminer/utils.py b/pdfminer/utils.py index e390f84..19799cc 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -13,6 +13,7 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): a0*e1+c0*f1+e0, b0*e1+d0*f1+f0) def translate_matrix((a,b,c,d,e,f), (x,y)): + '''Translates a matrix by (x,y).''' return (a,b,c,d,x*a+y*c+e,x*b+y*d+f) def apply_matrix_pt((a,b,c,d,e,f), (x,y)): @@ -29,7 +30,7 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)): # pick def pick(seq, func, maxobj=None): - '''Picks the object that has the highest value of func(obj).''' + '''Picks the object obj where func(obj) has the highest value.''' maxscore = None for obj in seq: score = func(obj) @@ -40,8 +41,9 @@ def pick(seq, func, maxobj=None): # bsearch def bsearch(objs, v0): '''Tries to find the closest value to v0.''' + nb_objs = len(objs) i0 = 0 - i1 = len(objs) + i1 = nb_objs while i0 < i1: i = (i0+i1)/2 (v, obj) = objs[i] @@ -49,7 +51,7 @@ def bsearch(objs, v0): (i0,i1) = (i,i+1) while 0 < i0 and objs[i0-1][0] == v0: i0 -= 1 - while i1 < len(objs)-1 and objs[i1][0] == v0: + while i1 < nb_objs-1 and objs[i1][0] == v0: i1 += 1 break elif v0 < v: @@ -71,7 +73,7 @@ def choplist(n, seq): # nunpack def nunpack(s, default=0): - '''Unpacks up to 4 bytes big endian.''' + 
'''Unpacks 1 to 4 byte integers (big endian).''' l = len(s) if not l: return default