speed-tweak.diff from Yannick Gingras
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@158 1aa58f4a-7d42-0410-adbc-911cccaed67c pull/1/head
parent
61d4872c3a
commit
0298e26acc
|
@ -19,7 +19,7 @@ Python PDF parser and analyzer
|
||||||
|
|
||||||
<div align=right class=lastmod>
|
<div align=right class=lastmod>
|
||||||
<!-- hhmts start -->
|
<!-- hhmts start -->
|
||||||
Last Modified: Sat Nov 7 18:11:40 JST 2009
|
Last Modified: Fri Nov 13 19:12:36 JST 2009
|
||||||
<!-- hhmts end -->
|
<!-- hhmts end -->
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -41,8 +41,9 @@ Last Modified: Sat Nov 7 18:11:40 JST 2009
|
||||||
<h2>What's It?</h2>
|
<h2>What's It?</h2>
|
||||||
<p>
|
<p>
|
||||||
PDFMiner is a suite of programs that help
|
PDFMiner is a suite of programs that help
|
||||||
extracting and analyzing text data of PDF documents.
|
extracting some meaningful information out of PDF documents.
|
||||||
Unlike other PDF-related tools, it allows to obtain
|
Unlike other PDF-related tools, it focuses entirely on getting
|
||||||
|
and analyzing text data from PDFs. PDFMiner allows one to obtain
|
||||||
the exact location of texts in a page, as well as
|
the exact location of texts in a page, as well as
|
||||||
other extra information such as font information or ruled lines.
|
other extra information such as font information or ruled lines.
|
||||||
It includes a PDF converter that can transform PDF files
|
It includes a PDF converter that can transform PDF files
|
||||||
|
@ -59,7 +60,7 @@ PDF parser that can be used for other purposes instead of text analysis.
|
||||||
<li> PDF to HTML conversion (with a sample converter web app).
|
<li> PDF to HTML conversion (with a sample converter web app).
|
||||||
<li> Outline (TOC) extraction.
|
<li> Outline (TOC) extraction.
|
||||||
<li> Tagged contents extraction.
|
<li> Tagged contents extraction.
|
||||||
<li> Infer text running by using clustering technique.
|
<li> Reconstruct the original layout by grouping text chunks.
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
<a name="source"></a>
|
<a name="source"></a>
|
||||||
|
|
|
@ -57,14 +57,14 @@ class Plane(object):
|
||||||
|
|
||||||
# find(): finds objects that are in a certain area.
|
# find(): finds objects that are in a certain area.
|
||||||
def find(self, (x0,y0,x1,y1)):
|
def find(self, (x0,y0,x1,y1)):
|
||||||
(i0,_) = bsearch(self.xobjs, x0)
|
i0 = bsearch(self.xobjs, x0)[0]
|
||||||
(_,i1) = bsearch(self.xobjs, x1)
|
i1 = bsearch(self.xobjs, x1)[1]
|
||||||
xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] )
|
xobjs = set( [pair[1] for pair in self.xobjs[i0:i1]] )
|
||||||
(i0,_) = bsearch(self.yobjs, y0)
|
i0 = bsearch(self.yobjs, y0)[0]
|
||||||
(_,i1) = bsearch(self.yobjs, y1)
|
i1 = bsearch(self.yobjs, y1)[1]
|
||||||
yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] )
|
yobjs = [pair[1] for pair in self.yobjs[i0:i1]]
|
||||||
objs = xobjs.intersection(yobjs)
|
xobjs.intersection_update(yobjs)
|
||||||
return objs
|
return xobjs
|
||||||
|
|
||||||
|
|
||||||
## ClusterSet
|
## ClusterSet
|
||||||
|
@ -139,6 +139,13 @@ class LayoutItem(object):
|
||||||
def get_bbox(self):
|
def get_bbox(self):
|
||||||
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1)
|
||||||
|
|
||||||
|
def is_hoverlap(self, obj):
|
||||||
|
assert isinstance(obj, LayoutItem)
|
||||||
|
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
def hoverlap(self, obj):
|
def hoverlap(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
if self.x1 <= obj.x0 or obj.x1 <= self.x0:
|
||||||
|
@ -146,6 +153,13 @@ class LayoutItem(object):
|
||||||
else:
|
else:
|
||||||
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
|
||||||
|
|
||||||
|
def is_voverlap(self, obj):
|
||||||
|
assert isinstance(obj, LayoutItem)
|
||||||
|
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
def voverlap(self, obj):
|
def voverlap(self, obj):
|
||||||
assert isinstance(obj, LayoutItem)
|
assert isinstance(obj, LayoutItem)
|
||||||
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
if self.y1 <= obj.y0 or obj.y1 <= self.y0:
|
||||||
|
@ -473,9 +487,9 @@ class LTPage(LayoutContainer):
|
||||||
def vline(obj1, obj2):
|
def vline(obj1, obj2):
|
||||||
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
|
return obj1.width * laparams.line_overlap < obj1.hoverlap(obj2)
|
||||||
def vorder(obj1, obj2):
|
def vorder(obj1, obj2):
|
||||||
if obj1.voverlap(obj2):
|
if obj1.is_voverlap(obj2):
|
||||||
return obj2.x1 < obj1.x0
|
return obj2.x1 < obj1.x0
|
||||||
elif obj1.hoverlap(obj2):
|
elif obj1.is_hoverlap(obj2):
|
||||||
return obj2.y1 < obj1.y0
|
return obj2.y1 < obj1.y0
|
||||||
else:
|
else:
|
||||||
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
|
return obj2.x1 < obj1.x0 and obj2.y1 < obj1.y0
|
||||||
|
@ -489,9 +503,9 @@ class LTPage(LayoutContainer):
|
||||||
def hline(obj1, obj2):
|
def hline(obj1, obj2):
|
||||||
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
|
return obj1.height * laparams.line_overlap < obj1.voverlap(obj2)
|
||||||
def horder(obj1, obj2):
|
def horder(obj1, obj2):
|
||||||
if obj1.hoverlap(obj2):
|
if obj1.is_hoverlap(obj2):
|
||||||
return obj2.y1 < obj1.y0
|
return obj2.y1 < obj1.y0
|
||||||
elif obj1.voverlap(obj2):
|
elif obj1.is_voverlap(obj2):
|
||||||
return obj1.x1 < obj2.x0
|
return obj1.x1 < obj2.x0
|
||||||
else:
|
else:
|
||||||
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
return obj1.x1 < obj2.x0 and obj2.y1 < obj1.y0
|
||||||
|
|
|
@ -21,23 +21,28 @@ class PSValueError(PSException): pass
|
||||||
|
|
||||||
## PSObject
|
## PSObject
|
||||||
##
|
##
|
||||||
## Base class for all PS or PDF-related data types.
|
class PSObject(object):
|
||||||
##
|
|
||||||
class PSObject(object): pass
|
"""Base class for all PS or PDF-related data types."""
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
## PSLiteral
|
## PSLiteral
|
||||||
##
|
##
|
||||||
## Postscript literals are used as identifiers, such as
|
|
||||||
## variable names, property names and dictionary keys.
|
|
||||||
## Literals are case sensitive and denoted by a preceding
|
|
||||||
## slash sign (e.g. "/Name")
|
|
||||||
##
|
|
||||||
## Note: Never create an instance of PSLiteral by hand.
|
|
||||||
## Always use PSLiteralTable.intern().
|
|
||||||
##
|
|
||||||
class PSLiteral(PSObject):
|
class PSLiteral(PSObject):
|
||||||
|
|
||||||
|
"""A class that represents a PostScript literal.
|
||||||
|
|
||||||
|
Postscript literals are used as identifiers, such as
|
||||||
|
variable names, property names and dictionary keys.
|
||||||
|
Literals are case sensitive and denoted by a preceding
|
||||||
|
slash sign (e.g. "/Name")
|
||||||
|
|
||||||
|
Note: Do not create an instance of PSLiteral directly.
|
||||||
|
Always use PSLiteralTable.intern().
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
return
|
||||||
|
@ -48,11 +53,18 @@ class PSLiteral(PSObject):
|
||||||
|
|
||||||
## PSKeyword
|
## PSKeyword
|
||||||
##
|
##
|
||||||
## Note: Never create an instance of PSLiteral by hand.
|
|
||||||
## Always use PSKeywordTable.intern().
|
|
||||||
##
|
|
||||||
class PSKeyword(PSObject):
|
class PSKeyword(PSObject):
|
||||||
|
|
||||||
|
"""A class that represents a PostScript keyword.
|
||||||
|
|
||||||
|
PostScript keywords are a dozen or so predefined words.
|
||||||
|
Commands and directives in PostScript are expressed by keywords.
|
||||||
|
They are also used to denote the content boundaries.
|
||||||
|
|
||||||
|
Note: Do not create an instance of PSKeyword directly.
|
||||||
|
Always use PSKeywordTable.intern().
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name):
|
||||||
self.name = name
|
self.name = name
|
||||||
return
|
return
|
||||||
|
@ -63,14 +75,13 @@ class PSKeyword(PSObject):
|
||||||
|
|
||||||
## PSSymbolTable
|
## PSSymbolTable
|
||||||
##
|
##
|
||||||
## A dictionary-like object that is used for
|
|
||||||
## storing PSLiteral/PSKeyword objects so that
|
|
||||||
## an object that has the same name can never be defined
|
|
||||||
## twice and it is always assured that the same name is
|
|
||||||
## referred to as the same PSLiteral/PSKeyword object.
|
|
||||||
##
|
|
||||||
class PSSymbolTable(object):
|
class PSSymbolTable(object):
|
||||||
|
|
||||||
|
"""A utility class for storing PSLiteral/PSKeyword objects.
|
||||||
|
|
||||||
|
Interned objects can be checked for identity with the "is" operator.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, klass):
|
def __init__(self, klass):
|
||||||
self.dic = {}
|
self.dic = {}
|
||||||
self.klass = klass
|
self.klass = klass
|
||||||
|
|
|
@ -13,6 +13,7 @@ def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)):
|
||||||
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
a0*e1+c0*f1+e0, b0*e1+d0*f1+f0)
|
||||||
|
|
||||||
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
def translate_matrix((a,b,c,d,e,f), (x,y)):
|
||||||
|
'''Translates a matrix by (x,y).'''
|
||||||
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
|
return (a,b,c,d,x*a+y*c+e,x*b+y*d+f)
|
||||||
|
|
||||||
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
def apply_matrix_pt((a,b,c,d,e,f), (x,y)):
|
||||||
|
@ -29,7 +30,7 @@ def apply_matrix_norm((a,b,c,d,e,f), (p,q)):
|
||||||
|
|
||||||
# pick
|
# pick
|
||||||
def pick(seq, func, maxobj=None):
|
def pick(seq, func, maxobj=None):
|
||||||
'''Picks the object that has the highest value of func(obj).'''
|
'''Picks the object obj where func(obj) has the highest value.'''
|
||||||
maxscore = None
|
maxscore = None
|
||||||
for obj in seq:
|
for obj in seq:
|
||||||
score = func(obj)
|
score = func(obj)
|
||||||
|
@ -40,8 +41,9 @@ def pick(seq, func, maxobj=None):
|
||||||
# bsearch
|
# bsearch
|
||||||
def bsearch(objs, v0):
|
def bsearch(objs, v0):
|
||||||
'''Tries to find the closest value to v0.'''
|
'''Tries to find the closest value to v0.'''
|
||||||
|
nb_objs = len(objs)
|
||||||
i0 = 0
|
i0 = 0
|
||||||
i1 = len(objs)
|
i1 = nb_objs
|
||||||
while i0 < i1:
|
while i0 < i1:
|
||||||
i = (i0+i1)/2
|
i = (i0+i1)/2
|
||||||
(v, obj) = objs[i]
|
(v, obj) = objs[i]
|
||||||
|
@ -49,7 +51,7 @@ def bsearch(objs, v0):
|
||||||
(i0,i1) = (i,i+1)
|
(i0,i1) = (i,i+1)
|
||||||
while 0 < i0 and objs[i0-1][0] == v0:
|
while 0 < i0 and objs[i0-1][0] == v0:
|
||||||
i0 -= 1
|
i0 -= 1
|
||||||
while i1 < len(objs)-1 and objs[i1][0] == v0:
|
while i1 < nb_objs-1 and objs[i1][0] == v0:
|
||||||
i1 += 1
|
i1 += 1
|
||||||
break
|
break
|
||||||
elif v0 < v:
|
elif v0 < v:
|
||||||
|
@ -71,7 +73,7 @@ def choplist(n, seq):
|
||||||
|
|
||||||
# nunpack
|
# nunpack
|
||||||
def nunpack(s, default=0):
|
def nunpack(s, default=0):
|
||||||
'''Unpacks up to 4 bytes big endian.'''
|
'''Unpacks 1 to 4 byte integers (big endian).'''
|
||||||
l = len(s)
|
l = len(s)
|
||||||
if not l:
|
if not l:
|
||||||
return default
|
return default
|
||||||
|
|
Loading…
Reference in New Issue