git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@79 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
c5991f74ff
commit
68cc99379d
|
@ -17,7 +17,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Wed Mar 25 20:59:18 JST 2009
|
||||
Last Modified: Sun Mar 29 19:09:46 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -202,6 +202,7 @@ This makes the word spacing correctly handled.
|
|||
<dd> Specifies the output format. The following formats are currently supported.
|
||||
<ul>
|
||||
<li> <code>html</code> : HTML format. (Default)
|
||||
<li> <code>text</code> : TEXT format.
|
||||
<li> <code>sgml</code> : SGML format.
|
||||
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
|
||||
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.
|
||||
|
|
|
@ -59,9 +59,9 @@ class Plane(object):
|
|||
return objs
|
||||
|
||||
|
||||
## Clusters
|
||||
## ClusterSet
|
||||
##
|
||||
class Clusters(object):
|
||||
class ClusterSet(object):
|
||||
|
||||
def __init__(self):
|
||||
self.clusters = {}
|
||||
|
@ -86,11 +86,12 @@ class Clusters(object):
|
|||
|
||||
|
||||
def cluster_pageobjs(objs, ratio):
|
||||
idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
|
||||
plane = Plane()
|
||||
for obj in objs:
|
||||
plane.add(obj.bbox, obj)
|
||||
plane.finish()
|
||||
clusters = Clusters()
|
||||
cset = ClusterSet()
|
||||
for obj in objs:
|
||||
(bx0,by0,bx1,by1) = obj.bbox
|
||||
margin = abs(obj.fontsize * ratio)
|
||||
|
@ -100,17 +101,26 @@ def cluster_pageobjs(objs, ratio):
|
|||
y1 = max(by0,by1)
|
||||
found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
|
||||
if len(found) == 1:
|
||||
clusters.add(found.pop())
|
||||
cset.add(found.pop())
|
||||
else:
|
||||
clusters.merge(found)
|
||||
cset.merge(found)
|
||||
clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
|
||||
r = []
|
||||
for objs in clusters.finish():
|
||||
for objs in clusters:
|
||||
objs = sorted(objs, key=lambda obj: idx[obj])
|
||||
h = v = 0
|
||||
(bx0,by0,bx1,by1) = objs[0].bbox
|
||||
(lx0,ly0,_,_) = objs[0].bbox
|
||||
for obj in objs[1:]:
|
||||
(x0,y0,x1,y1) = obj.bbox
|
||||
if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
|
||||
v += 1
|
||||
else:
|
||||
h += 1
|
||||
(lx0,ly0) = (x0,y0)
|
||||
bx0 = min(bx0, x0)
|
||||
bx1 = max(bx1, x1)
|
||||
by0 = min(by0, y0)
|
||||
by1 = max(by1, y1)
|
||||
r.append(((bx0,by0,bx1,by1), objs))
|
||||
r.append(((bx0,by0,bx1,by1), h < v, objs))
|
||||
return r
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
import sys
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
|
||||
from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from cmap import CMapDB
|
||||
|
||||
|
@ -15,6 +15,15 @@ def encprops(props, codec):
|
|||
if not props: return ''
|
||||
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
|
||||
|
||||
def get_textobjs(item, r=None):
    """Collect the TextItem objects found in ``item``, depth first.

    ``item`` is inspected as follows:
      * a TextItem is appended to the result list;
      * a PageItem has each entry of its ``objs`` attribute visited
        recursively, in order;
      * anything else is ignored.

    ``r`` is the accumulator list. Callers normally omit it; a fresh list
    is created when it is None. When a list is supplied it is extended in
    place, which is how the recursive calls share one result.

    Returns the accumulator list.
    """
    # Identity test: None is a singleton, and "== None" would invoke any
    # custom __eq__ defined by the object passed in.
    if r is None:
        r = []
    if isinstance(item, TextItem):
        r.append(item)
    elif isinstance(item, PageItem):
        for child in item.objs:
            get_textobjs(child, r)
    return r
|
||||
|
||||
|
||||
## PDFConverter
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
@ -73,7 +82,8 @@ class HTMLConverter(PDFConverter):
|
|||
page = PDFConverter.end_page(self, page)
|
||||
def f(item):
|
||||
if isinstance(item, FigureItem):
|
||||
pass
|
||||
for child in item.objs:
|
||||
f(child)
|
||||
elif isinstance(item, TextItem):
|
||||
if item.direction == 2:
|
||||
wmode = 'tb-rl'
|
||||
|
@ -95,8 +105,8 @@ class HTMLConverter(PDFConverter):
|
|||
for child in page.objs:
|
||||
f(child)
|
||||
if self.cluster_margin:
|
||||
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
|
||||
for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin):
|
||||
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
|
||||
for ((x0,y0,x1,y1),_,objs) in clusters:
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
|
@ -114,7 +124,7 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True):
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
||||
self.pagenum = pagenum
|
||||
self.cluster_margin = cluster_margin
|
||||
|
@ -125,15 +135,18 @@ class TextConverter(PDFConverter):
|
|||
from cluster import cluster_pageobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.cluster_margin:
|
||||
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
|
||||
idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) )
|
||||
textobjs = get_textobjs(page)
|
||||
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
|
||||
clusters.sort(key=lambda (_,objs): idx[objs[0]])
|
||||
for (_,objs) in clusters:
|
||||
for item in sorted(objs, key=lambda obj:idx[obj]):
|
||||
text = item.text
|
||||
self.outfp.write(text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
for (_,vertical,objs) in clusters:
|
||||
for (i,item) in enumerate(objs):
|
||||
(x0,y0,x1,y1) = item.bbox
|
||||
if (i and
|
||||
((not vertical and (y1 < ly0 or ly1 < y0)) or
|
||||
(vertical and (x1 < lx0 or lx1 < x0)))):
|
||||
self.outfp.write('\n')
|
||||
(lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
|
||||
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n\n')
|
||||
else:
|
||||
for item in page.objs:
|
||||
if isinstance(item, TextItem):
|
||||
|
@ -243,7 +256,7 @@ def main(argv):
|
|||
codec = 'ascii'
|
||||
pagenos = set()
|
||||
maxpages = 0
|
||||
outtype = 'text'
|
||||
outtype = 'html'
|
||||
password = ''
|
||||
splitwords = False
|
||||
outfp = sys.stdout
|
||||
|
|
Loading…
Reference in New Issue