git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@79 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2009-03-29 13:21:21 +00:00
parent c5991f74ff
commit 68cc99379d
3 changed files with 46 additions and 22 deletions

View File

@ -17,7 +17,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Wed Mar 25 20:59:18 JST 2009
Last Modified: Sun Mar 29 19:09:46 JST 2009
<!-- hhmts end -->
</div>
@ -202,6 +202,7 @@ This makes the word spacing correctly handled.
<dd> Specifies the output format. The following formats are currently supported.
<ul>
<li> <code>html</code> : HTML format. (Default)
<li> <code>text</code> : TEXT format.
<li> <code>sgml</code> : SGML format.
<li> <code>tag</code> : "Tagged PDF" format. A tagged PDF has its own contents annotated with
HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations.

View File

@ -59,9 +59,9 @@ class Plane(object):
return objs
## Clusters
## ClusterSet
##
class Clusters(object):
class ClusterSet(object):
def __init__(self):
self.clusters = {}
@ -86,11 +86,12 @@ class Clusters(object):
def cluster_pageobjs(objs, ratio):
idx = dict( (obj,i) for (i,obj) in enumerate(objs) )
plane = Plane()
for obj in objs:
plane.add(obj.bbox, obj)
plane.finish()
clusters = Clusters()
cset = ClusterSet()
for obj in objs:
(bx0,by0,bx1,by1) = obj.bbox
margin = abs(obj.fontsize * ratio)
@ -100,17 +101,26 @@ def cluster_pageobjs(objs, ratio):
y1 = max(by0,by1)
found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin))
if len(found) == 1:
clusters.add(found.pop())
cset.add(found.pop())
else:
clusters.merge(found)
cset.merge(found)
clusters = sorted(cset.finish(), key=lambda objs: idx[objs[0]])
r = []
for objs in clusters.finish():
for objs in clusters:
objs = sorted(objs, key=lambda obj: idx[obj])
h = v = 0
(bx0,by0,bx1,by1) = objs[0].bbox
(lx0,ly0,_,_) = objs[0].bbox
for obj in objs[1:]:
(x0,y0,x1,y1) = obj.bbox
if len(obj.text) == 1 and abs(lx0-x0) < abs(ly0-y0):
v += 1
else:
h += 1
(lx0,ly0) = (x0,y0)
bx0 = min(bx0, x0)
bx1 = max(bx1, x1)
by0 = min(by0, y0)
by1 = max(by1, y1)
r.append(((bx0,by0,bx1,by1), objs))
r.append(((bx0,by0,bx1,by1), h < v, objs))
return r

View File

@ -2,7 +2,7 @@
import sys
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
from pdfdevice import PDFDevice, PageItem, FigureItem, TextItem, PDFPageAggregator
from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB
@ -15,6 +15,15 @@ def encprops(props, codec):
if not props: return ''
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
def get_textobjs(item, r=None):
    """Collect every TextItem reachable from *item* into a flat list.

    A TextItem is appended as-is. A PageItem is searched recursively
    through its ``objs`` children, in the order they are stored. Any other
    kind of item contributes nothing.

    item -- the object to inspect (a TextItem, a PageItem, or anything else).
    r    -- optional list to append results to; a new list is created when
            it is omitted or None.

    Returns the list that the results were appended to (``r`` itself when
    one was supplied).
    """
    # Identity test: None is a singleton, and "== None" could be hijacked
    # by an argument that defines its own __eq__.
    if r is None:
        r = []
    if isinstance(item, TextItem):
        r.append(item)
    elif isinstance(item, PageItem):
        # The same accumulator is shared across the whole recursion, so
        # nested containers are flattened into one list.
        for child in item.objs:
            get_textobjs(child, r)
    return r
## PDFConverter
class PDFConverter(PDFPageAggregator):
@ -73,7 +82,8 @@ class HTMLConverter(PDFConverter):
page = PDFConverter.end_page(self, page)
def f(item):
if isinstance(item, FigureItem):
pass
for child in item.objs:
f(child)
elif isinstance(item, TextItem):
if item.direction == 2:
wmode = 'tb-rl'
@ -95,8 +105,8 @@ class HTMLConverter(PDFConverter):
for child in page.objs:
f(child)
if self.cluster_margin:
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin):
clusters = cluster_pageobjs(get_textobjs(page), self.cluster_margin)
for ((x0,y0,x1,y1),_,objs) in clusters:
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
@ -114,7 +124,7 @@ class HTMLConverter(PDFConverter):
##
class TextConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.5, splitwords=False, hyphenation=True):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
self.pagenum = pagenum
self.cluster_margin = cluster_margin
@ -125,15 +135,18 @@ class TextConverter(PDFConverter):
from cluster import cluster_pageobjs
page = PDFConverter.end_page(self, page)
if self.cluster_margin:
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) )
textobjs = get_textobjs(page)
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
clusters.sort(key=lambda (_,objs): idx[objs[0]])
for (_,objs) in clusters:
for item in sorted(objs, key=lambda obj:idx[obj]):
text = item.text
self.outfp.write(text.encode(self.codec, 'replace'))
for (_,vertical,objs) in clusters:
for (i,item) in enumerate(objs):
(x0,y0,x1,y1) = item.bbox
if (i and
((not vertical and (y1 < ly0 or ly1 < y0)) or
(vertical and (x1 < lx0 or lx1 < x0)))):
self.outfp.write('\n')
(lx0,ly0,lx1,ly1) = (x0,y0,x1,y1)
self.outfp.write(item.text.encode(self.codec, 'replace'))
self.outfp.write('\n\n')
else:
for item in page.objs:
if isinstance(item, TextItem):
@ -243,7 +256,7 @@ def main(argv):
codec = 'ascii'
pagenos = set()
maxpages = 0
outtype = 'text'
outtype = 'html'
password = ''
splitwords = False
outfp = sys.stdout