git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@78 1aa58f4a-7d42-0410-adbc-911cccaed67c

pull/1/head
yusuke.shinyama.dummy 2009-03-28 17:23:53 +00:00
parent b366ad270f
commit c5991f74ff
5 changed files with 200 additions and 26 deletions

View File

@ -17,7 +17,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod>
<!-- hhmts start -->
Last Modified: Wed Mar 25 08:10:02 JST 2009
Last Modified: Wed Mar 25 20:59:18 JST 2009
<!-- hhmts end -->
</div>
@ -76,6 +76,22 @@ http://pdf2html.tabesugi.net:8080/
<a name="install"></a>
<hr noshade>
<h2>How to Install</h2>
<p>
<strong>Note:</strong>
This software does not yet work out of the box.
You have to download and unpack it manually,
and spend some time making it work.
<strong>Your will</strong> is needed!
I do not support easy_install or setup.py or any automated installation until
this is very polished to the point that it really should be widely distributed.
(For now, it's not yet up to the standard, IMO.)
Until then, I don't want to help scatter this
poorly tested/documented/supported piece of software across your system.
Having said that,
I won't stop anyone from repackaging or redistributing this as a more
sophisticated package, in which case it is their responsibility
to properly test, document and maintain the derived packages.
<ol>
<li> Install <a href="http://www.python.org/download/">Python</a> 2.5 or newer.
<li> Download the <a href="#source">PDFMiner source</a>.

6
TODO
View File

@ -1,7 +1,5 @@
TODOs:
- Better API Documentation.
- Error handling for invalid type.
- Infer text stream by clustering.
- Robust error handling.
- Any special handling for linearized PDFs?
- Support writing/creating PDFs.
- Any special treatments for linearized PDFs?

116
pdflib/cluster.py Normal file
View File

@ -0,0 +1,116 @@
#!/usr/bin/env python
import sys
## binary search
##
def bsearch(objs, v0, v1):
    """Return the payloads of every entry whose key lies within [v0, v1].

    objs must be a list of (key, payload) pairs sorted by key in
    ascending order.  Both bounds are inclusive.  The payloads are
    returned in list order; an empty list is returned when no key falls
    inside the range (including when objs itself is empty).

    v0 must not be greater than v1 (checked with an assertion).
    """
    assert v0 <= v1
    lo = 0
    hi = len(objs) - 1
    while lo <= hi:
        # Floor division: the midpoint must stay an integer index on
        # Python 3 as well as Python 2.
        mid = (lo + hi) // 2
        key = objs[mid][0]
        if key < v0:
            lo = mid + 1
        elif v1 < key:
            hi = mid - 1
        else:
            # objs[mid] is inside the range; walk outwards in both
            # directions to pick up every neighbour that is inside too.
            first = mid
            while 0 < first and not (objs[first - 1][0] < v0):
                first -= 1
            last = mid
            while last < len(objs) - 1 and not (v1 < objs[last + 1][0]):
                last += 1
            return [obj for (_, obj) in objs[first:last + 1]]
    return []
## Plane
##
class Plane(object):
    """Index of objects by the edges of their bounding boxes.

    Usage: call add() for every object, then finish() once, then find().
    Each object contributes two entries to xobjs (one per x edge) and
    two entries to yobjs (one per y edge).
    """

    def __init__(self):
        self.xobjs = []  # (x coordinate, obj) pairs
        self.yobjs = []  # (y coordinate, obj) pairs
        return

    def add(self, bbox, obj):
        """Register obj under the four edges of bbox = (x0, y0, x1, y1)."""
        # Unpacked here rather than in the signature: tuple parameters
        # are not valid syntax on Python 3.
        (x0, y0, x1, y1) = bbox
        self.xobjs.append((x0, obj))
        self.xobjs.append((x1, obj))
        self.yobjs.append((y0, obj))
        self.yobjs.append((y1, obj))
        return

    def finish(self):
        """Sort both edge lists so that find() can binary-search them."""
        # Order by coordinate only; comparing whole (coord, obj) pairs
        # would fall back to comparing the payload objects on ties.
        self.xobjs.sort(key=lambda pair: pair[0])
        self.yobjs.sort(key=lambda pair: pair[0])
        return

    def find(self, bbox):
        """Return the set of objects matching bbox = (x0, y0, x1, y1).

        An object matches when at least one of its x edges lies within
        [x0, x1] and at least one of its y edges lies within [y0, y1].
        An object whose edges all lie outside the query range on some
        axis (e.g. one that spans the whole query) is not matched.

        Requires x0 <= x1 and y0 <= y1 (asserted by bsearch) and that
        finish() has been called after the last add().
        """
        (x0, y0, x1, y1) = bbox
        xobjs = set(bsearch(self.xobjs, x0, x1))
        yobjs = set(bsearch(self.yobjs, y0, y1))
        return xobjs.intersection(yobjs)
## Clusters
##
class Clusters(object):
    """Bookkeeping for groups of objects that belong together.

    self.clusters maps every known object to the tuple holding all
    members of its cluster; all members of one cluster share the same
    tuple.
    """

    def __init__(self):
        self.clusters = {}
        return

    def add(self, obj):
        """Register obj as a cluster of its own unless it is already known."""
        # Overwriting an existing entry would detach obj from a cluster
        # whose other members still list it, so known objects are left
        # untouched.
        if obj not in self.clusters:
            self.clusters[obj] = (obj,)
        return

    def merge(self, objs):
        """Join objs, and the clusters they already belong to, into one."""
        members = set(objs)
        # Iterate over a snapshot: members grows inside the loop, and
        # objs itself may be a one-shot iterable.
        for obj in tuple(members):
            members.update(self.clusters.get(obj, ()))
        merged = tuple(members)
        for obj in members:
            self.clusters[obj] = merged
        return

    def finish(self):
        """Return the set of distinct clusters, each a tuple of objects."""
        # values() exists on both Python 2 and 3; itervalues() does not.
        return set(self.clusters.values())
def cluster_pageobjs(objs, ratio):
    """Group page objects that lie close to each other.

    Every object in objs must provide a bbox attribute (x0, y0, x1, y1)
    and a numeric fontsize attribute.  For each object its bounding box
    is grown on all sides by abs(fontsize * ratio) and every object that
    Plane.find() reports for the grown box is merged into the same
    cluster.

    Returns a list of ((x0, y0, x1, y1), members) pairs, one per
    cluster, where the box is the union of the member boxes and members
    is the tuple of clustered objects.  The order of the list and of
    each members tuple is unspecified.
    """
    objs = list(objs)  # iterated twice below
    plane = Plane()
    for obj in objs:
        plane.add(obj.bbox, obj)
    plane.finish()

    clusters = Clusters()
    for obj in objs:
        (bx0, by0, bx1, by1) = obj.bbox
        margin = abs(obj.fontsize * ratio)
        # The corners may come in either order; normalize before growing.
        x0 = min(bx0, bx1)
        y0 = min(by0, by1)
        x1 = max(bx0, bx1)
        y1 = max(by0, by1)
        found = plane.find((x0 - margin, y0 - margin, x1 + margin, y1 + margin))
        # Always merge, even for a single hit: merging keeps any cluster
        # the object was already placed in by an earlier neighbour,
        # whereas re-adding it would leave it listed in two clusters.
        clusters.merge(found)

    r = []
    for group in clusters.finish():
        # Union of the member boxes, tolerating either corner order just
        # like the query box above.
        xs = []
        ys = []
        for obj in group:
            (x0, y0, x1, y1) = obj.bbox
            xs.extend((x0, x1))
            ys.extend((y0, y1))
        r.append(((min(xs), min(ys), max(xs), max(ys)), group))
    return r

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
@ -18,8 +16,8 @@ def encprops(props, codec):
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
## TextConverter
class TextConverter(PDFPageAggregator):
## PDFConverter
class PDFConverter(PDFPageAggregator):
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
@ -30,11 +28,10 @@ class TextConverter(PDFPageAggregator):
## SGMLConverter
##
class SGMLConverter(TextConverter):
class SGMLConverter(PDFConverter):
def end_page(self, page):
TextConverter.end_page(self, page)
page = self.cur_item
page = PDFConverter.end_page(self, page)
def f(item):
bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox
if isinstance(item, FigureItem):
@ -58,21 +55,22 @@ class SGMLConverter(TextConverter):
## HTMLConverter
##
class HTMLConverter(TextConverter):
class HTMLConverter(PDFConverter):
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, splitwords=False):
TextConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
self.pagenum = pagenum
self.pagepad = pagepad
self.scale = scale
self.outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
self.outfp.write('</head><body>\n')
self.yoffset = self.pagepad
self.cluster_margin = cluster_margin
return
def end_page(self, page):
TextConverter.end_page(self, page)
page = self.cur_item
from cluster import cluster_pageobjs
page = PDFConverter.end_page(self, page)
def f(item):
if isinstance(item, FigureItem):
pass
@ -96,6 +94,12 @@ class HTMLConverter(TextConverter):
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
for child in page.objs:
f(child)
if self.cluster_margin:
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin):
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
self.yoffset += self.pagepad
return
@ -106,6 +110,41 @@ class HTMLConverter(TextConverter):
return
## TextConverter
##
# Converter that writes the text of each page's TextItem objects to outfp,
# encoded with the configured codec.
# NOTE(review): leading indentation is not visible in this view of the file;
# the comments below describe statements, not their exact nesting.
class TextConverter(PDFConverter):
# Stores the output options on the instance.
# NOTE(review): the splitwords argument is accepted but not forwarded; the
# base class is always initialised with splitwords=True -- confirm intended.
# pagenum and hyphenation are stored but not read anywhere in this class.
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True):
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
self.pagenum = pagenum
self.cluster_margin = cluster_margin
self.hyphenation = hyphenation
return
# Writes out the text of the finished page.
def end_page(self, page):
# Imported here rather than at module level.
from cluster import cluster_pageobjs
# The base end_page() result replaces the page argument; its objs attribute
# is what gets scanned for TextItem instances below.
page = PDFConverter.end_page(self, page)
# A truthy cluster_margin selects clustered output; otherwise the TextItems
# are written in page.objs order (else branch below).
if self.cluster_margin:
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
# idx maps each text item to its position in textobjs.
idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) )
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
# Clusters are ordered by the position of their first listed member.
# NOTE(review): the tuple-unpacking lambda is Python 2 only syntax, and
# objs[0] is simply the first element of the cluster tuple, which is not
# necessarily the member with the smallest idx -- confirm intended.
clusters.sort(key=lambda (_,objs): idx[objs[0]])
for (_,objs) in clusters:
# Members of a cluster are written in their textobjs order.
for item in sorted(objs, key=lambda obj:idx[obj]):
text = item.text
# Characters the codec cannot represent are substituted ('replace').
self.outfp.write(text.encode(self.codec, 'replace'))
# NOTE(review): whether this newline follows each item or each cluster
# cannot be determined from this view -- check the original indentation.
self.outfp.write('\n')
else:
for item in page.objs:
if isinstance(item, TextItem):
self.outfp.write(item.text.encode(self.codec, 'replace'))
self.outfp.write('\n')
return
# Nothing to do on close.
def close(self):
return
## TagExtractor
##
class TagExtractor(PDFDevice):
@ -142,6 +181,7 @@ class TagExtractor(PDFDevice):
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
(self.pageno, bbox, page.rotate))
return
def end_page(self, page):
self.outfp.write('</page>\n')
self.pageno += 1
@ -190,7 +230,7 @@ def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t html|sgml|tag] [-o output] file ...' % argv[0]
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
@ -203,10 +243,10 @@ def main(argv):
codec = 'ascii'
pagenos = set()
maxpages = 0
outtype = 'html'
outtype = 'text'
password = ''
splitwords = False
outfp = stdout
outfp = sys.stdout
for (k, v) in opts:
if k == '-d': debug += 1
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
@ -231,8 +271,10 @@ def main(argv):
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
elif outtype == 'html':
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
elif outtype == 'text':
device = TextConverter(rsrc, outfp, codec=codec)
elif outtype == 'tag':
device = TagExtractor(rsrc, outfp, codec=codec, splitwords=splitwords)
device = TagExtractor(rsrc, outfp, codec=codec)
else:
return usage()
for fname in args:

View File

@ -80,7 +80,7 @@ class FigureItem(PageItem):
##
class TextItem(object):
def __init__(self, matrix, font, fontsize, charspace, scaling, text):
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
self.matrix = matrix
self.font = font
(_,_,_,_,tx,ty) = self.matrix
@ -96,7 +96,7 @@ class TextItem(object):
w = 0
dx = 0
prev = ' '
for (char,cid,t) in text:
for (char,cid,t) in chars:
if char:
if prev != ' ' and spwidth < dx:
self.text += ' '
@ -118,13 +118,13 @@ class TextItem(object):
self.direction = 2
disp = 0
h = 0
for (char,cid,disp) in text:
for (char,cid,disp) in chars:
if not char: continue
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
break
for (char,cid,_) in text:
for (char,cid,_) in chars[1:]:
if not char: continue
self.text += font.to_unicode(cid)
h += (font.char_width(cid) * fontsize + charspace) * scaling
@ -155,16 +155,18 @@ class PDFPageAggregator(PDFDevice):
def begin_page(self, page):
self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)
return
def end_page(self, _):
assert not self.stack
assert isinstance(self.cur_item, PageItem)
self.pageno += 1
return
return self.cur_item
def begin_figure(self, name, bbox):
self.stack.append(self.cur_item)
self.cur_item = FigureItem(name, bbox)
return
def end_figure(self, _):
fig = self.cur_item
self.cur_item = self.stack.pop()