git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@78 1aa58f4a-7d42-0410-adbc-911cccaed67c
parent
b366ad270f
commit
c5991f74ff
18
README.html
18
README.html
|
@ -17,7 +17,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Wed Mar 25 08:10:02 JST 2009
|
||||
Last Modified: Wed Mar 25 20:59:18 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -76,6 +76,22 @@ http://pdf2html.tabesugi.net:8080/
|
|||
<a name="install"></a>
|
||||
<hr noshade>
|
||||
<h2>How to Install</h2>
|
||||
<p>
|
||||
<strong>Note:</strong>
|
||||
This software does not yet work out of the box.
|
||||
You have to download and unpack it manually,
|
||||
and spend some time making it work.
|
||||
<strong>Your will</strong> is needed!
|
||||
I do not support easy_install or setup.py or any automated installation until
|
||||
this is very polished to the point that it really should be widely distributed.
|
||||
(For now, it's not yet up to the standard, IMO.)
|
||||
Until then, I don't want to help scatter this
|
||||
poorly tested/documented/supported piece of software in your system.
|
||||
That said,
|
||||
I won't stop anyone from repackaging or redistributing this as a more
|
||||
sophisticated package, in which case, it's their responsibility
|
||||
to properly test, document and maintain the derived packages.
|
||||
|
||||
<ol>
|
||||
<li> Install <a href="http://www.python.org/download/">Python</a> 2.5 or newer.
|
||||
<li> Download the <a href="#source">PDFMiner source</a>.
|
||||
|
|
6
TODO
6
TODO
|
@ -1,7 +1,5 @@
|
|||
TODOs:
|
||||
- Better API Documentation.
|
||||
- Error handling for invalid type.
|
||||
- Infer text stream by clustering.
|
||||
|
||||
- Robust error handling.
|
||||
- Any special handling for linearized PDFs?
|
||||
- Support writing/creating PDFs.
|
||||
- Any special treatments for linearized PDFs?
|
||||
|
|
|
@ -0,0 +1,116 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
|
||||
|
||||
## binary search
|
||||
##
|
||||
def bsearch(objs, v0, v1):
    """Return the objects whose key lies in the closed range [v0, v1].

    objs is a list of (key, obj) pairs that must already be sorted by key
    in ascending order.  The result is the list of the `obj` halves, in
    list order, for every pair with v0 <= key <= v1; an empty list is
    returned when no key falls inside the range.

    v0 must not be greater than v1 (checked with an assertion).
    """
    assert v0 <= v1
    i0 = 0
    i1 = len(objs) - 1
    while i0 <= i1:
        # Floor division: the midpoint must stay an integer index even
        # where "/" means true division.
        i = (i0 + i1) // 2
        (v, _) = objs[i]
        if v < v0:
            i0 = i + 1
        elif v1 < v:
            i1 = i - 1
        else:
            # objs[i] is inside the range.  Since the list is sorted, all
            # other hits are contiguous with it: widen to both sides.
            i0 = i
            while 0 < i0 and objs[i0 - 1][0] >= v0:
                i0 -= 1
            i1 = i
            while i1 < len(objs) - 1 and objs[i1 + 1][0] <= v1:
                i1 += 1
            return [obj for (_, obj) in objs[i0:i1 + 1]]
    return []
|
||||
|
||||
|
||||
## Plane
|
||||
##
|
||||
class Plane(object):
    """Index of objects by the edge coordinates of their bounding boxes.

    Usage: call add() for every object, then finish() once, then find().
    find() relies on the edge lists being sorted, which finish() does.
    """

    def __init__(self):
        # Lists of (coordinate, obj) pairs; every object contributes two
        # entries to each list (one per edge).
        self.xobjs = []
        self.yobjs = []

    def add(self, bbox, obj):
        """Register obj under the four edges of bbox = (x0, y0, x1, y1)."""
        # Unpack in the body: tuple parameters in the signature are not
        # valid syntax on Python 3.
        (x0, y0, x1, y1) = bbox
        self.xobjs.append((x0, obj))
        self.xobjs.append((x1, obj))
        self.yobjs.append((y0, obj))
        self.yobjs.append((y1, obj))

    def finish(self):
        """Sort the edge lists so that find() can binary-search them."""
        # Sort on the coordinate only.  Comparing whole (coord, obj) pairs
        # would fall back to comparing the objects themselves whenever two
        # edges share a coordinate, which fails for unorderable objects.
        self.xobjs.sort(key=lambda entry: entry[0])
        self.yobjs.sort(key=lambda entry: entry[0])

    def find(self, bbox):
        """Return the set of objects with an edge inside bbox on both axes.

        bbox is (x0, y0, x1, y1) with x0 <= x1 and y0 <= y1.  An object is
        reported when at least one of its x edges lies in [x0, x1] AND at
        least one of its y edges lies in [y0, y1].

        NOTE: an object that overlaps the query box but whose edges all lie
        outside it (i.e. it spans the whole box on one axis) is not found.
        """
        (x0, y0, x1, y1) = bbox
        xobjs = set(bsearch(self.xobjs, x0, x1))
        yobjs = set(bsearch(self.yobjs, y0, y1))
        return xobjs.intersection(yobjs)
|
||||
|
||||
|
||||
## Clusters
|
||||
##
|
||||
class Clusters(object):
    """Grouping of hashable objects into disjoint clusters.

    self.clusters maps every known object to the tuple of all members of
    its cluster; all members of one cluster share the same tuple object.
    """

    def __init__(self):
        self.clusters = {}

    def add(self, obj):
        """Register obj as a cluster of its own unless it is already known.

        An object that already belongs to a cluster is left alone;
        overwriting its entry would leave it listed in the old cluster
        tuple as well, so it would show up in two clusters.
        """
        if obj not in self.clusters:
            self.clusters[obj] = (obj,)

    def merge(self, objs):
        """Join objs, and every cluster any of them belongs to, into one."""
        allobjs = set(objs)
        # Iterate over a snapshot: allobjs grows inside the loop, and objs
        # itself may be a one-shot iterable that set() already consumed.
        for obj in tuple(allobjs):
            if obj in self.clusters:
                allobjs.update(self.clusters[obj])
        c = tuple(allobjs)
        for obj in allobjs:
            self.clusters[obj] = c

    def finish(self):
        """Return the set of distinct clusters, each a tuple of objects."""
        # values() rather than itervalues(): available on Python 2 and 3.
        return set(self.clusters.values())
|
||||
|
||||
|
||||
def cluster_pageobjs(objs, ratio):
    """Group page objects that lie close to each other.

    Each object must provide `bbox` (a 4-tuple x0, y0, x1, y1) and
    `fontsize`.  For every object its bounding box is grown on all sides
    by abs(fontsize * ratio), and every object that Plane.find() reports
    for the grown box is put into the same cluster.

    Returns a list of (bbox, members) pairs, where members is a tuple of
    the clustered objects and bbox is the box enclosing all of them.
    """
    objs = list(objs)  # iterated twice below
    plane = Plane()
    for obj in objs:
        plane.add(obj.bbox, obj)
    plane.finish()
    clusters = Clusters()
    for obj in objs:
        (bx0, by0, bx1, by1) = obj.bbox
        margin = abs(obj.fontsize * ratio)
        # The bbox corners may come in either order; normalise them.
        x0 = min(bx0, bx1)
        y0 = min(by0, by1)
        x1 = max(bx0, bx1)
        y1 = max(by0, by1)
        found = plane.find((x0 - margin, y0 - margin, x1 + margin, y1 + margin))
        # Always merge, even when only one object was found.  Margins depend
        # on each object's own font size, so an object may already have been
        # pulled into a cluster by a neighbour with a larger margin; merging
        # keeps that membership instead of resetting it to a singleton.
        clusters.merge(found)
    r = []
    for members in clusters.finish():
        xs = []
        ys = []
        for obj in members:
            (x0, y0, x1, y1) = obj.bbox
            xs.extend((x0, x1))
            ys.extend((y0, y1))
        # Enclosing box over all corners, so reversed bboxes are handled
        # the same way as in the search above.
        r.append(((min(xs), min(ys), max(xs), max(ys)), members))
    return r
|
|
@ -1,7 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
||||
from pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
|
||||
|
@ -18,8 +16,8 @@ def encprops(props, codec):
|
|||
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
|
||||
|
||||
|
||||
## TextConverter
|
||||
class TextConverter(PDFPageAggregator):
|
||||
## PDFConverter
|
||||
class PDFConverter(PDFPageAggregator):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='ascii', splitwords=False):
|
||||
PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords)
|
||||
|
@ -30,11 +28,10 @@ class TextConverter(PDFPageAggregator):
|
|||
|
||||
## SGMLConverter
|
||||
##
|
||||
class SGMLConverter(TextConverter):
|
||||
class SGMLConverter(PDFConverter):
|
||||
|
||||
def end_page(self, page):
|
||||
TextConverter.end_page(self, page)
|
||||
page = self.cur_item
|
||||
page = PDFConverter.end_page(self, page)
|
||||
def f(item):
|
||||
bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox
|
||||
if isinstance(item, FigureItem):
|
||||
|
@ -58,21 +55,22 @@ class SGMLConverter(TextConverter):
|
|||
|
||||
## HTMLConverter
|
||||
##
|
||||
class HTMLConverter(TextConverter):
|
||||
class HTMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, splitwords=False):
|
||||
TextConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
self.pagenum = pagenum
|
||||
self.pagepad = pagepad
|
||||
self.scale = scale
|
||||
self.outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
|
||||
self.outfp.write('</head><body>\n')
|
||||
self.yoffset = self.pagepad
|
||||
self.cluster_margin = cluster_margin
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
TextConverter.end_page(self, page)
|
||||
page = self.cur_item
|
||||
from cluster import cluster_pageobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
def f(item):
|
||||
if isinstance(item, FigureItem):
|
||||
pass
|
||||
|
@ -96,6 +94,12 @@ class HTMLConverter(TextConverter):
|
|||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
for child in page.objs:
|
||||
f(child)
|
||||
if self.cluster_margin:
|
||||
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
|
||||
for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin):
|
||||
self.outfp.write('<span style="position:absolute; border: 1px solid red; '
|
||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
||||
self.yoffset += self.pagepad
|
||||
return
|
||||
|
||||
|
@ -106,6 +110,41 @@ class HTMLConverter(TextConverter):
|
|||
return
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
class TextConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True):
|
||||
PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True)
|
||||
self.pagenum = pagenum
|
||||
self.cluster_margin = cluster_margin
|
||||
self.hyphenation = hyphenation
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
from cluster import cluster_pageobjs
|
||||
page = PDFConverter.end_page(self, page)
|
||||
if self.cluster_margin:
|
||||
textobjs = [ item for item in page.objs if isinstance(item, TextItem) ]
|
||||
idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) )
|
||||
clusters = cluster_pageobjs(textobjs, self.cluster_margin)
|
||||
clusters.sort(key=lambda (_,objs): idx[objs[0]])
|
||||
for (_,objs) in clusters:
|
||||
for item in sorted(objs, key=lambda obj:idx[obj]):
|
||||
text = item.text
|
||||
self.outfp.write(text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
else:
|
||||
for item in page.objs:
|
||||
if isinstance(item, TextItem):
|
||||
self.outfp.write(item.text.encode(self.codec, 'replace'))
|
||||
self.outfp.write('\n')
|
||||
return
|
||||
|
||||
def close(self):
|
||||
return
|
||||
|
||||
|
||||
## TagExtractor
|
||||
##
|
||||
class TagExtractor(PDFDevice):
|
||||
|
@ -142,6 +181,7 @@ class TagExtractor(PDFDevice):
|
|||
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
||||
(self.pageno, bbox, page.rotate))
|
||||
return
|
||||
|
||||
def end_page(self, page):
|
||||
self.outfp.write('</page>\n')
|
||||
self.pageno += 1
|
||||
|
@ -190,7 +230,7 @@ def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''):
|
|||
def main(argv):
|
||||
import getopt
|
||||
def usage():
|
||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t html|sgml|tag] [-o output] file ...' % argv[0]
|
||||
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' % argv[0]
|
||||
return 100
|
||||
try:
|
||||
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w')
|
||||
|
@ -203,10 +243,10 @@ def main(argv):
|
|||
codec = 'ascii'
|
||||
pagenos = set()
|
||||
maxpages = 0
|
||||
outtype = 'html'
|
||||
outtype = 'text'
|
||||
password = ''
|
||||
splitwords = False
|
||||
outfp = stdout
|
||||
outfp = sys.stdout
|
||||
for (k, v) in opts:
|
||||
if k == '-d': debug += 1
|
||||
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
||||
|
@ -231,8 +271,10 @@ def main(argv):
|
|||
device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
elif outtype == 'text':
|
||||
device = TextConverter(rsrc, outfp, codec=codec)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrc, outfp, codec=codec, splitwords=splitwords)
|
||||
device = TagExtractor(rsrc, outfp, codec=codec)
|
||||
else:
|
||||
return usage()
|
||||
for fname in args:
|
||||
|
|
|
@ -80,7 +80,7 @@ class FigureItem(PageItem):
|
|||
##
|
||||
class TextItem(object):
|
||||
|
||||
def __init__(self, matrix, font, fontsize, charspace, scaling, text):
|
||||
def __init__(self, matrix, font, fontsize, charspace, scaling, chars):
|
||||
self.matrix = matrix
|
||||
self.font = font
|
||||
(_,_,_,_,tx,ty) = self.matrix
|
||||
|
@ -96,7 +96,7 @@ class TextItem(object):
|
|||
w = 0
|
||||
dx = 0
|
||||
prev = ' '
|
||||
for (char,cid,t) in text:
|
||||
for (char,cid,t) in chars:
|
||||
if char:
|
||||
if prev != ' ' and spwidth < dx:
|
||||
self.text += ' '
|
||||
|
@ -118,13 +118,13 @@ class TextItem(object):
|
|||
self.direction = 2
|
||||
disp = 0
|
||||
h = 0
|
||||
for (char,cid,disp) in text:
|
||||
for (char,cid,disp) in chars:
|
||||
if not char: continue
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||
self.text += font.to_unicode(cid)
|
||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
break
|
||||
for (char,cid,_) in text:
|
||||
for (char,cid,_) in chars[1:]:
|
||||
if not char: continue
|
||||
self.text += font.to_unicode(cid)
|
||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
|
@ -155,16 +155,18 @@ class PDFPageAggregator(PDFDevice):
|
|||
def begin_page(self, page):
|
||||
self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate)
|
||||
return
|
||||
|
||||
def end_page(self, _):
|
||||
assert not self.stack
|
||||
assert isinstance(self.cur_item, PageItem)
|
||||
self.pageno += 1
|
||||
return
|
||||
return self.cur_item
|
||||
|
||||
def begin_figure(self, name, bbox):
|
||||
self.stack.append(self.cur_item)
|
||||
self.cur_item = FigureItem(name, bbox)
|
||||
return
|
||||
|
||||
def end_figure(self, _):
|
||||
fig = self.cur_item
|
||||
self.cur_item = self.stack.pop()
|
||||
|
|
Loading…
Reference in New Issue