From c5991f74ff23a20a33e6652f6c3ced2b2f27d97e Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 28 Mar 2009 17:23:53 +0000 Subject: [PATCH] git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@78 1aa58f4a-7d42-0410-adbc-911cccaed67c --- README.html | 18 ++++++- TODO | 6 +-- pdflib/cluster.py | 116 ++++++++++++++++++++++++++++++++++++++++++++ pdflib/pdf2txt.py | 74 ++++++++++++++++++++++------ pdflib/pdfdevice.py | 12 +++-- 5 files changed, 200 insertions(+), 26 deletions(-) create mode 100644 pdflib/cluster.py diff --git a/README.html b/README.html index e78c81e..035c865 100644 --- a/README.html +++ b/README.html @@ -17,7 +17,7 @@ Python PDF parser and analyzer
-Last Modified: Wed Mar 25 08:10:02 JST 2009 +Last Modified: Wed Mar 25 20:59:18 JST 2009
@@ -76,6 +76,22 @@ http://pdf2html.tabesugi.net:8080/

How to Install

+

+Note: +This software does not yet work out of the box. +You have to download and unpack it manually, +and spend some time making it work. +Some determination on your part is needed! +I will not support easy_install, setup.py, or any automated installation until +this is polished to the point that it really should be widely distributed. +(For now, it is not yet up to that standard, IMO.) +Until then, I don't want to help scatter this +poorly tested/documented/supported piece of software across your system. +That said, +I won't stop anyone from repackaging or redistributing this as a more +sophisticated package, in which case it is their responsibility +to properly test, document and maintain the derived packages. +

  1. Install Python 2.5 or newer.
  2. Download the PDFMiner source. diff --git a/TODO b/TODO index 35585e9..36b9aeb 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,5 @@ TODOs: - Better API Documentation. - - Error handling for invalid type. - - Infer text stream by clustering. - + - Robust error handling. + - Any special handling for linearized PDFs? - Support writing/creating PDFs. - - Any special treatments for linearized PDFs? diff --git a/pdflib/cluster.py b/pdflib/cluster.py new file mode 100644 index 0000000..2b49704 --- /dev/null +++ b/pdflib/cluster.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +import sys + + +## binary search +## +def bsearch(objs, v0, v1): + assert v0 <= v1 + i0 = 0 + i1 = len(objs)-1 + while i0 <= i1: + i = (i0+i1)/2 + assert 0 <= i and i < len(objs) + (v, obj) = objs[i] + if v < v0: + i0 = i+1 + elif v1 < v: + i1 = i-1 + else: + i0 = i + while 0 < i0: + (v,_) = objs[i0-1] + if v < v0: break + i0 -= 1 + i1 = i + while i1 < len(objs)-1: + (v,_) = objs[i1+1] + if v1 < v: break + i1 += 1 + return [ obj for (_,obj) in objs[i0:i1+1] ] + return [] + + +## Plane +## +class Plane(object): + + def __init__(self): + self.xobjs = [] + self.yobjs = [] + return + + def add(self, (x0,y0,x1,y1), obj): + self.xobjs.append((x0, obj)) + self.xobjs.append((x1, obj)) + self.yobjs.append((y0, obj)) + self.yobjs.append((y1, obj)) + return + + def finish(self): + self.xobjs.sort() + self.yobjs.sort() + return + + def find(self, (x0,y0,x1,y1)): + xobjs = set(bsearch(self.xobjs, x0, x1)) + yobjs = set(bsearch(self.yobjs, y0, y1)) + objs = xobjs.intersection(yobjs) + return objs + + +## Clusters +## +class Clusters(object): + + def __init__(self): + self.clusters = {} + return + + def add(self, obj): + self.clusters[obj] = (obj,) + return + + def merge(self, objs): + allobjs = set(objs) + for obj in objs: + if obj in self.clusters: + allobjs.update(self.clusters[obj]) + c = tuple(allobjs) + for obj in allobjs: + self.clusters[obj] = c + return + + def finish(self): + return 
set(self.clusters.itervalues()) + + +def cluster_pageobjs(objs, ratio): + plane = Plane() + for obj in objs: + plane.add(obj.bbox, obj) + plane.finish() + clusters = Clusters() + for obj in objs: + (bx0,by0,bx1,by1) = obj.bbox + margin = abs(obj.fontsize * ratio) + x0 = min(bx0,bx1) + y0 = min(by0,by1) + x1 = max(bx0,bx1) + y1 = max(by0,by1) + found = plane.find((x0-margin, y0-margin, x1+margin, y1+margin)) + if len(found) == 1: + clusters.add(found.pop()) + else: + clusters.merge(found) + r = [] + for objs in clusters.finish(): + (bx0,by0,bx1,by1) = objs[0].bbox + for obj in objs[1:]: + (x0,y0,x1,y1) = obj.bbox + bx0 = min(bx0, x0) + bx1 = max(bx1, x1) + by0 = min(by0, y0) + by1 = max(by1, y1) + r.append(((bx0,by0,bx1,by1), objs)) + return r diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 3a2ed4e..dd8cb81 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -1,7 +1,5 @@ #!/usr/bin/env python import sys -stdout = sys.stdout -stderr = sys.stderr from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect from pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator @@ -18,8 +16,8 @@ def encprops(props, codec): return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) ) -## TextConverter -class TextConverter(PDFPageAggregator): +## PDFConverter +class PDFConverter(PDFPageAggregator): def __init__(self, rsrc, outfp, codec='ascii', splitwords=False): PDFPageAggregator.__init__(self, rsrc, splitwords=splitwords) @@ -30,11 +28,10 @@ class TextConverter(PDFPageAggregator): ## SGMLConverter ## -class SGMLConverter(TextConverter): +class SGMLConverter(PDFConverter): def end_page(self, page): - TextConverter.end_page(self, page) - page = self.cur_item + page = PDFConverter.end_page(self, page) def f(item): bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox if isinstance(item, FigureItem): @@ -58,21 +55,22 @@ class SGMLConverter(TextConverter): ## 
HTMLConverter ## -class HTMLConverter(TextConverter): +class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, splitwords=False): - TextConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords) + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, cluster_margin=0.5, splitwords=False): + PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=splitwords) self.pagenum = pagenum self.pagepad = pagepad self.scale = scale self.outfp.write('\n' % self.codec) self.outfp.write('\n') self.yoffset = self.pagepad + self.cluster_margin = cluster_margin return def end_page(self, page): - TextConverter.end_page(self, page) - page = self.cur_item + from cluster import cluster_pageobjs + page = PDFConverter.end_page(self, page) def f(item): if isinstance(item, FigureItem): pass @@ -96,6 +94,12 @@ class HTMLConverter(TextConverter): (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) for child in page.objs: f(child) + if self.cluster_margin: + textobjs = [ item for item in page.objs if isinstance(item, TextItem) ] + for ((x0,y0,x1,y1),objs) in cluster_pageobjs(textobjs, self.cluster_margin): + self.outfp.write('\n' % + (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) self.yoffset += self.pagepad return @@ -106,6 +110,41 @@ class HTMLConverter(TextConverter): return +## TextConverter +## +class TextConverter(PDFConverter): + + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, cluster_margin=0.2, splitwords=False, hyphenation=True): + PDFConverter.__init__(self, rsrc, outfp, codec=codec, splitwords=True) + self.pagenum = pagenum + self.cluster_margin = cluster_margin + self.hyphenation = hyphenation + return + + def end_page(self, page): + from cluster import cluster_pageobjs + page = PDFConverter.end_page(self, page) + if self.cluster_margin: + textobjs = [ item for item in page.objs 
if isinstance(item, TextItem) ] + idx = dict( (obj,i) for (i,obj) in enumerate(textobjs) ) + clusters = cluster_pageobjs(textobjs, self.cluster_margin) + clusters.sort(key=lambda (_,objs): idx[objs[0]]) + for (_,objs) in clusters: + for item in sorted(objs, key=lambda obj:idx[obj]): + text = item.text + self.outfp.write(text.encode(self.codec, 'replace')) + self.outfp.write('\n') + else: + for item in page.objs: + if isinstance(item, TextItem): + self.outfp.write(item.text.encode(self.codec, 'replace')) + self.outfp.write('\n') + return + + def close(self): + return + + ## TagExtractor ## class TagExtractor(PDFDevice): @@ -142,6 +181,7 @@ class TagExtractor(PDFDevice): self.outfp.write('' % (self.pageno, bbox, page.rotate)) return + def end_page(self, page): self.outfp.write('\n') self.pageno += 1 @@ -190,7 +230,7 @@ def convert(rsrc, device, fname, pagenos=None, maxpages=0, password=''): def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t html|sgml|tag] [-o output] file ...' % argv[0] + print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-w] [-t text|html|sgml|tag] [-o output] file ...' 
% argv[0] return 100 try: (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:w') @@ -203,10 +243,10 @@ def main(argv): codec = 'ascii' pagenos = set() maxpages = 0 - outtype = 'html' + outtype = 'text' password = '' splitwords = False - outfp = stdout + outfp = sys.stdout for (k, v) in opts: if k == '-d': debug += 1 elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') ) @@ -231,8 +271,10 @@ def main(argv): device = SGMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) elif outtype == 'html': device = HTMLConverter(rsrc, outfp, codec=codec, splitwords=splitwords) + elif outtype == 'text': + device = TextConverter(rsrc, outfp, codec=codec) elif outtype == 'tag': - device = TagExtractor(rsrc, outfp, codec=codec, splitwords=splitwords) + device = TagExtractor(rsrc, outfp, codec=codec) else: return usage() for fname in args: diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index 2bce6bf..d450486 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -80,7 +80,7 @@ class FigureItem(PageItem): ## class TextItem(object): - def __init__(self, matrix, font, fontsize, charspace, scaling, text): + def __init__(self, matrix, font, fontsize, charspace, scaling, chars): self.matrix = matrix self.font = font (_,_,_,_,tx,ty) = self.matrix @@ -96,7 +96,7 @@ class TextItem(object): w = 0 dx = 0 prev = ' ' - for (char,cid,t) in text: + for (char,cid,t) in chars: if char: if prev != ' ' and spwidth < dx: self.text += ' ' @@ -118,13 +118,13 @@ class TextItem(object): self.direction = 2 disp = 0 h = 0 - for (char,cid,disp) in text: + for (char,cid,disp) in chars: if not char: continue (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) self.text += font.to_unicode(cid) h += (font.char_width(cid) * fontsize + charspace) * scaling break - for (char,cid,_) in text: + for (char,cid,_) in chars[1:]: if not char: continue self.text += font.to_unicode(cid) h += (font.char_width(cid) * fontsize + charspace) * scaling @@ -155,16 +155,18 
@@ class PDFPageAggregator(PDFDevice): def begin_page(self, page): self.cur_item = PageItem(self.pageno, page.mediabox, page.rotate) return + def end_page(self, _): assert not self.stack assert isinstance(self.cur_item, PageItem) self.pageno += 1 - return + return self.cur_item def begin_figure(self, name, bbox): self.stack.append(self.cur_item) self.cur_item = FigureItem(name, bbox) return + def end_figure(self, _): fig = self.cur_item self.cur_item = self.stack.pop()