From 787ae4f81418953f04b1a2f03d744b6244c754ed Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 11 Jul 2009 12:42:12 +0000 Subject: [PATCH] documentation fix git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@117 1aa58f4a-7d42-0410-adbc-911cccaed67c --- README.html | 52 +++++++++----- pdfminer/converter.py | 31 +++++---- pdfminer/layout.py | 156 ++++++++++++++++++++---------------------- setup.py | 3 +- tools/pdf2txt.py | 25 ++++--- 5 files changed, 146 insertions(+), 121 deletions(-) diff --git a/README.html b/README.html index 53ad545..da23724 100644 --- a/README.html +++ b/README.html @@ -18,7 +18,7 @@ Python PDF parser and analyzer
-Last Modified: Sat May 23 10:06:04 JST 2009 +Last Modified: Sat Jun 20 19:51:02 JST 2009
@@ -89,18 +89,13 @@ http://pdf2html.tabesugi.net:8080/
  • Do the following test:
     $ pdf2txt.py samples/simple1.pdf
    -<html><head>
    -<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    -</head><body>
    -<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
    -<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
    -<span style="position:absolute; writing-mode:lr-tb; left:158px; top:224px; font-size:22px;"> World </span>
    -<span style="position:absolute; writing-mode:lr-tb; left:100px; top:124px; font-size:22px;"> </span>
    -<span style="position:absolute; writing-mode:lr-tb; left:100px; top:224px; font-size:22px;"> Hello </span>
    -<span style="position:absolute; writing-mode:lr-tb; left:368px; top:124px; font-size:22px;">World </span>
    -<span style="position:absolute; writing-mode:lr-tb; left:206px; top:124px; font-size:22px;">Hello </span>
    -<div style="position:absolute; top:0px;">Page: <a href="#1">1</a></div>
    -</body></html>
    +
    +
    +Hello
    +
    +World
    +
    + Hello  World
     
  • Done! @@ -160,13 +155,13 @@ For non-ASCII languages, you can specify the output encoding

    Examples:

    -$ pdf2txt.py samples/naacl06-shinyama.pdf > output.html
    +$ pdf2txt.py samples/naacl06-shinyama.pdf -o output.html
     (extract text as an HTML file whose filename is output.html)
     
    -$ pdf2txt.py -c euc-jp samples/jo.pdf > output.html
    +$ pdf2txt.py -c euc-jp samples/jo.pdf -o output.html
     (extract a Japanese HTML file in vertical writing; CMap is required)
     
    -$ pdf2txt.py -P mypassword -t text secret.pdf > output.txt
    +$ pdf2txt.py -P mypassword secret.pdf -o output.txt
     (extract text from an encrypted PDF file)
     
    @@ -175,7 +170,7 @@ Options:
    -o filename
    Specifies the output file name. -By default, it prints the extracted contents to stdout. +By default, it prints the extracted contents to stdout in text format.

    -p pageno[,pageno,...]
    Specifies the comma-separated list of the page numbers to be extracted. @@ -196,9 +191,24 @@ HTML-like tags. pdf2txt tries to extract its content streams rather than inferri Tags used here are defined in the PDF specification (See §10.7 "Tagged PDF").

    +

    -T cluster_margin +
    +

    +

    -W word_margin +
    +

    +

    -s scale +
    +

    +

    -m maxpages +
    +

    -P password
    Provides the user password to open the PDF file.

    +

    -C CMap directory +
    +

    -d
    Increases the debug level.
    @@ -231,7 +241,10 @@ Options:
    Instructs the tool to dump all the objects. By default, it only prints the document trailer (like a header).

    -

    -p pageno +
    -i objno,objno, ... +
    +

    +

    -p pageno,pageno, ...
    Specifies the page number to be extracted. Multiple -p options are allowed. Note that page numbers start from one. @@ -253,6 +266,9 @@ no stream header is displayed for the ease of saving it to a file.
    -P password
    Provides the user password to open the PDF file.

    +

    -T +
    +

    -d
    Increases the debug level. diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 666c10d..9cf86d5 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -10,9 +10,10 @@ from pdfminer.utils import mult_matrix, translate_matrix, apply_matrix_pt, enc ## class PDFPageAggregator(PDFDevice): - def __init__(self, rsrc, pageno=1, cluster_margin=None): + def __init__(self, rsrc, pageno=1, char_margin=None, line_margin=None): PDFDevice.__init__(self, rsrc) - self.cluster_margin = cluster_margin + self.char_margin = char_margin + self.line_margin = line_margin self.undefined_char = '?' self.pageno = pageno self.stack = [] @@ -27,8 +28,8 @@ class PDFPageAggregator(PDFDevice): assert isinstance(self.cur_item, LTPage) self.cur_item.fixate() self.pageno += 1 - if self.cluster_margin: - self.cur_item.group_text(self.cluster_margin) + if self.char_margin != None and self.line_margin != None: + self.cur_item.group_text(self.char_margin, self.line_margin) return self.cur_item def begin_figure(self, name, bbox, matrix): @@ -115,8 +116,10 @@ class PDFPageAggregator(PDFDevice): ## class PDFConverter(PDFPageAggregator): - def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8'): - PDFPageAggregator.__init__(self, rsrc, pageno=pageno, cluster_margin=cluster_margin) + def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, + char_margin=None, line_margin=None, word_margin=None): + PDFPageAggregator.__init__(self, rsrc, pageno=pageno, + char_margin=char_margin, line_margin=line_margin) self.outfp = outfp self.codec = codec self.word_margin = word_margin @@ -234,9 +237,11 @@ class SGMLConverter(PDFConverter): ## class HTMLConverter(PDFConverter): - def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, word_margin=None, codec='utf-8', + def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, + char_margin=None, line_margin=None, word_margin=None, scale=1, showpageno=True, pagepad=50): - PDFConverter.__init__(self, 
rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec) + PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, + char_margin=char_margin, line_margin=line_margin, word_margin=word_margin) self.showpageno = showpageno self.pagepad = pagepad self.scale = scale @@ -277,7 +282,7 @@ class HTMLConverter(PDFConverter): if self.debug: self.write_rect('red', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LTAnon): - self.write(item.text) + pass elif isinstance(item, LTLine) or isinstance(item, LTRect): self.write_rect('black', 1, item.x0, self.yoffset-item.y1, item.width, item.height) elif isinstance(item, LTTextBox): @@ -302,9 +307,11 @@ class HTMLConverter(PDFConverter): ## class TextConverter(PDFConverter): - def __init__(self, rsrc, outfp, pageno=1, cluster_margin=None, codec='utf-8', - showpageno=False, word_margin=None): - PDFConverter.__init__(self, rsrc, outfp, pageno=pageno, cluster_margin=cluster_margin, word_margin=word_margin, codec=codec) + def __init__(self, rsrc, outfp, codec='utf-8', pageno=1, + char_margin=None, line_margin=None, word_margin=None, + showpageno=False): + PDFConverter.__init__(self, rsrc, outfp, codec=codec, pageno=pageno, + char_margin=char_margin, line_margin=line_margin, word_margin=word_margin) self.showpageno = showpageno return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index cb2598c..7e575cf 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -21,31 +21,20 @@ def pick(seq, func, maxobj=None): ## It performs binary search so that the processing time ## should be around O(log n). 
## -def bsearch(objs, v0, v1): - if v1 <= v0: return [] +def bsearch(objs, v0): i0 = 0 - i1 = len(objs)-1 - while i0 <= i1: + i1 = len(objs) + while i0 < i1: i = (i0+i1)/2 - assert 0 <= i and i < len(objs) (v, obj) = objs[i] - if v < v0: - i0 = i+1 - elif v1 < v: - i1 = i-1 - else: - i0 = i - while 0 < i0: - (v,_) = objs[i0-1] - if v < v0: break - i0 -= 1 + if v0 == v: + (i0,i1) = (i,i+1) + break + elif v0 < v: i1 = i - while i1 < len(objs)-1: - (v,_) = objs[i1+1] - if v1 < v: break - i1 += 1 - return [ obj for (_,obj) in objs[i0:i1+1] ] - return [] + else: + i0 = i+1 + return (i0,i1) ## reorder_hv, reorder_vh @@ -63,10 +52,12 @@ def reorder_vh(objs, hdir): r = [] line = [] for obj in sorted(objs, key=vkey): - if line and not line[-1].voverlap(obj): - line.sort(key=hkey) - r.append(line) - line = [] + if line: + v = line[-1].voverlap(obj) * 2 + if v < obj.height or v < line[-1].height: + line.sort(key=hkey) + r.append(line) + line = [] line.append(obj) line.sort(key=hkey) r.append(line) @@ -106,7 +97,8 @@ class Plane(object): self.yobjs = [] for obj in objs: self.place(obj) - self.fixate() + self.xobjs.sort() + self.yobjs.sort() return # place(obj): place an object in a certain area. @@ -118,16 +110,14 @@ class Plane(object): self.yobjs.append((obj.y1, obj)) return - # fixate(): you must call this after adding all objects. - def fixate(self): - self.xobjs.sort() - self.yobjs.sort() - return - # find(): finds objects that are in a certain area. 
def find(self, (x0,y0,x1,y1)): - xobjs = set(bsearch(self.xobjs, x0, x1)) - yobjs = set(bsearch(self.yobjs, y0, y1)) + (i0,_) = bsearch(self.xobjs, x0) + (_,i1) = bsearch(self.xobjs, x1) + xobjs = set( obj for (_,obj) in self.xobjs[i0:i1] ) + (i0,_) = bsearch(self.yobjs, y0) + (_,i1) = bsearch(self.yobjs, y1) + yobjs = set( obj for (_,obj) in self.yobjs[i0:i1] ) objs = xobjs.intersection(yobjs) return objs @@ -166,12 +156,14 @@ class ClusterSet(object): group.fixate() return list(r) -def group_objs(objs, ratio, klass): +def group_objs(objs, hratio, vratio, klass): plane = Plane(objs) cset = ClusterSet(klass) for obj in objs: - margin = abs(obj.get_margin(ratio)) - neighbors = plane.find((obj.x0-margin, obj.y0-margin, obj.x1+margin, obj.y1+margin)) + margin = obj.get_margin() + hmargin = hratio * margin + vmargin = vratio * margin + neighbors = plane.find((obj.x0-hmargin, obj.y0-vmargin, obj.x1+hmargin, obj.y1+vmargin)) cset.add(neighbors) return cset.finish() @@ -214,7 +206,7 @@ class LayoutItem(object): def get_bbox(self): return '%.3f,%.3f,%.3f,%.3f' % (self.x0, self.y0, self.x1, self.y1) - def get_margin(self, ratio): + def get_margin(self): return 0 def get_weight(self): @@ -253,7 +245,7 @@ class LayoutContainer(LayoutItem): return # fixate(): determines its boundery and writing direction. 
- def fixate(self): + def fixate(self, direction=None): if not self.width and self.objs: (bx0, by0, bx1, by1) = (INF, INF, -INF, -INF) for obj in self.objs: @@ -354,8 +346,8 @@ class LTText(LayoutItem): '(%.1f, %.1f)' % self.adv, self.text)) - def get_margin(self, ratio): - return self.fontsize * ratio + def get_margin(self): + return abs(self.fontsize) def get_weight(self): return len(self.text) @@ -392,24 +384,25 @@ class LTTextBox(LayoutContainer): def __repr__(self): return ('' % (self.get_bbox(), self.direction)) - def fixate(self): - LayoutContainer.fixate(self) - self.direction = 'H' - for obj in self.objs: - if obj.is_vertical(): - self.direction = 'V' - break - if 2 <= len(self.objs): - objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) - if objs[0].get_weight() == 1 and objs[1].get_weight() == 1: - h = objs[0].voverlap(objs[1]) - v = objs[0].hoverlap(objs[1]) - if h < v: - self.direction = 'V' - if self.direction == 'H': - self.lines = reorder_vh(self.objs, +1) - else: + def fixate(self, direction='H'): + LayoutContainer.fixate(self, direction=direction) + if not direction: + for obj in self.objs: + if obj.is_vertical(): + direction = 'V' + break + if 2 <= len(self.objs): + objs = sorted(self.objs, key=lambda obj: -obj.x1-obj.y1) + if objs[0].get_weight() == 1 and objs[1].get_weight() == 1: + h = objs[0].voverlap(objs[1]) + v = objs[0].hoverlap(objs[1]) + if h < v: + direction = 'V' + self.direction = direction + if self.direction == 'V': self.lines = reorder_hv(self.objs, -1) + else: + self.lines = reorder_vh(self.objs, +1) self.objs = [] for line in self.lines: self.objs.extend(line) @@ -418,31 +411,31 @@ class LTTextBox(LayoutContainer): def get_direction(self): return self.direction - def get_lines(self, ratio): - if self.get_direction() == 'H': - for line in self.lines: - x1 = INF - for obj in line: - if not isinstance(obj, LTText): continue - if ratio: - margin = obj.get_margin(ratio) - if x1 < obj.x0-margin: - yield LTAnon(' ') - yield obj - 
x1 = obj.x1 - yield LTAnon('\n') - else: + def get_lines(self, word_margin): + if self.get_direction() == 'V': for line in self.lines: y0 = -INF for obj in line: if not isinstance(obj, LTText): continue - if ratio: - margin = obj.get_margin(ratio) + if word_margin: + margin = word_margin * obj.get_margin() if obj.y1+margin < y0: yield LTAnon(' ') yield obj y0 = obj.y0 yield LTAnon('\n') + else: + for line in self.lines: + x1 = INF + for obj in line: + if not isinstance(obj, LTText): continue + if word_margin: + margin = word_margin * obj.get_margin() + if x1 < obj.x0-margin: + yield LTAnon(' ') + yield obj + x1 = obj.x1 + yield LTAnon('\n') return @@ -458,17 +451,18 @@ class LTPage(LayoutContainer): def __repr__(self): return ('' % (self.id, self.get_bbox(), self.rotate)) - def fixate(self): + def fixate(self, dirtection='H'): return - def group_text(self, ratio): + def group_text(self, char_margin, line_margin): textobjs = [ obj for obj in self.objs if isinstance(obj, LTText) ] - otherobjs = [ obj for obj in self.objs if not isinstance(obj, LTText) ] - self.objs = group_objs(textobjs, ratio, LTTextBox) + otherobjs - if self.get_direction() == 'H': - lines = reorder_vh(self.objs, +1) + objs = [ obj for obj in self.objs if not isinstance(obj, LTText) ] + if self.get_direction() == 'V': + objs += group_objs(textobjs, line_margin, char_margin, LTTextBox) + lines = reorder_hv(objs, -1) else: - lines = reorder_hv(self.objs, -1) + objs += group_objs(textobjs, char_margin, line_margin, LTTextBox) + lines = reorder_vh(objs, +1) self.objs = [] for line in lines: self.objs.extend(line) diff --git a/setup.py b/setup.py index 3ea37f9..8ab539f 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,9 @@ #!/usr/bin/env python from distutils.core import setup +from pdfminer import __version__ setup(name='pdfminer', - version='20090330', + version=__version__, description='PDF parser and analyzer', license='MIT/X', author='Yusuke Shinyama', diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py 
index c44eb01..6ad95e6 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -10,10 +10,12 @@ from pdfminer.cmap import CMapDB, find_cmap_path def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-T cluster_threshold] [-W word_margin] [-t text|html|sgml|tag] [-o output] file ...' % argv[0] + print ('usage: %s [-d] [-p pagenos] [-P password] [-c codec] ' + '[-M char_margin] [-L line_margin] [-W word_margin] ' + '[-t text|html|sgml|tag] [-o output] file ...' % argv[0]) return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:T:W:t:o:C:D:m:') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:M:L:W:t:o:C:D:m:') except getopt.GetoptError: return usage() if not args: return usage() @@ -29,7 +31,8 @@ def main(argv): outfile = None outtype = None codec = 'utf-8' - cluster_margin = 0.5 + char_margin = 1.0 + line_margin = 0.3 word_margin = 0.2 pageno = 1 scale = 1 @@ -44,7 +47,8 @@ def main(argv): elif k == '-c': codec = v elif k == '-o': outfile = v elif k == '-s': scale = float(v) - elif k == '-T': cluster_margin = float(v) + elif k == '-M': char_margin = float(v) + elif k == '-L': line_margin = float(v) elif k == '-W': word_margin = float(v) # CMapDB.debug = debug @@ -69,12 +73,15 @@ def main(argv): outfp = file(outfile, 'w') else: outfp = sys.stdout - if outtype == 'sgml': - device = SGMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin) + if outtype == 'text': + device = TextConverter(rsrc, outfp, codec=codec, + char_margin=char_margin, line_margin=line_margin, word_margin=word_margin) + elif outtype == 'sgml': + device = SGMLConverter(rsrc, outfp, codec=codec, + char_margin=char_margin, line_margin=line_margin, word_margin=word_margin) elif outtype == 'html': - device = HTMLConverter(rsrc, outfp, codec=codec, cluster_margin=cluster_margin, word_margin=word_margin, scale=scale) - elif outtype == 'text': - device = TextConverter(rsrc, outfp, codec=codec, 
cluster_margin=cluster_margin, word_margin=word_margin) + device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, + char_margin=char_margin, line_margin=line_margin, word_margin=word_margin) elif outtype == 'tag': device = TagExtractor(rsrc, outfp, codec=codec) else: