From 395a8dc062ac18d002b6e2c8a5ddb876206432e8 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sun, 27 Jul 2008 04:30:37 +0000 Subject: [PATCH] tagged pdf extraction supported. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@45 1aa58f4a-7d42-0410-adbc-911cccaed67c --- README.html | 35 +++++--- TODO | 1 - pdflib/pdfinterp.py | 32 +++++-- tools/pdf2txt.py | 202 +++++++++++++++++++++++++++++++------------- 4 files changed, 190 insertions(+), 80 deletions(-) diff --git a/README.html b/README.html index 36cfa82..dd45552 100644 --- a/README.html +++ b/README.html @@ -11,7 +11,7 @@ blockquote { background: #eeeeee; }

PDFMiner

-Last Modified: Thu Jul 10 00:18:46 JST 2008 +Last Modified: Sun Jul 27 13:29:39 JST 2008
@@ -31,11 +31,14 @@ which could be useful for analyzing the document.

Features:

@@ -47,8 +50,8 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html

Download (source):
- -http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080710.tar.gz + +http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20080727.tar.gz (1.8Mbytes) @@ -68,7 +71,7 @@ http://pdf2html.tabesugi.net:8080/


How to Install

    -
  1. Install Python 2.4 or newer. +
  2. Install Python 2.5 or newer.
  3. Download the PDFMiner source.
  4. Extract it.
  5. Go to the pdfminer directory. @@ -135,7 +138,7 @@ Unicode Standard.

    Examples:

    -$ python -m tools.pdf2txt -H -o output.html samples/naacl06-shinyama.pdf
    +$ python -m tools.pdf2txt -o output.html samples/naacl06-shinyama.pdf
     (extract text as an HTML file whose filename is output.html)
     
     $ python -m tools.pdf2txt -c euc-jp samples/jo.pdf
    @@ -160,8 +163,15 @@ By default, it extracts texts from all the pages.
     
    -c codec
    Specifies the output codec for non-ASCII text.

    -

    -H -
    Speficies the output to be HTML file. +
    -t type +
    Specifies the output format. The following formats are currently supported. +
      +
    • html : HTML format. (Default) +
    • sgml : SGML format. +
    • tag : "Tagged PDF" format. A tagged PDF has its own contents annotated with +HTML-like tags. pdf2txt tries to extract its content streams rather than inferring its text locations. +Tags used here are defined in the PDF specification. +

    -P password
    Provides the user password to open the PDF file. @@ -228,6 +238,7 @@ no stream header is displayed for the ease of saving it to a file.

    Changes

      +
    • 2008/07/27: Tagged contents extraction support.
    • 2008/07/10: Outline (TOC) extraction support.
    • 2008/06/29: HTML output added. Reorganized the directory structure.
    • 2008/04/29: Bugfix for Win32. Thanks to Chris Clark. diff --git a/TODO b/TODO index 17c34b2..859ffe8 100644 --- a/TODO +++ b/TODO @@ -1,7 +1,6 @@ TODOs: - Documentation. - Error handling for invalid type. - - Tagged PDF. - Infer text stream by clustering. - Support writing/creating PDFs. diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 7e3da6c..72abe82 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -482,6 +482,13 @@ class PDFDevice(object): self.ctm = ctm return + def begin_tag(self, tag, props=None): + return + def end_tag(self): + return + def do_tag(self, tag, props=None): + return + def begin_page(self, page): return def end_page(self, page): @@ -619,6 +626,9 @@ class PDFPageInterpreter(object): self.debug = debug return + def dup(self): + return PDFPageInterpreter(self.rsrc, self.device, debug=self.debug) + def init_resources(self, resources): self.fontmap = {} self.xobjmap = {} @@ -836,11 +846,21 @@ class PDFPageInterpreter(object): def do_EX(self): return # marked content operators - def do_MP(self, tag): return - def do_DP(self, tag, props): return - def do_BMC(self, tag): return - def do_BDC(self, tag, props): return - def do_EMC(self): return + def do_MP(self, tag): + self.device.do_tag(tag) + return + def do_DP(self, tag, props): + self.device.do_tag(tag, props) + return + def do_BMC(self, tag): + self.device.begin_tag(tag) + return + def do_BDC(self, tag, props): + self.device.begin_tag(tag, props) + return + def do_EMC(self): + self.device.end_tag() + return # setcharspace def do_Tc(self, space): @@ -960,7 +980,7 @@ class PDFPageInterpreter(object): print >>stderr, 'Processing xobj: %r' % xobj subtype = xobj.dic.get('Subtype') if subtype == LITERAL_FORM and 'BBox' in xobj.dic: - interpreter = PDFPageInterpreter(self.rsrc, self.device) + interpreter = self.dup() (x0,y0,x1,y1) = xobj.dic['BBox'] ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm) (x0,y0) = apply_matrix(ctm, (x0,y0)) diff --git 
a/tools/pdf2txt.py b/tools/pdf2txt.py index bcba45f..8f153d9 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -10,9 +10,13 @@ from pdflib.cmap import CMapDB def enc(x, codec): - x = x.replace('&','&').replace('>','>').replace('<','<') + x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"') return x.encode(codec, 'xmlcharrefreplace') +def encprops(props, codec): + if not props: return '' + return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) ) + ## PageItem ## @@ -82,22 +86,22 @@ class TextItem(object): ## class TextConverter(PDFDevice): - def __init__(self, rsrc, debug=0): + def __init__(self, rsrc, outfp, codec='utf-8', debug=0): PDFDevice.__init__(self, rsrc, debug=debug) - self.reset() - return - - def reset(self): - self.pages = [] + self.outfp = outfp + self.codec = codec + self.pageno = 0 self.stack = [] return def begin_page(self, page): - self.context = PageItem(len(self.pages), page.mediabox, page.rotate) + self.context = PageItem(self.pageno, page.mediabox, page.rotate) return def end_page(self, _): assert not self.stack - self.pages.append(self.context) + assert isinstance(self.context, PageItem) + self.pageno += 1 + self.dump_page(self.context) return def begin_figure(self, name, bbox): @@ -143,31 +147,49 @@ class TextConverter(PDFDevice): font, textstate.fontsize, size, text) self.context.add(item) return - - def dump_sgml(self, outfp, codec): + + +## SGMLConverter +## +class SGMLConverter(TextConverter): + + def dump_page(self, page): def f(item): bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox if isinstance(item, FigureItem): - outfp.write('
      \n' % (item.id, bbox)) + self.outfp.write('
      \n' % (item.id, bbox)) for child in item.objs: f(child) - outfp.write('
      \n') + self.outfp.write('
      \n') elif isinstance(item, TextItem): - outfp.write('' % - (enc(item.font.fontname, codec), item.direction, bbox, item.fontsize)) - outfp.write(enc(item.text, codec)) - outfp.write('\n') - for page in self.pages: - bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox - outfp.write('\n' % - (page.id, bbox, page.rotate)) - for child in page.objs: - f(child) - outfp.write('\n') + self.outfp.write('' % + (enc(item.font.fontname, self.codec), item.direction, bbox, item.fontsize)) + self.outfp.write(enc(item.text, self.codec)) + self.outfp.write('\n') + bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox + self.outfp.write('\n' % + (page.id, bbox, page.rotate)) + for child in page.objs: + f(child) + self.outfp.write('\n') return - def dump_html(self, outfp, codec, scale=1, pagepad=50, pagenum=True): - offset = pagepad + +## HTMLConverter +## +class HTMLConverter(TextConverter): + + def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, debug=0): + TextConverter.__init__(self, rsrc, outfp, codec=codec, debug=debug) + self.pagenum = pagenum + self.pagepad = pagepad + self.scale = scale + self.outfp.write('\n' % self.codec) + self.outfp.write('\n') + self.yoffset = self.pagepad + return + + def dump_page(self, page): def f(item): if isinstance(item, FigureItem): pass @@ -177,36 +199,91 @@ class TextConverter(PDFDevice): else: wmode = 'lr-tb' (x,_,_,y) = item.bbox - outfp.write('' % - (wmode, x*scale, (offset-y)*scale, item.fontsize*scale)) - outfp.write(enc(item.text, codec)) - outfp.write('\n') - outfp.write('\n' % codec) - outfp.write('\n') - if pagenum: - outfp.write('
      Page: %s
      \n' % - ', '.join('%s' % (page.id,page.id) for page in self.pages )) - for page in self.pages: - (x0,y0,x1,y1) = page.bbox - offset += y1 - if pagenum: - outfp.write('' % - ((offset-y1)*scale, page.id, page.id)) - outfp.write('\n' % - (x0*scale, (offset-y1)*scale, (x1-x0)*scale, (y1-y0)*scale)) - for child in page.objs: - f(child) - offset += pagepad - outfp.write('\n') + self.outfp.write('' % + (wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale)) + self.outfp.write(enc(item.text, self.codec)) + self.outfp.write('\n') + (x0,y0,x1,y1) = page.bbox + self.yoffset += y1 + if self.pagenum: + self.outfp.write('' % + ((self.yoffset-y1)*self.scale, page.id, page.id)) + self.outfp.write('\n' % + (x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale)) + for child in page.objs: + f(child) + self.yoffset += self.pagepad + return + + def close(self): + self.outfp.write('
      Page: %s
      \n' % + ', '.join('%s' % (i,i) for i in xrange(self.pageno))) + self.outfp.write('\n') + return + + +## TagExtractor +## +class TagExtractor(PDFDevice): + + def __init__(self, rsrc, outfp, codec='utf-8', debug=0): + PDFDevice.__init__(self, rsrc, debug=debug) + self.outfp = outfp + self.codec = codec + self.pageno = 0 + self.tag = None + return + + def render_image(self, stream, size, matrix): + return + + def render_string(self, textstate, textmatrix, size, seq): + font = textstate.font + text = '' + for x in seq: + if not isinstance(x, str): continue + chars = font.decode(x) + for cid in chars: + try: + char = font.to_unicode(cid) + text += char + except PDFUnicodeNotDefined, e: + pass + self.outfp.write(enc(text, self.codec)) + return + + def begin_page(self, page): + (x0, y0, x1, y1) = page.mediabox + bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1) + self.outfp.write('' % + (self.pageno, bbox, page.rotate)) + return + def end_page(self, _): + self.outfp.write('\n') + self.pageno += 1 + return + + def begin_tag(self, tag, props=None): + self.outfp.write('<%s%s>' % (enc(tag.name, self.codec), encprops(props, self.codec))) + self.tag = tag + return + + def end_tag(self): + assert self.tag + self.outfp.write('' % enc(self.tag.name, self.codec)) + self.tag = None + return + + def do_tag(self, tag, props=None): + self.outfp.write('<%s%s/>' % (enc(tag.name, self.codec), encprops(props, self.codec))) return # pdf2txt class TextExtractionNotAllowed(RuntimeError): pass -def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password='', debug=0): - device = TextConverter(rsrc, debug=debug) +def convert(outfp, rsrc, device, fname, pagenos, maxpages=0, password='', debug=0): doc = PDFDocument(debug=debug) fp = file(fname, 'rb') parser = PDFParser(doc, fp, debug=debug) @@ -217,15 +294,10 @@ def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password if not doc.is_extractable: raise TextExtractionNotAllowed('text extraction is not 
allowed: %r' % fname) interpreter = PDFPageInterpreter(rsrc, device, debug=debug) - device.reset() for (pageno,page) in enumerate(doc.get_pages(debug=debug)): if pagenos and (pageno not in pagenos): continue interpreter.process_page(page) if maxpages and maxpages <= pageno+1: break - if html: - device.dump_html(outfp, codec) - else: - device.dump_sgml(outfp, codec) device.close() fp.close() return @@ -235,10 +307,10 @@ def pdf2txt(outfp, rsrc, fname, pagenos, codec, maxpages=0, html=False, password def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] + print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:C:D:m:') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:') except getopt.GetoptError: return usage() if not args: return usage() @@ -248,7 +320,7 @@ def main(argv): codec = 'ascii' pagenos = set() maxpages = 0 - html = False + outtype = 'html' password = '' outfp = stdout for (k, v) in opts: @@ -259,14 +331,22 @@ def main(argv): elif k == '-m': maxpages = int(v) elif k == '-C': cmapdir = v elif k == '-D': cdbcmapdir = v - elif k == '-H': html = True + elif k == '-t': outtype = v elif k == '-o': outfp = file(v, 'wb') # CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) rsrc = PDFResourceManager(debug=debug) + if outtype == 'sgml': + device = SGMLConverter(rsrc, outfp, codec, debug=debug) + elif outtype == 'html': + device = HTMLConverter(rsrc, outfp, codec, debug=debug) + elif outtype == 'tag': + device = TagExtractor(rsrc, outfp, codec, debug=debug) + else: + return usage() for fname in args: - pdf2txt(outfp, rsrc, fname, pagenos, codec, - maxpages=maxpages, html=html, password=password, debug=debug) + convert(outfp, rsrc, device, fname, pagenos, + maxpages=maxpages, password=password, debug=debug) return if __name__ == '__main__': 
sys.exit(main(sys.argv))