From c808175eb685f555d8697e1f6497f9cb7f0fb5ea Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Mon, 23 Jun 2008 13:22:27 +0000 Subject: [PATCH] html output mode added. git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@32 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdf2txt.py | 116 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 44 deletions(-) diff --git a/pdf2txt.py b/pdf2txt.py index efe9949..d4832bb 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -9,13 +9,18 @@ from pdfinterp import PDFDevice, PDFResourceManager, \ from cmap import CMapDB +def enc(x, codec): + x = x.replace('&','&').replace('>','>').replace('<','<') + return x.encode(codec, 'xmlcharrefreplace') + + ## PageItem ## -class PageItem: +class PageItem(object): def __init__(self, id, (x0,y0,x1,y1), rotate=0): self.id = id - self.bbox = (x0, y0, x1-x0, y1-y0) + self.bbox = (x0, y0, x1, y1) self.rotate = rotate self.objs = [] return @@ -26,15 +31,6 @@ class PageItem: def add(self, obj): self.objs.append(obj) return - - def dump(self, outfp, codec): - bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox - outfp.write('\n' % - (self.id, bbox, self.rotate)) - for obj in self.objs: - obj.dump(outfp, codec) - outfp.write('\n') - return ## FigureItem @@ -44,18 +40,10 @@ class FigureItem(PageItem): def __repr__(self): return ('
' % (self.id, self.bbox)) - def dump(self, outfp, codec): - bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox - outfp.write('
\n' % (self.id, bbox)) - for obj in self.objs: - obj.dump(outfp, codec) - outfp.write('
\n') - return - ## TextItem ## -class TextItem: +class TextItem(object): def __init__(self, matrix, font, fontsize, width, text): self.matrix = matrix @@ -69,12 +57,15 @@ class TextItem: self.direction = 1 (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001)) (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001)) - self.bbox = (tx, ty+descent, self.width, self.height) + ty += descent + self.bbox = (tx, ty, tx+self.width, ty+self.height) else: self.direction = 2 mindisp = min( d for (d,_) in text ) (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0)) - self.bbox = (tx-mindisp, ty+self.width, self.height, self.width) + tx -= mindisp + ty += self.width + self.bbox = (tx, ty, tx+self.height, ty+self.width) self.text = ''.join( c for (_,c) in text ) (w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize)) self.fontsize = max(w,h) @@ -83,17 +74,6 @@ class TextItem: def __repr__(self): return ('' % (self.matrix, self.font, self.fontsize, self.width, self.height, self.text)) - - def dump(self, outfp, codec): - def e(x): - x = x.replace('&','&').replace('>','>').replace('<','<') - return x.encode(codec, 'xmlcharrefreplace') - bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox - outfp.write('' % - (e(self.font.fontname), self.direction, bbox, self.fontsize)) - outfp.write(e(self.text)) - outfp.write('\n') - return ## TextConverter @@ -161,16 +141,61 @@ class TextConverter(PDFDevice): self.context.add(item) return - def dump(self, outfp, codec): + def dump_sgml(self, outfp, codec): + def f(item): + bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox + if isinstance(item, FigureItem): + outfp.write('
\n' % (item.id, bbox)) + for child in item.objs: + f(child) + outfp.write('
\n') + elif isinstance(item, TextItem): + outfp.write('' % + (enc(item.font.fontname, codec), item.direction, bbox, item.fontsize)) + outfp.write(enc(item.text, codec)) + outfp.write('\n') for page in self.pages: - page.dump(outfp, codec) + bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox + outfp.write('\n' % + (page.id, bbox, page.rotate)) + for child in page.objs: + f(child) + outfp.write('\n') + return + + def dump_html(self, outfp, codec, scale=1.2, pagepad=50): + offset = 0 + def f(item): + if isinstance(item, FigureItem): + pass + elif isinstance(item, TextItem): + if item.direction == 2: + wmode = 'tb-rl' + else: + wmode = 'lr-tb' + (x,_,_,y) = item.bbox + outfp.write('' % + (wmode, x*scale, (offset-y)*scale, item.fontsize*scale)) + outfp.write(enc(item.text, codec)) + outfp.write('\n') + outfp.write('\n') + for page in self.pages: + (x0,y0,x1,y1) = page.bbox + offset += y1 + outfp.write('\n' % + (x0*scale, (offset-y1)*scale, (x1-x0)*scale, (y1-y0)*scale)) + for child in page.objs: + f(child) + offset += pagepad + outfp.write('\n') return # pdf2txt class TextExtractionNotAllowed(RuntimeError): pass -def pdf2txt(outfp, rsrc, fname, pages, codec, password='', debug=0): +def pdf2txt(outfp, rsrc, fname, pages, codec, html=False, password='', debug=0): device = TextConverter(rsrc, debug=debug) doc = PDFDocument(debug=debug) fp = file(fname, 'rb') @@ -182,15 +207,16 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, password='', debug=0): if not doc.is_extractable: raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname) interpreter = PDFPageInterpreter(rsrc, device, debug=debug) - outfp.write('\n') + device.reset() for (i,page) in enumerate(doc.get_pages(debug=debug)): if pages and (i not in pages): continue - device.reset() interpreter.process_page(page) - device.dump(outfp, codec) - fp.close() + if html: + device.dump_html(outfp, codec) + else: + device.dump_sgml(outfp, codec) device.close() - outfp.write('\n') + fp.close() return @@ -198,10 +224,10 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, password='', debug=0): def main(argv): import getopt def usage(): - print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-o output] file ...' % argv[0] + print 'usage: %s [-d] [-p pages] [-P password] [-c codec] [-H] [-o output] file ...' % argv[0] return 100 try: - (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:o:') + (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:Ho:') except getopt.GetoptError: return usage() if not args: return usage() @@ -210,6 +236,7 @@ def main(argv): cdbcmapdir = 'CDBCMap' codec = 'ascii' pages = set() + html = False password = '' outfp = stdout for (k, v) in opts: @@ -217,12 +244,13 @@ def main(argv): elif k == '-p': pages.add(int(v)) elif k == '-P': password = v elif k == '-c': codec = v + elif k == '-H': html = True elif k == '-o': outfp = file(v, 'wb') # CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug) rsrc = PDFResourceManager(debug=debug) for fname in args: - pdf2txt(outfp, rsrc, fname, pages, codec, password=password, debug=debug) + pdf2txt(outfp, rsrc, fname, pages, codec, html=html, password=password, debug=debug) return if __name__ == '__main__': sys.exit(main(sys.argv))