#!/usr/bin/env python
"""Command-line tool that dumps the text of PDF files as tagged text lines."""
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser
from pdfinterp import PDFDevice, PDFResourceManager, \
     PDFPageInterpreter, PDFUnicodeNotDefined, \
     mult_matrix, apply_matrix
from cmap import CMapDB


##  TextConverter
##
class TextConverter(PDFDevice):
    """PDFDevice that writes every rendered string to a file as one tagged line.

    outfp -- writable file object that receives the output.
    rsrc  -- resource manager handed on to PDFDevice.__init__.
    codec -- name of the codec used to encode the extracted text.
    """

    def __init__(self, outfp, rsrc, codec):
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec
        return

    def close(self):
        """Terminate the output of one document with a newline."""
        self.outfp.write('\n')
        return

    def begin_block(self, name):
        """Write the opening tag of a block called `name`."""
        # The format string must contain a placeholder for `name`;
        # formatting a bare '\n' with an argument raises TypeError.
        self.outfp.write('<block name="%s">\n' % name)
        return

    def end_block(self):
        """Write the closing tag matching begin_block()."""
        self.outfp.write('</block>\n')
        return

    def render_string(self, textstate, textmatrix, size, seq):
        """Write one text run as a single tagged line.

        `seq` mixes numbers and encoded strings.  For a non-vertical font a
        number at or below the space-width threshold becomes one space;
        every other number is dropped.  Strings are split into character
        ids with font.decode() and mapped with font.to_unicode(); an id
        without a Unicode mapping is written as a "[coding:cid]"
        placeholder.
        """
        font = textstate.font
        spwidth = int(-font.char_width(32) * 0.6)  # space width
        horizontal = not font.is_vertical()
        pieces = []
        for x in seq:
            if isinstance(x, (int, float)):
                if horizontal and x <= spwidth:
                    pieces.append(u' ')
            else:
                for cid in font.decode(x):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined as e:
                        (cidcoding, badcid) = e.args
                        char = u'[%s:%d]' % (cidcoding, badcid)
                    pieces.append(char)
        buf = u''.join(pieces)
        (a, b, c, d, tx, ty) = mult_matrix(textmatrix, self.ctm)
        # Non-zero off-diagonal terms mean the text is not axis-aligned.
        skewed = (b != 0 or c != 0)
        if horizontal:
            tag = 'htext'
        else:
            size = -size
            tag = 'vtext'
        if skewed:
            tag += ' skewed'
        # Characters the codec cannot represent become &#NNN; references.
        # NOTE(review): '<' and '&' inside the text or the font name are
        # written unescaped -- confirm whether consumers rely on that.
        s = buf.encode(self.codec, 'xmlcharrefreplace')
        # Transform width and font size without the translation part.
        (w, fs) = apply_matrix((a, b, c, d, 0, 0), (size, textstate.fontsize))

        def f(x):
            return '%.03f' % x

        # Eight values are supplied, so the template needs the closing
        # "</%s>" as well; without it the formatting raises TypeError.
        # The closing tag repeats the full tag string, " skewed" included.
        self.outfp.write(
            '<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
            (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
        return


# pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
    """Convert the PDF file `fname` and write the result to `outfp`.

    outfp -- writable file object.
    rsrc  -- resource manager shared by the device and the interpreter.
    fname -- path of the PDF file to read.
    pages -- collection of zero-based page indices to process; when it is
             empty every page is processed.
    codec -- codec name passed to TextConverter.
    debug -- debug level passed on to the PDF classes.
    """
    device = TextConverter(outfp, rsrc, codec)
    doc = PDFDocument(debug=debug)
    # PDF is a binary format, so never let the platform translate newlines.
    fp = open(fname, 'rb')
    try:
        # The parser object is not used directly below; presumably its
        # constructor attaches the file to `doc` -- verify in pdfparser.
        parser = PDFParser(doc, fp, debug=debug)
        interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
        for (i, page) in enumerate(doc.get_pages(debug=debug)):
            if pages and (i not in pages):
                continue
            interpreter.process_page(page)
    finally:
        # Release the input file even when parsing fails.
        fp.close()
    device.close()
    return


# main
def main(argv):
    """Run the command-line interface.

    Options:
      -d         raise the debug level (may be repeated)
      -v         accepted for compatibility, currently ignored
      -c codec   output codec (default: ascii)
      -p pages   zero-based page index, or a comma-separated list of them;
                 may be repeated
      -o output  write to this file instead of standard output

    Returns 100 after printing the usage message for bad arguments,
    otherwise None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-v] [-c codec] [-p pages] [-o output] file ...'
              % argv[0])
        return 100

    try:
        # 'o:' is required here, otherwise the -o branch below can never run.
        (opts, args) = getopt.getopt(argv[1:], 'dvp:c:o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    outfp = stdout
    opened = None  # file opened for -o, which this function must close
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pages.update(int(x) for x in v.split(','))
        elif k == '-o':
            if opened is not None:
                opened.close()
            opened = outfp = open(v, 'wb')
        elif k == '-c':
            codec = v
    try:
        CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
        rsrc = PDFResourceManager(debug=debug)
        for fname in args:
            pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug)
    finally:
        if opened is not None:
            opened.close()
    return


if __name__ == '__main__':
    sys.exit(main(sys.argv))