pdfminer.six/pdf2txt.py

112 lines
3.0 KiB
Python
Executable File

#!/usr/bin/env python
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdfparser import PDFDocument, PDFParser
from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix
from cmap import CMapDB
## TextConverter
##
class TextConverter(PDFDevice):
    """PDFDevice that writes rendered text to a stream as XML-like tagged lines.

    Every call to render_string() emits one <htext> or <vtext> element
    carrying the font name, size, position and width as attributes and the
    decoded text as content.  begin_block()/end_block() wrap output in
    <block> elements.

    outfp -- object with a write() method that receives the output
    rsrc  -- resource manager, passed through to PDFDevice.__init__
    codec -- name of the encoding used for the text content
    """

    def __init__(self, outfp, rsrc, codec):
        PDFDevice.__init__(self, rsrc)
        self.outfp = outfp
        self.codec = codec
        return

    @staticmethod
    def _escape(text):
        """Return *text* with the markup-significant characters & < > " escaped."""
        # '&' must be handled first so the entities produced below are not
        # themselves re-escaped.
        return (text.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;')
                    .replace('"', '&quot;'))

    def close(self):
        """Terminate the output with a newline (the stream itself is left open)."""
        self.outfp.write('\n')
        return

    def begin_block(self, name):
        """Write the opening <block> tag with *name* as its name attribute."""
        self.outfp.write('<block name="%s">\n' % self._escape('%s' % name))
        return

    def end_block(self):
        """Write the closing </block> tag."""
        self.outfp.write('</block>\n')
        return

    def render_string(self, textstate, textmatrix, size, seq):
        """Decode one text-showing sequence and write it as a single element.

        textstate  -- supplies the current font (textstate.font) and
                      textstate.fontsize
        textmatrix -- text matrix; combined with self.ctm to obtain the
                      position and scaling written to the output
        size       -- extent of the string; negated for vertical fonts
                      before being transformed
        seq        -- iterable mixing numbers and encoded strings: a number
                      at or below the space threshold becomes a single
                      space (horizontal fonts only), anything else is
                      decoded through the font into characters
        """
        font = textstate.font
        # Numbers at or below this (negative) threshold are rendered as a space.
        spwidth = int(-font.char_width(32) * 0.6)
        vertical = font.is_vertical()
        parts = []
        for x in seq:
            if isinstance(x, (int, float)):
                if not vertical and x <= spwidth:
                    parts.append(' ')
            else:
                for cid in font.decode(x):
                    try:
                        char = font.to_unicode(cid)
                    except PDFUnicodeNotDefined as e:
                        # No Unicode mapping: emit a visible placeholder
                        # naming the coding and the character id.
                        (cidcoding, cid) = e.args
                        char = u'[%s:%d]' % (cidcoding, cid)
                    parts.append(char)
        buf = ''.join(parts)
        (a, b, c, d, tx, ty) = mult_matrix(textmatrix, self.ctm)
        skewed = (b != 0 or c != 0)
        if vertical:
            size = -size
            element = 'vtext'
        else:
            element = 'htext'
        # The bare "skewed" flag belongs to the opening tag only; the closing
        # tag must carry just the element name.
        open_tag = element
        if skewed:
            open_tag += ' skewed'
        # Escape before encoding so the character references generated by
        # 'xmlcharrefreplace' are not escaped a second time.
        s = self._escape(buf).encode(self.codec, 'xmlcharrefreplace')
        (w, fs) = apply_matrix((a, b, c, d, 0, 0), (size, textstate.fontsize))
        fontname = self._escape('%s' % (font.fontname,))

        def f(x):
            return '%.03f' % x

        self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
                         (open_tag, fontname, f(fs), f(tx), f(ty), f(w), s, element))
        return
# pdf2txt
def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
    """Convert the PDF file *fname* to tagged text written to *outfp*.

    outfp -- output stream handed to TextConverter
    rsrc  -- resource manager shared by the device and the interpreter
    fname -- path of the PDF file to read
    pages -- collection of zero-based page indices to process; when it is
             empty every page is processed
    codec -- output encoding name handed to TextConverter
    debug -- debug level forwarded to the parser, document and interpreter
    """
    device = TextConverter(outfp, rsrc, codec)
    doc = PDFDocument(debug=debug)
    # PDF is a binary format: text mode can corrupt or truncate the data on
    # platforms that translate line endings.
    fp = open(fname, 'rb')
    try:
        # The parser object is not used directly below; it is constructed
        # with the document and the file (presumably this is what feeds
        # `doc` -- confirm against PDFParser).
        parser = PDFParser(doc, fp, debug=debug)
        interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
        for (i, page) in enumerate(doc.get_pages(debug=debug)):
            if pages and (i not in pages):
                continue
            interpreter.process_page(page)
    finally:
        # Release the input file even when parsing or rendering fails.
        fp.close()
    device.close()
    return
# main
def main(argv):
    """Command-line entry point: convert each PDF named in *argv*.

    Options:
      -d         raise the debug level (may be repeated)
      -v         accepted for backward compatibility; has no effect
      -c codec   output encoding (default 'ascii')
      -p pages   zero-based page index to extract; may be repeated or given
                 as a comma-separated list (default: every page)
      -o output  write to this file instead of standard output

    Returns 100 after printing a usage line when the arguments are invalid,
    and None on success.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-v] [-c codec] [-p pages] [-o output] file ...' % argv[0])
        return 100

    try:
        # 'o:' must be declared here, otherwise getopt rejects -o and the
        # corresponding branch below can never run.
        (opts, args) = getopt.getopt(argv[1:], 'dvp:c:o:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pages = set()
    outfile = None
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            try:
                pages.update(int(x) for x in v.split(','))
            except ValueError:
                # A non-numeric page is a usage error, not a crash.
                return usage()
        elif k == '-o':
            outfile = v
        elif k == '-c':
            codec = v
    #
    CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
    rsrc = PDFResourceManager(debug=debug)
    # The output file is opened only once every option has been validated,
    # so a usage error never leaves an empty file behind.
    if outfile is None:
        outfp = stdout
    else:
        outfp = open(outfile, 'wb')
    try:
        for fname in args:
            pdf2txt(outfp, rsrc, fname, pages, codec, debug=debug)
    finally:
        # Close only a file opened here; the shared stdout stays open.
        if outfile is not None:
            outfp.close()
    return
# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))