353 lines
11 KiB
Python
Executable File
353 lines
11 KiB
Python
Executable File
#!/usr/bin/env python
|
|
import sys
|
|
stdout = sys.stdout
|
|
stderr = sys.stderr
|
|
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
|
|
from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \
|
|
PDFPageInterpreter, PDFUnicodeNotDefined, \
|
|
mult_matrix, apply_matrix
|
|
from pdflib.cmap import CMapDB
|
|
|
|
|
|
def enc(x, codec):
|
|
x = x.replace('&','&').replace('>','>').replace('<','<').replace('"','"')
|
|
return x.encode(codec, 'xmlcharrefreplace')
|
|
|
|
def encprops(props, codec):
|
|
if not props: return ''
|
|
return ''.join( ' %s="%s"' % (enc(k,codec), enc(str(v),codec)) for (k,v) in sorted(props.iteritems()) )
|
|
|
|
|
|
## PageItem
|
|
##
|
|
class PageItem(object):
|
|
|
|
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
|
|
self.id = id
|
|
self.bbox = (x0, y0, x1, y1)
|
|
self.rotate = rotate
|
|
self.objs = []
|
|
return
|
|
|
|
def __repr__(self):
|
|
return ('<page id=%r bbox=%r rotate=%r>' % (self.id, self.bbox, self.rotate))
|
|
|
|
def add(self, obj):
|
|
self.objs.append(obj)
|
|
return
|
|
|
|
|
|
## FigureItem
|
|
##
|
|
class FigureItem(PageItem):
|
|
|
|
def __repr__(self):
|
|
return ('<figure id=%r bbox=%r>' % (self.id, self.bbox))
|
|
|
|
|
|
## TextItem
|
|
##
|
|
class TextItem(object):
|
|
|
|
def __init__(self, matrix, font, fontsize, width, text):
|
|
self.matrix = matrix
|
|
self.font = font
|
|
(a,b,c,d,tx,ty) = self.matrix
|
|
self.origin = (tx,ty)
|
|
self.direction = 0
|
|
if not self.font.is_vertical():
|
|
self.direction = 1
|
|
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
|
self.width = abs(self.width)
|
|
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
|
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
|
ty += descent
|
|
self.bbox = (tx, ty, tx+self.width, ty+self.height)
|
|
else:
|
|
self.direction = 2
|
|
(self.width, self.height) = apply_matrix((a,b,c,d,0,0), (fontsize,width))
|
|
self.width = abs(self.width)
|
|
(disp,_) = text[0]
|
|
(_,disp) = apply_matrix((a,b,c,d,0,0), (0, (1000-disp)*fontsize*0.001))
|
|
tx -= self.width/2
|
|
ty += disp
|
|
self.bbox = (tx, ty+self.height, tx+self.width, ty)
|
|
self.text = ''.join( c for (_,c) in text )
|
|
(w,h) = apply_matrix((a,b,c,d,0,0), (fontsize,fontsize))
|
|
self.fontsize = max(w,h)
|
|
return
|
|
|
|
def __repr__(self):
|
|
return ('<text matrix=%r font=%r fontsize=%r width=%r height=%r text=%r>' %
|
|
(self.matrix, self.font, self.fontsize, self.width, self.height, self.text))
|
|
|
|
|
|
## TextConverter
|
|
##
|
|
class TextConverter(PDFDevice):
|
|
|
|
def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
|
|
PDFDevice.__init__(self, rsrc, debug=debug)
|
|
self.outfp = outfp
|
|
self.codec = codec
|
|
self.pageno = 0
|
|
self.stack = []
|
|
return
|
|
|
|
def begin_page(self, page):
|
|
self.context = PageItem(self.pageno, page.mediabox, page.rotate)
|
|
return
|
|
def end_page(self, _):
|
|
assert not self.stack
|
|
assert isinstance(self.context, PageItem)
|
|
self.pageno += 1
|
|
self.dump_page(self.context)
|
|
return
|
|
|
|
def begin_figure(self, name, bbox):
|
|
self.stack.append(self.context)
|
|
self.context = FigureItem(name, bbox)
|
|
return
|
|
def end_figure(self, _):
|
|
fig = self.context
|
|
self.context = self.stack.pop()
|
|
self.context.add(fig)
|
|
return
|
|
|
|
def render_image(self, stream, size, matrix):
|
|
return
|
|
|
|
def handle_undefined_char(self, cidcoding, cid):
|
|
if self.debug:
|
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
|
#return unichr(cid)
|
|
return None
|
|
|
|
def render_string(self, textstate, textmatrix, size, seq):
|
|
font = textstate.font
|
|
spwidth = int(-font.char_width(32) * 0.6) # space width
|
|
text = []
|
|
for x in seq:
|
|
if isinstance(x, int) or isinstance(x, float):
|
|
if not font.is_vertical() and x <= spwidth:
|
|
text.append((0, ' '))
|
|
else:
|
|
chars = font.decode(x)
|
|
for cid in chars:
|
|
try:
|
|
char = font.to_unicode(cid)
|
|
text.append((font.char_disp(cid), char))
|
|
except PDFUnicodeNotDefined, e:
|
|
(cidcoding, cid) = e.args
|
|
s = self.handle_undefined_char(cidcoding, cid)
|
|
if s:
|
|
text.append(s)
|
|
if text:
|
|
item = TextItem(mult_matrix(textmatrix, self.ctm),
|
|
font, textstate.fontsize, size, text)
|
|
self.context.add(item)
|
|
return
|
|
|
|
|
|
## SGMLConverter
|
|
##
|
|
class SGMLConverter(TextConverter):
|
|
|
|
def dump_page(self, page):
|
|
def f(item):
|
|
bbox = '%.3f,%.3f,%.3f,%.3f' % item.bbox
|
|
if isinstance(item, FigureItem):
|
|
self.outfp.write('<figure id="%s" bbox="%s">\n' % (item.id, bbox))
|
|
for child in item.objs:
|
|
f(child)
|
|
self.outfp.write('</figure>\n')
|
|
elif isinstance(item, TextItem):
|
|
self.outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
|
|
(enc(item.font.fontname, self.codec), item.direction, bbox, item.fontsize))
|
|
self.outfp.write(enc(item.text, self.codec))
|
|
self.outfp.write('</text>\n')
|
|
bbox = '%.3f,%.3f,%.3f,%.3f' % page.bbox
|
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">\n' %
|
|
(page.id, bbox, page.rotate))
|
|
for child in page.objs:
|
|
f(child)
|
|
self.outfp.write('</page>\n')
|
|
return
|
|
|
|
|
|
## HTMLConverter
|
|
##
|
|
class HTMLConverter(TextConverter):
|
|
|
|
def __init__(self, rsrc, outfp, codec='utf-8', pagenum=True, pagepad=50, scale=1, debug=0):
|
|
TextConverter.__init__(self, rsrc, outfp, codec=codec, debug=debug)
|
|
self.pagenum = pagenum
|
|
self.pagepad = pagepad
|
|
self.scale = scale
|
|
self.outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % self.codec)
|
|
self.outfp.write('</head><body>\n')
|
|
self.yoffset = self.pagepad
|
|
return
|
|
|
|
def dump_page(self, page):
|
|
def f(item):
|
|
if isinstance(item, FigureItem):
|
|
pass
|
|
elif isinstance(item, TextItem):
|
|
if item.direction == 2:
|
|
wmode = 'tb-rl'
|
|
else:
|
|
wmode = 'lr-tb'
|
|
(x,_,_,y) = item.bbox
|
|
self.outfp.write('<span style="position:absolute; writing-mode:%s; left:%dpx; top:%dpx; font-size:%dpx;">' %
|
|
(wmode, x*self.scale, (self.yoffset-y)*self.scale, item.fontsize*self.scale))
|
|
self.outfp.write(enc(item.text, self.codec))
|
|
self.outfp.write('</span>\n')
|
|
(x0,y0,x1,y1) = page.bbox
|
|
self.yoffset += y1
|
|
if self.pagenum:
|
|
self.outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' %
|
|
((self.yoffset-y1)*self.scale, page.id, page.id))
|
|
self.outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
|
(x0*self.scale, (self.yoffset-y1)*self.scale, (x1-x0)*self.scale, (y1-y0)*self.scale))
|
|
for child in page.objs:
|
|
f(child)
|
|
self.yoffset += self.pagepad
|
|
return
|
|
|
|
def close(self):
|
|
self.outfp.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
|
|
', '.join('<a href="#%s">%s</a>' % (i,i) for i in xrange(self.pageno)))
|
|
self.outfp.write('</body></html>\n')
|
|
return
|
|
|
|
|
|
## TagExtractor
|
|
##
|
|
class TagExtractor(PDFDevice):
|
|
|
|
def __init__(self, rsrc, outfp, codec='utf-8', debug=0):
|
|
PDFDevice.__init__(self, rsrc, debug=debug)
|
|
self.outfp = outfp
|
|
self.codec = codec
|
|
self.pageno = 0
|
|
self.tag = None
|
|
return
|
|
|
|
def render_image(self, stream, size, matrix):
|
|
return
|
|
|
|
def render_string(self, textstate, textmatrix, size, seq):
|
|
font = textstate.font
|
|
text = ''
|
|
for x in seq:
|
|
if not isinstance(x, str): continue
|
|
chars = font.decode(x)
|
|
for cid in chars:
|
|
try:
|
|
char = font.to_unicode(cid)
|
|
text += char
|
|
except PDFUnicodeNotDefined, e:
|
|
pass
|
|
self.outfp.write(enc(text, self.codec))
|
|
return
|
|
|
|
def begin_page(self, page):
|
|
(x0, y0, x1, y1) = page.mediabox
|
|
bbox = '%.3f,%.3f,%.3f,%.3f' % (x0, y0, x1, y1)
|
|
self.outfp.write('<page id="%s" bbox="%s" rotate="%d">' %
|
|
(self.pageno, bbox, page.rotate))
|
|
return
|
|
def end_page(self, _):
|
|
self.outfp.write('</page>\n')
|
|
self.pageno += 1
|
|
return
|
|
|
|
def begin_tag(self, tag, props=None):
|
|
self.outfp.write('<%s%s>' % (enc(tag.name, self.codec), encprops(props, self.codec)))
|
|
self.tag = tag
|
|
return
|
|
|
|
def end_tag(self):
|
|
assert self.tag
|
|
self.outfp.write('</%s>' % enc(self.tag.name, self.codec))
|
|
self.tag = None
|
|
return
|
|
|
|
def do_tag(self, tag, props=None):
|
|
self.outfp.write('<%s%s/>' % (enc(tag.name, self.codec), encprops(props, self.codec)))
|
|
return
|
|
|
|
|
|
# pdf2txt
|
|
class TextExtractionNotAllowed(RuntimeError): pass
|
|
|
|
def convert(outfp, rsrc, device, fname, pagenos, maxpages=0, password='', debug=0):
|
|
doc = PDFDocument(debug=debug)
|
|
fp = file(fname, 'rb')
|
|
parser = PDFParser(doc, fp, debug=debug)
|
|
try:
|
|
doc.initialize(password)
|
|
except PDFPasswordIncorrect:
|
|
raise TextExtractionNotAllowed('incorrect password')
|
|
if not doc.is_extractable:
|
|
raise TextExtractionNotAllowed('text extraction is not allowed: %r' % fname)
|
|
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
|
|
for (pageno,page) in enumerate(doc.get_pages(debug=debug)):
|
|
if pagenos and (pageno not in pagenos): continue
|
|
interpreter.process_page(page)
|
|
if maxpages and maxpages <= pageno+1: break
|
|
device.close()
|
|
fp.close()
|
|
return
|
|
|
|
|
|
# main
|
|
def main(argv):
|
|
import getopt
|
|
def usage():
|
|
print 'usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0]
|
|
return 100
|
|
try:
|
|
(opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
|
|
except getopt.GetoptError:
|
|
return usage()
|
|
if not args: return usage()
|
|
debug = 0
|
|
cmapdir = 'CMap'
|
|
cdbcmapdir = 'CDBCMap'
|
|
codec = 'ascii'
|
|
pagenos = set()
|
|
maxpages = 0
|
|
outtype = 'html'
|
|
password = ''
|
|
outfp = stdout
|
|
for (k, v) in opts:
|
|
if k == '-d': debug += 1
|
|
elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
|
|
elif k == '-P': password = v
|
|
elif k == '-c': codec = v
|
|
elif k == '-m': maxpages = int(v)
|
|
elif k == '-C': cmapdir = v
|
|
elif k == '-D': cdbcmapdir = v
|
|
elif k == '-t': outtype = v
|
|
elif k == '-o': outfp = file(v, 'wb')
|
|
#
|
|
CMapDB.initialize(cmapdir, cdbcmapdir, debug=debug)
|
|
rsrc = PDFResourceManager(debug=debug)
|
|
if outtype == 'sgml':
|
|
device = SGMLConverter(rsrc, outfp, codec, debug=debug)
|
|
elif outtype == 'html':
|
|
device = HTMLConverter(rsrc, outfp, codec, debug=debug)
|
|
elif outtype == 'tag':
|
|
device = TagExtractor(rsrc, outfp, codec, debug=debug)
|
|
else:
|
|
return usage()
|
|
for fname in args:
|
|
convert(outfp, rsrc, device, fname, pagenos,
|
|
maxpages=maxpages, password=password, debug=debug)
|
|
return
|
|
|
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|