improved html.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@38 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
1d5492fe9f
commit
07fc1799b3
|
@ -144,7 +144,7 @@ def main(argv):
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
elif k == '-i': objids.extend( int(x) for x in v.split(',') )
|
||||||
elif k == '-p': pageids.update( int(x) for x in v.split(',') )
|
elif k == '-p': pageids.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-a': dumpall = True
|
elif k == '-a': dumpall = True
|
||||||
elif k == '-r': codec = 'raw'
|
elif k == '-r': codec = 'raw'
|
||||||
|
|
|
@ -93,7 +93,7 @@ class TextConverter(PDFDevice):
|
||||||
return
|
return
|
||||||
|
|
||||||
def begin_page(self, page):
|
def begin_page(self, page):
|
||||||
self.context = PageItem(str(page.pageid), page.mediabox, page.rotate)
|
self.context = PageItem(str(page.pageid+1), page.mediabox, page.rotate)
|
||||||
return
|
return
|
||||||
def end_page(self, _):
|
def end_page(self, _):
|
||||||
assert not self.stack
|
assert not self.stack
|
||||||
|
@ -166,8 +166,8 @@ class TextConverter(PDFDevice):
|
||||||
outfp.write('</page>\n')
|
outfp.write('</page>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def dump_html(self, outfp, codec, scale=1.2, pagepad=50):
|
def dump_html(self, outfp, codec, scale=1, pagepad=50, pagenum=True):
|
||||||
offset = 0
|
offset = pagepad
|
||||||
def f(item):
|
def f(item):
|
||||||
if isinstance(item, FigureItem):
|
if isinstance(item, FigureItem):
|
||||||
pass
|
pass
|
||||||
|
@ -183,9 +183,15 @@ class TextConverter(PDFDevice):
|
||||||
outfp.write('</span>\n')
|
outfp.write('</span>\n')
|
||||||
outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % codec)
|
outfp.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=%s">\n' % codec)
|
||||||
outfp.write('</head><body>\n')
|
outfp.write('</head><body>\n')
|
||||||
|
if pagenum:
|
||||||
|
outfp.write('<div>Page: %s</div>\n' %
|
||||||
|
', '.join('<a href="#%s">%s</a>' % (page.id,page.id) for page in self.pages ))
|
||||||
for page in self.pages:
|
for page in self.pages:
|
||||||
(x0,y0,x1,y1) = page.bbox
|
(x0,y0,x1,y1) = page.bbox
|
||||||
offset += y1
|
offset += y1
|
||||||
|
if pagenum:
|
||||||
|
outfp.write('<div style="position:absolute; top:%dpx;"><a name="%s">Page %s</a></div>' %
|
||||||
|
((offset-y1)*scale, page.id, page.id))
|
||||||
outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
outfp.write('<span style="position:absolute; border: 1px solid gray; '
|
||||||
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
'left:%dpx; top:%dpx; width:%dpx; height:%dpx;"></span>\n' %
|
||||||
(x0*scale, (offset-y1)*scale, (x1-x0)*scale, (y1-y0)*scale))
|
(x0*scale, (offset-y1)*scale, (x1-x0)*scale, (y1-y0)*scale))
|
||||||
|
@ -199,7 +205,7 @@ class TextConverter(PDFDevice):
|
||||||
# pdf2txt
|
# pdf2txt
|
||||||
class TextExtractionNotAllowed(RuntimeError): pass
|
class TextExtractionNotAllowed(RuntimeError): pass
|
||||||
|
|
||||||
def pdf2txt(outfp, rsrc, fname, pages, codec, html=False, password='', debug=0):
|
def pdf2txt(outfp, rsrc, fname, pages, codec, maxpages=10, html=False, password='', debug=0):
|
||||||
device = TextConverter(rsrc, debug=debug)
|
device = TextConverter(rsrc, debug=debug)
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname, 'rb')
|
fp = file(fname, 'rb')
|
||||||
|
@ -215,6 +221,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, html=False, password='', debug=0):
|
||||||
for (i,page) in enumerate(doc.get_pages(debug=debug)):
|
for (i,page) in enumerate(doc.get_pages(debug=debug)):
|
||||||
if pages and (i not in pages): continue
|
if pages and (i not in pages): continue
|
||||||
interpreter.process_page(page)
|
interpreter.process_page(page)
|
||||||
|
if maxpages and maxpages <= i+1: break
|
||||||
if html:
|
if html:
|
||||||
device.dump_html(outfp, codec)
|
device.dump_html(outfp, codec)
|
||||||
else:
|
else:
|
||||||
|
@ -245,7 +252,7 @@ def main(argv):
|
||||||
outfp = stdout
|
outfp = stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
if k == '-d': debug += 1
|
if k == '-d': debug += 1
|
||||||
elif k == '-p': pages.add(int(v))
|
elif k == '-p': pages.update( int(x)-1 for x in v.split(',') )
|
||||||
elif k == '-P': password = v
|
elif k == '-P': password = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-C': cmapdir = v
|
elif k == '-C': cmapdir = v
|
||||||
|
|
Loading…
Reference in New Issue