imagewriter now works with text output
parent
91174b5665
commit
82ff98c7b3
|
@ -144,9 +144,10 @@ class PDFConverter(PDFLayoutAnalyzer):
|
||||||
class TextConverter(PDFConverter):
|
class TextConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
showpageno=False):
|
showpageno=False, imagewriter=None):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
|
self.imagewriter = imagewriter
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_text(self, text):
|
def write_text(self, text):
|
||||||
|
@ -162,19 +163,25 @@ class TextConverter(PDFConverter):
|
||||||
self.write_text(item.get_text())
|
self.write_text(item.get_text())
|
||||||
if isinstance(item, LTTextBox):
|
if isinstance(item, LTTextBox):
|
||||||
self.write_text('\n')
|
self.write_text('\n')
|
||||||
|
elif isinstance(item, LTImage):
|
||||||
|
if self.imagewriter is not None:
|
||||||
|
self.imagewriter.export_image(item)
|
||||||
if self.showpageno:
|
if self.showpageno:
|
||||||
self.write_text('Page %s\n' % ltpage.pageid)
|
self.write_text('Page %s\n' % ltpage.pageid)
|
||||||
render(ltpage)
|
render(ltpage)
|
||||||
self.write_text('\f')
|
self.write_text('\f')
|
||||||
return
|
return
|
||||||
|
|
||||||
# Some dummy functions to save memory/CPU when all that is wanted is text.
|
# Some dummy functions to save memory/CPU when all that is wanted
|
||||||
# This stops all the image and drawing output from being recorded and taking
|
# is text. This stops all the image and drawing output from being
|
||||||
# up RAM.
|
# recorded and taking up RAM.
|
||||||
def render_image(self, name, stream):
|
def render_image(self, name, stream):
|
||||||
pass
|
if self.imagewriter is None: return
|
||||||
|
PDFConverter.render_image(self, name, stream)
|
||||||
|
return
|
||||||
|
|
||||||
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
def paint_path(self, gstate, stroke, fill, evenodd, path):
|
||||||
pass
|
return
|
||||||
|
|
||||||
|
|
||||||
## HTMLConverter
|
## HTMLConverter
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
import sys
|
import sys
|
||||||
import struct
|
import struct
|
||||||
import os.path
|
import os, os.path
|
||||||
from pdftypes import LITERALS_DCT_DECODE
|
from pdftypes import LITERALS_DCT_DECODE
|
||||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||||
|
|
||||||
|
@ -54,6 +54,8 @@ class ImageWriter(object):
|
||||||
|
|
||||||
def __init__(self, outdir):
|
def __init__(self, outdir):
|
||||||
self.outdir = outdir
|
self.outdir = outdir
|
||||||
|
if not os.path.exists(self.outdir):
|
||||||
|
os.makedirs(self.outdir)
|
||||||
return
|
return
|
||||||
|
|
||||||
def export_image(self, image):
|
def export_image(self, image):
|
||||||
|
|
|
@ -80,7 +80,8 @@ def main(argv):
|
||||||
else:
|
else:
|
||||||
outfp = sys.stdout
|
outfp = sys.stdout
|
||||||
if outtype == 'text':
|
if outtype == 'text':
|
||||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
|
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||||
|
imagewriter=imagewriter)
|
||||||
elif outtype == 'xml':
|
elif outtype == 'xml':
|
||||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||||
imagewriter=imagewriter)
|
imagewriter=imagewriter)
|
||||||
|
|
Loading…
Reference in New Issue