imagewriter now works with text output

pull/1/head
Yusuke Shinyama 2011-11-07 01:15:10 +10:00
parent 91174b5665
commit 82ff98c7b3
3 changed files with 18 additions and 8 deletions

View File

@ -144,9 +144,10 @@ class PDFConverter(PDFLayoutAnalyzer):
class TextConverter(PDFConverter): class TextConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
showpageno=False): showpageno=False, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.showpageno = showpageno self.showpageno = showpageno
self.imagewriter = imagewriter
return return
def write_text(self, text): def write_text(self, text):
@ -162,19 +163,25 @@ class TextConverter(PDFConverter):
self.write_text(item.get_text()) self.write_text(item.get_text())
if isinstance(item, LTTextBox): if isinstance(item, LTTextBox):
self.write_text('\n') self.write_text('\n')
elif isinstance(item, LTImage):
if self.imagewriter is not None:
self.imagewriter.export_image(item)
if self.showpageno: if self.showpageno:
self.write_text('Page %s\n' % ltpage.pageid) self.write_text('Page %s\n' % ltpage.pageid)
render(ltpage) render(ltpage)
self.write_text('\f') self.write_text('\f')
return return
# Some dummy functions to save memory/CPU when all that is wanted is text. # Some dummy functions to save memory/CPU when all that is wanted
# This stops all the image and drawing output from being recorded and taking # is text. This stops all the image and drawing output from being
# up RAM. # recorded and taking up RAM.
def render_image(self, name, stream): def render_image(self, name, stream):
pass if self.imagewriter is None: return
PDFConverter.render_image(self, name, stream)
return
def paint_path(self, gstate, stroke, fill, evenodd, path): def paint_path(self, gstate, stroke, fill, evenodd, path):
pass return
## HTMLConverter ## HTMLConverter

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import sys import sys
import struct import struct
import os.path import os, os.path
from pdftypes import LITERALS_DCT_DECODE from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
@ -54,6 +54,8 @@ class ImageWriter(object):
def __init__(self, outdir): def __init__(self, outdir):
self.outdir = outdir self.outdir = outdir
if not os.path.exists(self.outdir):
os.makedirs(self.outdir)
return return
def export_image(self, image): def export_image(self, image):

View File

@ -80,7 +80,8 @@ def main(argv):
else: else:
outfp = sys.stdout outfp = sys.stdout
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter)