diff --git a/pdfminer/converter.py b/pdfminer/converter.py index f5261da..7b1f54f 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -144,9 +144,10 @@ class PDFConverter(PDFLayoutAnalyzer): class TextConverter(PDFConverter): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, - showpageno=False): + showpageno=False, imagewriter=None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) self.showpageno = showpageno + self.imagewriter = imagewriter return def write_text(self, text): @@ -162,19 +163,25 @@ class TextConverter(PDFConverter): self.write_text(item.get_text()) if isinstance(item, LTTextBox): self.write_text('\n') + elif isinstance(item, LTImage): + if self.imagewriter is not None: + self.imagewriter.export_image(item) if self.showpageno: self.write_text('Page %s\n' % ltpage.pageid) render(ltpage) self.write_text('\f') return - # Some dummy functions to save memory/CPU when all that is wanted is text. - # This stops all the image and drawing ouput from being recorded and taking - # up RAM. + # Some dummy functions to save memory/CPU when all that is wanted + # is text. This stops all the image and drawing output from being + # recorded and taking up RAM. 
def render_image(self, name, stream): - pass + if self.imagewriter is None: return + PDFConverter.render_image(self, name, stream) + return + def paint_path(self, gstate, stroke, fill, evenodd, path): - pass + return ## HTMLConverter diff --git a/pdfminer/image.py b/pdfminer/image.py index c8ed4fd..87e6523 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -1,7 +1,7 @@ #!/usr/bin/env python2 import sys import struct -import os.path +import os, os.path from pdftypes import LITERALS_DCT_DECODE from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB @@ -54,6 +54,8 @@ class ImageWriter(object): def __init__(self, outdir): self.outdir = outdir + if not os.path.exists(self.outdir): + os.makedirs(self.outdir) return def export_image(self, image): diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index a0c1a51..2d233f9 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -80,7 +80,8 @@ def main(argv): else: outfp = sys.stdout if outtype == 'text': - device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams) + device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, + imagewriter=imagewriter) elif outtype == 'xml': device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)