diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py index 2838d53..140d2fd 100755 --- a/pdfminer/ccitt.py +++ b/pdfminer/ccitt.py @@ -329,6 +329,8 @@ class CCITTG4Parser(BitParser): except self.ByteSkip: self._accept = self._parse_mode self._state = self.MODE + except self.EOFB: + break return def _parse_mode(self, mode): @@ -394,7 +396,7 @@ class CCITTG4Parser(BitParser): def _get_bits(self): return ''.join( str(b) for b in self._curline[:self._curpos] ) - + def _get_refline(self, i): if i < 0: return '[]'+''.join( str(b) for b in self._refline ) @@ -667,6 +669,43 @@ class TestCCITTG4Parser(unittest.TestCase): return +## CCITTFaxDecoder +## +class CCITTFaxDecoder(CCITTG4Parser): + + def __init__(self, width, bytealign=False, reversed=False): + CCITTG4Parser.__init__(self, width, bytealign=bytealign) + self.reversed = reversed + self._buf = '' + return + + def close(self): + return self._buf + + def output_line(self, y, bits): + bytes = array.array('B', [0]*((len(bits)+7)/8)) + if self.reversed: + bits = [ not b for b in bits ] + for (i,b) in enumerate(bits): + if b: + bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8] + self._buf += bytes.tostring() + return + + +def ccittfaxdecode(data, params): + K = params.get('K') + cols = params.get('Columns') + bytealign = params.get('EncodedByteAlign') + reversed = params.get('BlackIs1') + if K == -1: + parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed) + else: + raise ValueError(K) + parser.feedbytes(data) + return parser.close() + + # test def main(argv): import pygame @@ -691,10 +730,7 @@ def main(argv): fp = file(path,'rb') (_,_,k,w,h,_) = path.split('.') parser = Parser(int(w)) - try: - parser.feedbytes(fp.read()) - except parser.EOFB: - pass + parser.feedbytes(fp.read()) parser.close() fp.close() return diff --git a/pdfminer/converter.py b/pdfminer/converter.py index a3b6f4e..f5261da 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -1,14 +1,12 @@ #!/usr/bin/env python2 -import sys, os.path +import sys from pdfdevice import PDFDevice, PDFTextDevice from pdffont import PDFUnicodeNotDefined -from pdftypes import LITERALS_DCT_DECODE -from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve from layout import LTFigure, LTImage, LTChar, LTTextLine from layout import LTTextBox, LTTextBoxVertical, LTTextGroup from utils import apply_matrix_pt, mult_matrix -from utils import enc, bbox2str, create_bmp +from utils import enc, bbox2str ## PDFLayoutAnalyzer @@ -139,28 +137,6 @@ class PDFConverter(PDFLayoutAnalyzer): self.outfp = outfp self.codec = codec return - - def write_image(self, image): - stream = image.stream - filters = stream.get_filters() - if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE: - ext = '.jpg' - data = stream.get_rawdata() - elif image.colorspace is LITERAL_DEVICE_RGB: - ext = '.bmp' - data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height) - elif image.colorspace is LITERAL_DEVICE_GRAY: - ext = '.bmp' - data = create_bmp(stream.get_data(), stream.bits, image.width, image.height) - else: - ext = '.img' - data = stream.get_data() - name = image.name+ext - path = os.path.join(self.outdir, name) - fp = file(path, 'wb') - fp.write(data) - fp.close() - return name ## TextConverter @@ -222,7 +198,7 @@ class HTMLConverter(PDFConverter): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, - pagemargin=50, outdir=None, + pagemargin=50, imagewriter=None, rect_colors={'curve':'black', 'page':'gray'}, text_colors={'char':'black'}): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) @@ -231,7 +207,7 @@ class HTMLConverter(PDFConverter): self.layoutmode = layoutmode self.showpageno = showpageno self.pagemargin = pagemargin - self.outdir = outdir + self.imagewriter = imagewriter self.rect_colors = rect_colors self.text_colors = text_colors if self.debug: @@ -278,8 +254,8 @@ class HTMLConverter(PDFConverter): return def place_image(self, item, borderwidth, x, y, w, h): - if self.outdir is not None: - name = self.write_image(item) + if self.imagewriter is not None: + name = self.imagewriter.export_image(item) self.write('\n' % (enc(name), borderwidth, @@ -400,9 +376,10 @@ class HTMLConverter(PDFConverter): ## class XMLConverter(PDFConverter): - def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): + def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, + laparams=None, imagewriter=None): PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) - self.outdir = outdir + self.imagewriter = imagewriter self.write_header() return @@ -479,8 +456,8 @@ class XMLConverter(PDFConverter): elif isinstance(item, LTText): self.outfp.write('%s\n' % item.get_text()) elif isinstance(item, LTImage): - if self.outdir: - name = self.write_image(item) + if self.imagewriter is not None: + name = self.imagewriter.export_image(item) self.outfp.write('\n' % (enc(name), item.width, item.height)) else: diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 72f6b23..60717a0 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -4,6 +4,7 @@ import zlib from lzw import lzwdecode from ascii85 import ascii85decode, asciihexdecode from runlength import rldecode +from ccitt import ccittfaxdecode from psparser import PSException, PSObject from psparser import LIT, KWD, STRICT from utils import apply_png_predictor @@ -206,6 +207,7 @@ class PDFStream(PDFObject): self.rawdata = None return for f in filters: + params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}) if f in LITERALS_FLATE_DECODE: # will get errors if the document is encrypted. try: @@ -223,15 +225,13 @@ class PDFStream(PDFObject): elif f in LITERALS_RUNLENGTH_DECODE: data = rldecode(data) elif f in LITERALS_CCITTFAX_DECODE: - #data = ccittfaxdecode(data) - raise PDFNotImplementedError('Unsupported filter: %r' % f) + data = ccittfaxdecode(data, params) elif f == LITERAL_CRYPT: # not yet.. raise PDFNotImplementedError('/Crypt filter is unsupported') else: raise PDFNotImplementedError('Unsupported filter: %r' % f) # apply predictors - params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {}) if 'Predictor' in params: pred = int_value(params['Predictor']) if pred == 1: diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 82fb44d..75614f2 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -306,13 +306,3 @@ class Plane(object): obj.y1 <= y0 or y1 <= obj.y0): continue yield obj return - - -# create_bmp -def create_bmp(data, bits, width, height): - info = struct.pack('