diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py
index 2838d53..140d2fd 100755
--- a/pdfminer/ccitt.py
+++ b/pdfminer/ccitt.py
@@ -329,6 +329,8 @@ class CCITTG4Parser(BitParser):
except self.ByteSkip:
self._accept = self._parse_mode
self._state = self.MODE
+ except self.EOFB:
+ break
return
def _parse_mode(self, mode):
@@ -394,7 +396,7 @@ class CCITTG4Parser(BitParser):
def _get_bits(self):
return ''.join( str(b) for b in self._curline[:self._curpos] )
-
+
def _get_refline(self, i):
if i < 0:
return '[]'+''.join( str(b) for b in self._refline )
@@ -667,6 +669,43 @@ class TestCCITTG4Parser(unittest.TestCase):
return
+## CCITTFaxDecoder
+##
+class CCITTFaxDecoder(CCITTG4Parser):
+    """CCITTG4Parser subclass that packs every output line into bytes.
+
+    Each line passed to output_line() is packed eight bits per byte,
+    most significant bit first, and the packed rows are returned as
+    one string by close().
+    """
+
+    def __init__(self, width, bytealign=False, reversed=False):
+        """Set up the decoder.
+
+        width and bytealign are forwarded unchanged to CCITTG4Parser.
+        When reversed is true, every bit is inverted before packing.
+        """
+        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
+        self.reversed = reversed
+        # Packed rows are collected in a list and joined once in
+        # close(); growing a string with += would recopy the whole
+        # output for every line.
+        self._buf = []
+        return
+
+    def close(self):
+        """Return all rows packed so far, concatenated into one string."""
+        return ''.join(self._buf)
+
+    def output_line(self, y, bits):
+        """Pack one line of bits and append it to the output.
+
+        y is accepted but not used here. bits is a sequence of
+        truthy/falsy values; when its length is not a multiple of 8
+        the last byte is padded with zero bits.
+        """
+        # Floor division keeps the buffer size and the byte index
+        # integral even when true division is in effect.
+        packed = array.array('B', [0]*((len(bits)+7)//8))
+        if self.reversed:
+            bits = [ not b for b in bits ]
+        for (i,b) in enumerate(bits):
+            if b:
+                # Bit 0 of the line lands in the top bit of its byte.
+                packed[i//8] |= 0x80 >> (i%8)
+        self._buf.append(packed.tostring())
+        return
+
+
+def ccittfaxdecode(data, params):
+    """Decode CCITTFaxDecode stream data and return the packed bitmap.
+
+    data is the encoded stream and params is the decode-parameter
+    dictionary. Entries missing from params fall back to the defaults
+    given by the PDF specification: K=0, Columns=1728,
+    EncodedByteAlign=false, BlackIs1=false.
+
+    Only K == -1 is handled; any other value raises ValueError(K).
+    """
+    K = params.get('K', 0)
+    cols = params.get('Columns', 1728)
+    bytealign = params.get('EncodedByteAlign', False)
+    # Not named "reversed" so the builtin is not shadowed here.
+    blackis1 = params.get('BlackIs1', False)
+    if K != -1:
+        raise ValueError(K)
+    parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=blackis1)
+    parser.feedbytes(data)
+    return parser.close()
+
+
# test
def main(argv):
import pygame
@@ -691,10 +730,7 @@ def main(argv):
fp = file(path,'rb')
(_,_,k,w,h,_) = path.split('.')
parser = Parser(int(w))
- try:
- parser.feedbytes(fp.read())
- except parser.EOFB:
- pass
+ parser.feedbytes(fp.read())
parser.close()
fp.close()
return
diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index a3b6f4e..f5261da 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -1,14 +1,12 @@
#!/usr/bin/env python2
-import sys, os.path
+import sys
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
-from pdftypes import LITERALS_DCT_DECODE
-from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
from layout import LTFigure, LTImage, LTChar, LTTextLine
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
from utils import apply_matrix_pt, mult_matrix
-from utils import enc, bbox2str, create_bmp
+from utils import enc, bbox2str
## PDFLayoutAnalyzer
@@ -139,28 +137,6 @@ class PDFConverter(PDFLayoutAnalyzer):
self.outfp = outfp
self.codec = codec
return
-
- def write_image(self, image):
- stream = image.stream
- filters = stream.get_filters()
- if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
- ext = '.jpg'
- data = stream.get_rawdata()
- elif image.colorspace is LITERAL_DEVICE_RGB:
- ext = '.bmp'
- data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
- elif image.colorspace is LITERAL_DEVICE_GRAY:
- ext = '.bmp'
- data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
- else:
- ext = '.img'
- data = stream.get_data()
- name = image.name+ext
- path = os.path.join(self.outdir, name)
- fp = file(path, 'wb')
- fp.write(data)
- fp.close()
- return name
## TextConverter
@@ -222,7 +198,7 @@ class HTMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
- pagemargin=50, outdir=None,
+ pagemargin=50, imagewriter=None,
rect_colors={'curve':'black', 'page':'gray'},
text_colors={'char':'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
@@ -231,7 +207,7 @@ class HTMLConverter(PDFConverter):
self.layoutmode = layoutmode
self.showpageno = showpageno
self.pagemargin = pagemargin
- self.outdir = outdir
+ self.imagewriter = imagewriter
self.rect_colors = rect_colors
self.text_colors = text_colors
if self.debug:
@@ -278,8 +254,8 @@ class HTMLConverter(PDFConverter):
return
def place_image(self, item, borderwidth, x, y, w, h):
- if self.outdir is not None:
- name = self.write_image(item)
+ if self.imagewriter is not None:
+ name = self.imagewriter.export_image(item)
self.write('\n' %
(enc(name), borderwidth,
@@ -400,9 +376,10 @@ class HTMLConverter(PDFConverter):
##
class XMLConverter(PDFConverter):
- def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
+ def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
+ laparams=None, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
- self.outdir = outdir
+ self.imagewriter = imagewriter
self.write_header()
return
@@ -479,8 +456,8 @@ class XMLConverter(PDFConverter):
elif isinstance(item, LTText):
self.outfp.write('%s\n' % item.get_text())
elif isinstance(item, LTImage):
- if self.outdir:
- name = self.write_image(item)
+ if self.imagewriter is not None:
+ name = self.imagewriter.export_image(item)
self.outfp.write('\n' %
(enc(name), item.width, item.height))
else:
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
index 72f6b23..60717a0 100644
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -4,6 +4,7 @@ import zlib
from lzw import lzwdecode
from ascii85 import ascii85decode, asciihexdecode
from runlength import rldecode
+from ccitt import ccittfaxdecode
from psparser import PSException, PSObject
from psparser import LIT, KWD, STRICT
from utils import apply_png_predictor
@@ -206,6 +207,7 @@ class PDFStream(PDFObject):
self.rawdata = None
return
for f in filters:
+ params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
try:
@@ -223,15 +225,13 @@ class PDFStream(PDFObject):
elif f in LITERALS_RUNLENGTH_DECODE:
data = rldecode(data)
elif f in LITERALS_CCITTFAX_DECODE:
- #data = ccittfaxdecode(data)
- raise PDFNotImplementedError('Unsupported filter: %r' % f)
+ data = ccittfaxdecode(data, params)
elif f == LITERAL_CRYPT:
# not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
- params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred == 1:
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 82fb44d..75614f2 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -306,13 +306,3 @@ class Plane(object):
obj.y1 <= y0 or y1 <= obj.y0): continue
yield obj
return
-
-
-# create_bmp
-def create_bmp(data, bits, width, height):
- info = struct.pack('