Added CCITTFaxDecode filter support and a very crude image extraction facility.

pull/1/head
Yusuke Shinyama 2011-07-18 21:07:00 +10:00
parent 2707ba75df
commit dc8fde0e47
5 changed files with 62 additions and 56 deletions

View File

@ -329,6 +329,8 @@ class CCITTG4Parser(BitParser):
except self.ByteSkip: except self.ByteSkip:
self._accept = self._parse_mode self._accept = self._parse_mode
self._state = self.MODE self._state = self.MODE
except self.EOFB:
break
return return
def _parse_mode(self, mode): def _parse_mode(self, mode):
@ -394,7 +396,7 @@ class CCITTG4Parser(BitParser):
def _get_bits(self): def _get_bits(self):
return ''.join( str(b) for b in self._curline[:self._curpos] ) return ''.join( str(b) for b in self._curline[:self._curpos] )
def _get_refline(self, i): def _get_refline(self, i):
if i < 0: if i < 0:
return '[]'+''.join( str(b) for b in self._refline ) return '[]'+''.join( str(b) for b in self._refline )
@ -667,6 +669,43 @@ class TestCCITTG4Parser(unittest.TestCase):
return return
## CCITTFaxDecoder
##
class CCITTFaxDecoder(CCITTG4Parser):

    """CCITTG4Parser that collects the decoded scanlines as packed bytes.

    Every line handed to output_line() is packed eight pixels per byte,
    most significant bit first, and appended to an internal buffer that
    close() returns.
    """

    def __init__(self, width, bytealign=False, reversed=False):
        """Set up the decoder.

        width     -- forwarded to CCITTG4Parser as the line width.
        bytealign -- forwarded unchanged to CCITTG4Parser.
        reversed  -- when true, every bit is inverted before packing.
        """
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        self._buf = ''
        return

    def close(self):
        """Return everything accumulated by output_line() so far."""
        return self._buf

    def output_line(self, y, bits):
        """Pack one line of bits and append it to the buffer.

        y    -- line number (unused here).
        bits -- sequence of truthy/falsy pixel values for the line.

        NOTE(review): presumably invoked by the parent parser once per
        completed scanline -- confirm against CCITTG4Parser.
        """
        # Floor division keeps the size and the indices integral regardless
        # of the division semantics in effect.
        packed = array.array('B', [0]*((len(bits)+7)//8))
        if self.reversed:
            bits = [ not b for b in bits ]
        for (i,b) in enumerate(bits):
            if b:
                # leftmost pixel of each group of eight is the top bit
                packed[i//8] += (128,64,32,16,8,4,2,1)[i%8]
        self._buf += packed.tostring()
        return
def ccittfaxdecode(data, params):
    """Decode a CCITTFaxDecode-filtered stream and return the packed bitmap.

    data   -- the encoded stream contents.
    params -- the stream's decode-parameter dictionary; the entries read
              are 'K', 'Columns', 'EncodedByteAlign' and 'BlackIs1'.

    Entries missing from params take the defaults given by the PDF
    specification (K=0, Columns=1728, both flags false) instead of None.

    Raises ValueError(K) for any K other than -1, the only encoding
    handled here.
    """
    K = params.get('K', 0)
    cols = params.get('Columns', 1728)
    bytealign = params.get('EncodedByteAlign', False)
    blackis1 = params.get('BlackIs1', False)
    if K != -1:
        raise ValueError(K)
    parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=blackis1)
    parser.feedbytes(data)
    return parser.close()
# test # test
def main(argv): def main(argv):
import pygame import pygame
@ -691,10 +730,7 @@ def main(argv):
fp = file(path,'rb') fp = file(path,'rb')
(_,_,k,w,h,_) = path.split('.') (_,_,k,w,h,_) = path.split('.')
parser = Parser(int(w)) parser = Parser(int(w))
try: parser.feedbytes(fp.read())
parser.feedbytes(fp.read())
except parser.EOFB:
pass
parser.close() parser.close()
fp.close() fp.close()
return return

View File

@ -1,14 +1,12 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
import sys, os.path import sys
from pdfdevice import PDFDevice, PDFTextDevice from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
from layout import LTFigure, LTImage, LTChar, LTTextLine from layout import LTFigure, LTImage, LTChar, LTTextLine
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str, create_bmp from utils import enc, bbox2str
## PDFLayoutAnalyzer ## PDFLayoutAnalyzer
@ -139,28 +137,6 @@ class PDFConverter(PDFLayoutAnalyzer):
self.outfp = outfp self.outfp = outfp
self.codec = codec self.codec = codec
return return
def write_image(self, image):
    """Save an image's data under self.outdir and return the file name.

    The file is named image.name plus an extension chosen as follows:
      * a single DCTDecode filter  -> '.jpg', raw stream data as-is;
      * DeviceRGB colorspace       -> '.bmp' built by create_bmp with
                                      stream.bits*3 bits per pixel;
      * DeviceGray colorspace      -> '.bmp' built by create_bmp with
                                      stream.bits bits per pixel;
      * anything else              -> '.img', decoded stream data.
    """
    stream = image.stream
    filters = stream.get_filters()
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(self.outdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(data)
    finally:
        # release the handle even when the write fails
        fp.close()
    return name
## TextConverter ## TextConverter
@ -222,7 +198,7 @@ class HTMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True, scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
pagemargin=50, outdir=None, pagemargin=50, imagewriter=None,
rect_colors={'curve':'black', 'page':'gray'}, rect_colors={'curve':'black', 'page':'gray'},
text_colors={'char':'black'}): text_colors={'char':'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
@ -231,7 +207,7 @@ class HTMLConverter(PDFConverter):
self.layoutmode = layoutmode self.layoutmode = layoutmode
self.showpageno = showpageno self.showpageno = showpageno
self.pagemargin = pagemargin self.pagemargin = pagemargin
self.outdir = outdir self.imagewriter = imagewriter
self.rect_colors = rect_colors self.rect_colors = rect_colors
self.text_colors = text_colors self.text_colors = text_colors
if self.debug: if self.debug:
@ -278,8 +254,8 @@ class HTMLConverter(PDFConverter):
return return
def place_image(self, item, borderwidth, x, y, w, h): def place_image(self, item, borderwidth, x, y, w, h):
if self.outdir is not None: if self.imagewriter is not None:
name = self.write_image(item) name = self.imagewriter.export_image(item)
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" ' self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' % 'width="%d" height="%d" />\n' %
(enc(name), borderwidth, (enc(name), borderwidth,
@ -400,9 +376,10 @@ class HTMLConverter(PDFConverter):
## ##
class XMLConverter(PDFConverter): class XMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None): def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams) PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outdir = outdir self.imagewriter = imagewriter
self.write_header() self.write_header()
return return
@ -479,8 +456,8 @@ class XMLConverter(PDFConverter):
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.get_text()) self.outfp.write('<text>%s</text>\n' % item.get_text())
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
if self.outdir: if self.imagewriter is not None:
name = self.write_image(item) name = self.imagewriter.export_image(item)
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' % self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name), item.width, item.height)) (enc(name), item.width, item.height))
else: else:

View File

@ -4,6 +4,7 @@ import zlib
from lzw import lzwdecode from lzw import lzwdecode
from ascii85 import ascii85decode, asciihexdecode from ascii85 import ascii85decode, asciihexdecode
from runlength import rldecode from runlength import rldecode
from ccitt import ccittfaxdecode
from psparser import PSException, PSObject from psparser import PSException, PSObject
from psparser import LIT, KWD, STRICT from psparser import LIT, KWD, STRICT
from utils import apply_png_predictor from utils import apply_png_predictor
@ -206,6 +207,7 @@ class PDFStream(PDFObject):
self.rawdata = None self.rawdata = None
return return
for f in filters: for f in filters:
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
if f in LITERALS_FLATE_DECODE: if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
try: try:
@ -223,15 +225,13 @@ class PDFStream(PDFObject):
elif f in LITERALS_RUNLENGTH_DECODE: elif f in LITERALS_RUNLENGTH_DECODE:
data = rldecode(data) data = rldecode(data)
elif f in LITERALS_CCITTFAX_DECODE: elif f in LITERALS_CCITTFAX_DECODE:
#data = ccittfaxdecode(data) data = ccittfaxdecode(data, params)
raise PDFNotImplementedError('Unsupported filter: %r' % f)
elif f == LITERAL_CRYPT: elif f == LITERAL_CRYPT:
# not yet.. # not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported') raise PDFNotImplementedError('/Crypt filter is unsupported')
else: else:
raise PDFNotImplementedError('Unsupported filter: %r' % f) raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors # apply predictors
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
if 'Predictor' in params: if 'Predictor' in params:
pred = int_value(params['Predictor']) pred = int_value(params['Predictor'])
if pred == 1: if pred == 1:

View File

@ -306,13 +306,3 @@ class Plane(object):
obj.y1 <= y0 or y1 <= obj.y0): continue obj.y1 <= y0 or y1 <= obj.y0): continue
yield obj yield obj
return return
# create_bmp
def create_bmp(data, bits, width, height):
    """Wrap raw image samples in an uncompressed BMP file image.

    data   -- sample data as a byte string.
    bits   -- bits per pixel, written to the info header.
    width  -- image width in pixels.
    height -- image height in pixels.

    Returns the 14-byte file header, the 40-byte info header and the
    pixel data concatenated into one string.

    When len(data) equals height rows of (width*bits) bits, each rounded
    up to a whole byte, the rows are padded to a multiple of four bytes
    and emitted last row first, which is the layout BMP uses for a
    positive height.  Any other length is passed through untouched.

    NOTE(review): assumes the incoming rows are byte-aligned and ordered
    top to bottom -- confirm against the callers.  No colour table is
    written and the sample order is not changed.
    """
    rowsize = (width*bits+7)//8
    padded = (rowsize+3)//4*4
    if 0 < rowsize and 0 < height and len(data) == rowsize*height:
        # struct's 'x' code yields zero bytes of the native string type
        pad = struct.pack('%dx' % (padded-rowsize))
        rows = [ data[i:i+rowsize]+pad for i in range(0, len(data), rowsize) ]
        rows.reverse()
        # data[:0] is an empty string of the same type as data
        data = data[:0].join(rows)
    info = struct.pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
    assert len(info) == 40, len(info)
    # 0x4D42 packed little-endian is the two-character signature 'BM'
    header = struct.pack('<HIHHI', 0x4D42, 14+40+len(data), 0, 0, 14+40)
    assert len(header) == 14, len(header)
    return header+info+data

View File

@ -6,6 +6,7 @@ from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# main # main
def main(argv): def main(argv):
@ -29,7 +30,7 @@ def main(argv):
# output option # output option
outfile = None outfile = None
outtype = None outtype = None
outdir = None imagewriter = None
layoutmode = 'normal' layoutmode = 'normal'
codec = 'utf-8' codec = 'utf-8'
pageno = 1 pageno = 1
@ -52,7 +53,7 @@ def main(argv):
elif k == '-W': laparams.word_margin = float(v) elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v) elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v elif k == '-Y': layoutmode = v
elif k == '-O': outdir = v elif k == '-O': imagewriter = ImageWriter(v)
elif k == '-t': outtype = v elif k == '-t': outtype = v
elif k == '-c': codec = v elif k == '-c': codec = v
elif k == '-s': scale = float(v) elif k == '-s': scale = float(v)
@ -81,10 +82,12 @@ def main(argv):
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams) device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml': elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir) device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'html': elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, outdir=outdir) layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'tag': elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec) device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: else: