Added CCITTFaxDecode filter support and a very crude image extraction.

pull/1/head
Yusuke Shinyama 2011-07-18 21:07:00 +10:00
parent 2707ba75df
commit dc8fde0e47
5 changed files with 62 additions and 56 deletions

View File

@ -329,6 +329,8 @@ class CCITTG4Parser(BitParser):
except self.ByteSkip:
self._accept = self._parse_mode
self._state = self.MODE
except self.EOFB:
break
return
def _parse_mode(self, mode):
@ -667,6 +669,43 @@ class TestCCITTG4Parser(unittest.TestCase):
return
## CCITTFaxDecoder
##
class CCITTFaxDecoder(CCITTG4Parser):
    """CCITTG4Parser subclass that packs emitted scanlines into bytes.

    Every line passed to output_line() is packed eight pixels per byte,
    most significant bit first, and stored; close() returns all stored
    lines concatenated in the order they were emitted.
    """

    def __init__(self, width, bytealign=False, reversed=False):
        """Initialise the decoder.

        width, bytealign: forwarded unchanged to CCITTG4Parser.__init__.
        reversed: when true, every pixel value is inverted before packing.
        """
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        # Packed lines are kept as separate chunks and joined once in
        # close(); appending to one growing string re-copies it per line.
        self._buf = []
        return

    def close(self):
        """Return every packed scanline collected so far as one string."""
        return ''.join(self._buf)

    def output_line(self, y, bits):
        """Pack one scanline and append it to the output.

        y: not used by this implementation (presumably the line index
           supplied by the parser -- confirm against CCITTG4Parser).
        bits: sequence of truthy/falsy pixel values for the line.
        """
        # Explicit floor division: one byte per 8 pixels, rounded up.
        packed = array.array('B', [0]*((len(bits)+7)//8))
        if self.reversed:
            bits = [ not b for b in bits ]
        for (i,b) in enumerate(bits):
            if b:
                # Pixel 0 of each byte maps to the most significant bit.
                packed[i//8] |= 0x80 >> (i%8)
        self._buf.append(packed.tostring())
        return
def ccittfaxdecode(data, params):
    """Decode CCITTFaxDecode-filtered data using the given parameters.

    data: the encoded stream contents.
    params: mapping of decode parameters; the keys 'K', 'Columns',
        'EncodedByteAlign' and 'BlackIs1' are read.  Keys that are absent
        fall back to the defaults the PDF specification gives for this
        filter (K=0, Columns=1728, EncodedByteAlign=false, BlackIs1=false)
        instead of None.

    Returns whatever the decoder's close() yields (the packed image data).
    Raises ValueError(K) for any K other than -1, since only that mode is
    handled here.
    """
    K = params.get('K', 0)
    cols = params.get('Columns', 1728)
    bytealign = params.get('EncodedByteAlign', False)
    black_is_1 = params.get('BlackIs1', False)
    if K != -1:
        # Only K == -1 is implemented; reject other modes up front.
        raise ValueError(K)
    parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=black_is_1)
    parser.feedbytes(data)
    return parser.close()
# test
def main(argv):
import pygame
@ -691,10 +730,7 @@ def main(argv):
fp = file(path,'rb')
(_,_,k,w,h,_) = path.split('.')
parser = Parser(int(w))
try:
parser.feedbytes(fp.read())
except parser.EOFB:
pass
parser.close()
fp.close()
return

View File

@ -1,14 +1,12 @@
#!/usr/bin/env python2
import sys, os.path
import sys
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
from layout import LTFigure, LTImage, LTChar, LTTextLine
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str, create_bmp
from utils import enc, bbox2str
## PDFLayoutAnalyzer
@ -140,28 +138,6 @@ class PDFConverter(PDFLayoutAnalyzer):
self.codec = codec
return
def write_image(self, image):
    """Write an image's data to a file in self.outdir; return the file name.

    The output format is chosen from the image's stream:
      * a single DCT filter       -> raw stream data saved as '.jpg'
      * DeviceRGB colorspace      -> decoded data wrapped by create_bmp
                                     with bits*3 per pixel, saved as '.bmp'
      * DeviceGray colorspace     -> decoded data wrapped by create_bmp
                                     with bits per pixel, saved as '.bmp'
      * anything else             -> decoded data dumped as '.img'

    The file name is image.name plus the chosen extension.
    """
    stream = image.stream
    filters = stream.get_filters()
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(self.outdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(data)
    finally:
        # Always release the handle, even when the write fails.
        fp.close()
    return name
## TextConverter
##
@ -222,7 +198,7 @@ class HTMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
pagemargin=50, outdir=None,
pagemargin=50, imagewriter=None,
rect_colors={'curve':'black', 'page':'gray'},
text_colors={'char':'black'}):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
@ -231,7 +207,7 @@ class HTMLConverter(PDFConverter):
self.layoutmode = layoutmode
self.showpageno = showpageno
self.pagemargin = pagemargin
self.outdir = outdir
self.imagewriter = imagewriter
self.rect_colors = rect_colors
self.text_colors = text_colors
if self.debug:
@ -278,8 +254,8 @@ class HTMLConverter(PDFConverter):
return
def place_image(self, item, borderwidth, x, y, w, h):
if self.outdir is not None:
name = self.write_image(item)
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' %
(enc(name), borderwidth,
@ -400,9 +376,10 @@ class HTMLConverter(PDFConverter):
##
class XMLConverter(PDFConverter):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
laparams=None, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
self.outdir = outdir
self.imagewriter = imagewriter
self.write_header()
return
@ -479,8 +456,8 @@ class XMLConverter(PDFConverter):
elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.get_text())
elif isinstance(item, LTImage):
if self.outdir:
name = self.write_image(item)
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name), item.width, item.height))
else:

View File

@ -4,6 +4,7 @@ import zlib
from lzw import lzwdecode
from ascii85 import ascii85decode, asciihexdecode
from runlength import rldecode
from ccitt import ccittfaxdecode
from psparser import PSException, PSObject
from psparser import LIT, KWD, STRICT
from utils import apply_png_predictor
@ -206,6 +207,7 @@ class PDFStream(PDFObject):
self.rawdata = None
return
for f in filters:
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
try:
@ -223,15 +225,13 @@ class PDFStream(PDFObject):
elif f in LITERALS_RUNLENGTH_DECODE:
data = rldecode(data)
elif f in LITERALS_CCITTFAX_DECODE:
#data = ccittfaxdecode(data)
raise PDFNotImplementedError('Unsupported filter: %r' % f)
data = ccittfaxdecode(data, params)
elif f == LITERAL_CRYPT:
# not yet..
raise PDFNotImplementedError('/Crypt filter is unsupported')
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred == 1:

View File

@ -306,13 +306,3 @@ class Plane(object):
obj.y1 <= y0 or y1 <= obj.y0): continue
yield obj
return
# create_bmp
def create_bmp(data, bits, width, height):
    """Prefix raw pixel data with BMP file and info headers.

    data: pixel data as a byte string; it is appended unchanged.
    bits: bits per pixel, stored in the info header.
    width, height: image dimensions, stored in the info header.

    Returns the 14-byte file header, the 40-byte info header and data
    concatenated.  The pixel rows are not re-rasterized (see the XXX note
    below), so the result is only as valid as the row layout of data.
    """
    # 40-byte info header: size, width, height, planes=1, bit count,
    # compression=0, image size, then four zeroed fields.
    info = struct.pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
    assert len(info) == 40, len(info)
    # The 'c' format needs byte strings; encoding the magic explicitly
    # keeps this working whether or not str is a byte string.
    header = struct.pack('<ccIHHI', 'B'.encode('ascii'), 'M'.encode('ascii'),
                         14+40+len(data), 0, 0, 14+40)
    assert len(header) == 14, len(header)
    # XXX re-rasterize every line
    return header+info+data

View File

@ -6,6 +6,7 @@ from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
from pdfminer.image import ImageWriter
# main
def main(argv):
@ -29,7 +30,7 @@ def main(argv):
# output option
outfile = None
outtype = None
outdir = None
imagewriter = None
layoutmode = 'normal'
codec = 'utf-8'
pageno = 1
@ -52,7 +53,7 @@ def main(argv):
elif k == '-W': laparams.word_margin = float(v)
elif k == '-F': laparams.boxes_flow = float(v)
elif k == '-Y': layoutmode = v
elif k == '-O': outdir = v
elif k == '-O': imagewriter = ImageWriter(v)
elif k == '-t': outtype = v
elif k == '-c': codec = v
elif k == '-s': scale = float(v)
@ -81,10 +82,12 @@ def main(argv):
if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
elif outtype == 'xml':
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'html':
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
layoutmode=layoutmode, laparams=laparams, outdir=outdir)
layoutmode=layoutmode, laparams=laparams,
imagewriter=imagewriter)
elif outtype == 'tag':
device = TagExtractor(rsrcmgr, outfp, codec=codec)
else: