added CCITTFaxFilter support and a very crude image extraction.
parent
2707ba75df
commit
dc8fde0e47
|
@ -329,6 +329,8 @@ class CCITTG4Parser(BitParser):
|
||||||
except self.ByteSkip:
|
except self.ByteSkip:
|
||||||
self._accept = self._parse_mode
|
self._accept = self._parse_mode
|
||||||
self._state = self.MODE
|
self._state = self.MODE
|
||||||
|
except self.EOFB:
|
||||||
|
break
|
||||||
return
|
return
|
||||||
|
|
||||||
def _parse_mode(self, mode):
|
def _parse_mode(self, mode):
|
||||||
|
@ -394,7 +396,7 @@ class CCITTG4Parser(BitParser):
|
||||||
|
|
||||||
def _get_bits(self):
|
def _get_bits(self):
|
||||||
return ''.join( str(b) for b in self._curline[:self._curpos] )
|
return ''.join( str(b) for b in self._curline[:self._curpos] )
|
||||||
|
|
||||||
def _get_refline(self, i):
|
def _get_refline(self, i):
|
||||||
if i < 0:
|
if i < 0:
|
||||||
return '[]'+''.join( str(b) for b in self._refline )
|
return '[]'+''.join( str(b) for b in self._refline )
|
||||||
|
@ -667,6 +669,43 @@ class TestCCITTG4Parser(unittest.TestCase):
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
## CCITTFaxDecoder
|
||||||
|
##
|
||||||
|
class CCITTFaxDecoder(CCITTG4Parser):
|
||||||
|
|
||||||
|
def __init__(self, width, bytealign=False, reversed=False):
|
||||||
|
CCITTG4Parser.__init__(self, width, bytealign=bytealign)
|
||||||
|
self.reversed = reversed
|
||||||
|
self._buf = ''
|
||||||
|
return
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
return self._buf
|
||||||
|
|
||||||
|
def output_line(self, y, bits):
|
||||||
|
bytes = array.array('B', [0]*((len(bits)+7)/8))
|
||||||
|
if self.reversed:
|
||||||
|
bits = [ not b for b in bits ]
|
||||||
|
for (i,b) in enumerate(bits):
|
||||||
|
if b:
|
||||||
|
bytes[i/8] += (128,64,32,16,8,4,2,1)[i%8]
|
||||||
|
self._buf += bytes.tostring()
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def ccittfaxdecode(data, params):
|
||||||
|
K = params.get('K')
|
||||||
|
cols = params.get('Columns')
|
||||||
|
bytealign = params.get('EncodedByteAlign')
|
||||||
|
reversed = params.get('BlackIs1')
|
||||||
|
if K == -1:
|
||||||
|
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
|
||||||
|
else:
|
||||||
|
raise ValueError(K)
|
||||||
|
parser.feedbytes(data)
|
||||||
|
return parser.close()
|
||||||
|
|
||||||
|
|
||||||
# test
|
# test
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import pygame
|
import pygame
|
||||||
|
@ -691,10 +730,7 @@ def main(argv):
|
||||||
fp = file(path,'rb')
|
fp = file(path,'rb')
|
||||||
(_,_,k,w,h,_) = path.split('.')
|
(_,_,k,w,h,_) = path.split('.')
|
||||||
parser = Parser(int(w))
|
parser = Parser(int(w))
|
||||||
try:
|
parser.feedbytes(fp.read())
|
||||||
parser.feedbytes(fp.read())
|
|
||||||
except parser.EOFB:
|
|
||||||
pass
|
|
||||||
parser.close()
|
parser.close()
|
||||||
fp.close()
|
fp.close()
|
||||||
return
|
return
|
||||||
|
|
|
@ -1,14 +1,12 @@
|
||||||
#!/usr/bin/env python2
|
#!/usr/bin/env python2
|
||||||
import sys, os.path
|
import sys
|
||||||
from pdfdevice import PDFDevice, PDFTextDevice
|
from pdfdevice import PDFDevice, PDFTextDevice
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from pdftypes import LITERALS_DCT_DECODE
|
|
||||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
|
||||||
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
|
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
|
||||||
from layout import LTFigure, LTImage, LTChar, LTTextLine
|
from layout import LTFigure, LTImage, LTChar, LTTextLine
|
||||||
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
|
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
|
||||||
from utils import apply_matrix_pt, mult_matrix
|
from utils import apply_matrix_pt, mult_matrix
|
||||||
from utils import enc, bbox2str, create_bmp
|
from utils import enc, bbox2str
|
||||||
|
|
||||||
|
|
||||||
## PDFLayoutAnalyzer
|
## PDFLayoutAnalyzer
|
||||||
|
@ -139,28 +137,6 @@ class PDFConverter(PDFLayoutAnalyzer):
|
||||||
self.outfp = outfp
|
self.outfp = outfp
|
||||||
self.codec = codec
|
self.codec = codec
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_image(self, image):
|
|
||||||
stream = image.stream
|
|
||||||
filters = stream.get_filters()
|
|
||||||
if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
|
|
||||||
ext = '.jpg'
|
|
||||||
data = stream.get_rawdata()
|
|
||||||
elif image.colorspace is LITERAL_DEVICE_RGB:
|
|
||||||
ext = '.bmp'
|
|
||||||
data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
|
|
||||||
elif image.colorspace is LITERAL_DEVICE_GRAY:
|
|
||||||
ext = '.bmp'
|
|
||||||
data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
|
|
||||||
else:
|
|
||||||
ext = '.img'
|
|
||||||
data = stream.get_data()
|
|
||||||
name = image.name+ext
|
|
||||||
path = os.path.join(self.outdir, name)
|
|
||||||
fp = file(path, 'wb')
|
|
||||||
fp.write(data)
|
|
||||||
fp.close()
|
|
||||||
return name
|
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## TextConverter
|
||||||
|
@ -222,7 +198,7 @@ class HTMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||||
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
|
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
|
||||||
pagemargin=50, outdir=None,
|
pagemargin=50, imagewriter=None,
|
||||||
rect_colors={'curve':'black', 'page':'gray'},
|
rect_colors={'curve':'black', 'page':'gray'},
|
||||||
text_colors={'char':'black'}):
|
text_colors={'char':'black'}):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
|
@ -231,7 +207,7 @@ class HTMLConverter(PDFConverter):
|
||||||
self.layoutmode = layoutmode
|
self.layoutmode = layoutmode
|
||||||
self.showpageno = showpageno
|
self.showpageno = showpageno
|
||||||
self.pagemargin = pagemargin
|
self.pagemargin = pagemargin
|
||||||
self.outdir = outdir
|
self.imagewriter = imagewriter
|
||||||
self.rect_colors = rect_colors
|
self.rect_colors = rect_colors
|
||||||
self.text_colors = text_colors
|
self.text_colors = text_colors
|
||||||
if self.debug:
|
if self.debug:
|
||||||
|
@ -278,8 +254,8 @@ class HTMLConverter(PDFConverter):
|
||||||
return
|
return
|
||||||
|
|
||||||
def place_image(self, item, borderwidth, x, y, w, h):
|
def place_image(self, item, borderwidth, x, y, w, h):
|
||||||
if self.outdir is not None:
|
if self.imagewriter is not None:
|
||||||
name = self.write_image(item)
|
name = self.imagewriter.export_image(item)
|
||||||
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
|
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||||
'width="%d" height="%d" />\n' %
|
'width="%d" height="%d" />\n' %
|
||||||
(enc(name), borderwidth,
|
(enc(name), borderwidth,
|
||||||
|
@ -400,9 +376,10 @@ class HTMLConverter(PDFConverter):
|
||||||
##
|
##
|
||||||
class XMLConverter(PDFConverter):
|
class XMLConverter(PDFConverter):
|
||||||
|
|
||||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||||
|
laparams=None, imagewriter=None):
|
||||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||||
self.outdir = outdir
|
self.imagewriter = imagewriter
|
||||||
self.write_header()
|
self.write_header()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -479,8 +456,8 @@ class XMLConverter(PDFConverter):
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.get_text())
|
self.outfp.write('<text>%s</text>\n' % item.get_text())
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
if self.outdir:
|
if self.imagewriter is not None:
|
||||||
name = self.write_image(item)
|
name = self.imagewriter.export_image(item)
|
||||||
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
|
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
|
||||||
(enc(name), item.width, item.height))
|
(enc(name), item.width, item.height))
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -4,6 +4,7 @@ import zlib
|
||||||
from lzw import lzwdecode
|
from lzw import lzwdecode
|
||||||
from ascii85 import ascii85decode, asciihexdecode
|
from ascii85 import ascii85decode, asciihexdecode
|
||||||
from runlength import rldecode
|
from runlength import rldecode
|
||||||
|
from ccitt import ccittfaxdecode
|
||||||
from psparser import PSException, PSObject
|
from psparser import PSException, PSObject
|
||||||
from psparser import LIT, KWD, STRICT
|
from psparser import LIT, KWD, STRICT
|
||||||
from utils import apply_png_predictor
|
from utils import apply_png_predictor
|
||||||
|
@ -206,6 +207,7 @@ class PDFStream(PDFObject):
|
||||||
self.rawdata = None
|
self.rawdata = None
|
||||||
return
|
return
|
||||||
for f in filters:
|
for f in filters:
|
||||||
|
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
||||||
if f in LITERALS_FLATE_DECODE:
|
if f in LITERALS_FLATE_DECODE:
|
||||||
# will get errors if the document is encrypted.
|
# will get errors if the document is encrypted.
|
||||||
try:
|
try:
|
||||||
|
@ -223,15 +225,13 @@ class PDFStream(PDFObject):
|
||||||
elif f in LITERALS_RUNLENGTH_DECODE:
|
elif f in LITERALS_RUNLENGTH_DECODE:
|
||||||
data = rldecode(data)
|
data = rldecode(data)
|
||||||
elif f in LITERALS_CCITTFAX_DECODE:
|
elif f in LITERALS_CCITTFAX_DECODE:
|
||||||
#data = ccittfaxdecode(data)
|
data = ccittfaxdecode(data, params)
|
||||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
|
||||||
elif f == LITERAL_CRYPT:
|
elif f == LITERAL_CRYPT:
|
||||||
# not yet..
|
# not yet..
|
||||||
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
||||||
else:
|
else:
|
||||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||||
# apply predictors
|
# apply predictors
|
||||||
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
|
||||||
if 'Predictor' in params:
|
if 'Predictor' in params:
|
||||||
pred = int_value(params['Predictor'])
|
pred = int_value(params['Predictor'])
|
||||||
if pred == 1:
|
if pred == 1:
|
||||||
|
|
|
@ -306,13 +306,3 @@ class Plane(object):
|
||||||
obj.y1 <= y0 or y1 <= obj.y0): continue
|
obj.y1 <= y0 or y1 <= obj.y0): continue
|
||||||
yield obj
|
yield obj
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
# create_bmp
|
|
||||||
def create_bmp(data, bits, width, height):
|
|
||||||
info = struct.pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
|
|
||||||
assert len(info) == 40, len(info)
|
|
||||||
header = struct.pack('<ccIHHI', 'B', 'M', 14+40+len(data), 0, 0, 14+40)
|
|
||||||
assert len(header) == 14, len(header)
|
|
||||||
# XXX re-rasterize every line
|
|
||||||
return header+info+data
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ from pdfminer.pdfdevice import PDFDevice, TagExtractor
|
||||||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
||||||
from pdfminer.cmapdb import CMapDB
|
from pdfminer.cmapdb import CMapDB
|
||||||
from pdfminer.layout import LAParams
|
from pdfminer.layout import LAParams
|
||||||
|
from pdfminer.image import ImageWriter
|
||||||
|
|
||||||
# main
|
# main
|
||||||
def main(argv):
|
def main(argv):
|
||||||
|
@ -29,7 +30,7 @@ def main(argv):
|
||||||
# output option
|
# output option
|
||||||
outfile = None
|
outfile = None
|
||||||
outtype = None
|
outtype = None
|
||||||
outdir = None
|
imagewriter = None
|
||||||
layoutmode = 'normal'
|
layoutmode = 'normal'
|
||||||
codec = 'utf-8'
|
codec = 'utf-8'
|
||||||
pageno = 1
|
pageno = 1
|
||||||
|
@ -52,7 +53,7 @@ def main(argv):
|
||||||
elif k == '-W': laparams.word_margin = float(v)
|
elif k == '-W': laparams.word_margin = float(v)
|
||||||
elif k == '-F': laparams.boxes_flow = float(v)
|
elif k == '-F': laparams.boxes_flow = float(v)
|
||||||
elif k == '-Y': layoutmode = v
|
elif k == '-Y': layoutmode = v
|
||||||
elif k == '-O': outdir = v
|
elif k == '-O': imagewriter = ImageWriter(v)
|
||||||
elif k == '-t': outtype = v
|
elif k == '-t': outtype = v
|
||||||
elif k == '-c': codec = v
|
elif k == '-c': codec = v
|
||||||
elif k == '-s': scale = float(v)
|
elif k == '-s': scale = float(v)
|
||||||
|
@ -81,10 +82,12 @@ def main(argv):
|
||||||
if outtype == 'text':
|
if outtype == 'text':
|
||||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
|
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
|
||||||
elif outtype == 'xml':
|
elif outtype == 'xml':
|
||||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||||
|
imagewriter=imagewriter)
|
||||||
elif outtype == 'html':
|
elif outtype == 'html':
|
||||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||||
layoutmode=layoutmode, laparams=laparams, outdir=outdir)
|
layoutmode=layoutmode, laparams=laparams,
|
||||||
|
imagewriter=imagewriter)
|
||||||
elif outtype == 'tag':
|
elif outtype == 'tag':
|
||||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue