added CCITTFaxFilter support and a very crude image extraction.
parent
2707ba75df
commit
dc8fde0e47
|
@ -329,6 +329,8 @@ class CCITTG4Parser(BitParser):
|
|||
except self.ByteSkip:
|
||||
self._accept = self._parse_mode
|
||||
self._state = self.MODE
|
||||
except self.EOFB:
|
||||
break
|
||||
return
|
||||
|
||||
def _parse_mode(self, mode):
|
||||
|
@ -667,6 +669,43 @@ class TestCCITTG4Parser(unittest.TestCase):
|
|||
return
|
||||
|
||||
|
||||
## CCITTFaxDecoder
|
||||
##
|
||||
class CCITTFaxDecoder(CCITTG4Parser):
    """CCITTG4Parser subclass that collects decoded scanlines as packed bytes.

    Every line handed to output_line() is packed eight pixels per byte
    (most significant bit first) and appended to an internal buffer;
    close() returns everything accumulated so far.

    NOTE(review): output_line() is presumably invoked by the parent parser
    once per decoded scanline -- confirm against CCITTG4Parser.
    """

    # Mask for each of the eight pixel positions inside a byte, MSB first.
    _BIT_MASKS = (128, 64, 32, 16, 8, 4, 2, 1)

    def __init__(self, width, bytealign=False, reversed=False):
        """Set up the parser and an empty output buffer.

        width and bytealign are forwarded unchanged to CCITTG4Parser.
        reversed: when true, every pixel is inverted before being packed.
        """
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        # bytearray grows in place, so appending a line does not copy the
        # whole buffer the way repeated string concatenation does.
        self._buf = bytearray()
        return

    def close(self):
        """Return all packed scanlines collected so far as a byte string."""
        return bytes(self._buf)

    def output_line(self, y, bits):
        """Pack one scanline of pixels into bytes and append it to the buffer.

        y: line number; not used here.
        bits: sequence of truthy/falsy pixel values for one line.

        A line whose length is not a multiple of eight ends with a partial
        byte whose unused low-order bits stay zero.
        """
        # Floor division keeps the size and the index integral regardless of
        # whether "/" means true division in the running interpreter.
        packed = bytearray((len(bits) + 7) // 8)
        masks = self._BIT_MASKS
        invert = bool(self.reversed)
        for (i, bit) in enumerate(bits):
            # A pixel is set when its truth value differs from the invert
            # flag, i.e. it is flipped exactly when self.reversed is true.
            if bool(bit) != invert:
                packed[i // 8] |= masks[i % 8]
        self._buf.extend(packed)
        return
|
||||
|
||||
|
||||
def ccittfaxdecode(data, params):
    """Decode CCITT-fax-encoded data using the given decode parameters.

    data: the encoded byte string.
    params: mapping of decode parameters; the keys read are 'K',
        'Columns', 'EncodedByteAlign' and 'BlackIs1'.

    Returns whatever CCITTFaxDecoder.close() returns after the data has
    been fed to the parser.

    Raises ValueError(K) when K is anything other than -1; only that
    mode is handled here.
    """
    K = params.get('K')
    if K != -1:
        # Reject unsupported modes before touching any other parameter.
        raise ValueError(K)
    cols = params.get('Columns')
    if cols is None:
        # The PDF specification gives 1728 as the default for /Columns.
        cols = 1728
    bytealign = params.get('EncodedByteAlign')
    black_is_1 = params.get('BlackIs1')
    parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=black_is_1)
    parser.feedbytes(data)
    return parser.close()
|
||||
|
||||
|
||||
# test
|
||||
def main(argv):
|
||||
import pygame
|
||||
|
@ -691,10 +730,7 @@ def main(argv):
|
|||
fp = file(path,'rb')
|
||||
(_,_,k,w,h,_) = path.split('.')
|
||||
parser = Parser(int(w))
|
||||
try:
|
||||
parser.feedbytes(fp.read())
|
||||
except parser.EOFB:
|
||||
pass
|
||||
parser.feedbytes(fp.read())
|
||||
parser.close()
|
||||
fp.close()
|
||||
return
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
#!/usr/bin/env python2
|
||||
import sys, os.path
|
||||
import sys
|
||||
from pdfdevice import PDFDevice, PDFTextDevice
|
||||
from pdffont import PDFUnicodeNotDefined
|
||||
from pdftypes import LITERALS_DCT_DECODE
|
||||
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||
from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
|
||||
from layout import LTFigure, LTImage, LTChar, LTTextLine
|
||||
from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
|
||||
from utils import apply_matrix_pt, mult_matrix
|
||||
from utils import enc, bbox2str, create_bmp
|
||||
from utils import enc, bbox2str
|
||||
|
||||
|
||||
## PDFLayoutAnalyzer
|
||||
|
@ -140,28 +138,6 @@ class PDFConverter(PDFLayoutAnalyzer):
|
|||
self.codec = codec
|
||||
return
|
||||
|
||||
def write_image(self, image):
    """Write an image's data to a file in self.outdir and return the file name.

    image: object providing .stream, .colorspace, .name, .width and .height.

    The file extension and payload depend on the stream:
      * exactly one filter and it is in LITERALS_DCT_DECODE: the raw
        (still encoded) stream data is written with a '.jpg' extension;
      * colorspace is LITERAL_DEVICE_RGB or LITERAL_DEVICE_GRAY: the decoded
        data is wrapped by create_bmp() and written with a '.bmp' extension;
      * anything else: the decoded data is written as-is with '.img'.

    Returns the file name (image.name plus extension), not the full path.
    """
    stream = image.stream
    filters = stream.get_filters()
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif image.colorspace is LITERAL_DEVICE_RGB:
        ext = '.bmp'
        # Three components per pixel, hence three times the stream's bits.
        data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
    elif image.colorspace is LITERAL_DEVICE_GRAY:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(self.outdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(data)
    finally:
        # Always release the handle, even when the write fails.
        fp.close()
    return name
|
||||
|
||||
|
||||
## TextConverter
|
||||
##
|
||||
|
@ -222,7 +198,7 @@ class HTMLConverter(PDFConverter):
|
|||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
|
||||
scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
|
||||
pagemargin=50, outdir=None,
|
||||
pagemargin=50, imagewriter=None,
|
||||
rect_colors={'curve':'black', 'page':'gray'},
|
||||
text_colors={'char':'black'}):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
|
@ -231,7 +207,7 @@ class HTMLConverter(PDFConverter):
|
|||
self.layoutmode = layoutmode
|
||||
self.showpageno = showpageno
|
||||
self.pagemargin = pagemargin
|
||||
self.outdir = outdir
|
||||
self.imagewriter = imagewriter
|
||||
self.rect_colors = rect_colors
|
||||
self.text_colors = text_colors
|
||||
if self.debug:
|
||||
|
@ -278,8 +254,8 @@ class HTMLConverter(PDFConverter):
|
|||
return
|
||||
|
||||
def place_image(self, item, borderwidth, x, y, w, h):
|
||||
if self.outdir is not None:
|
||||
name = self.write_image(item)
|
||||
if self.imagewriter is not None:
|
||||
name = self.imagewriter.export_image(item)
|
||||
self.write('<img src="%s" border="%d" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||
'width="%d" height="%d" />\n' %
|
||||
(enc(name), borderwidth,
|
||||
|
@ -400,9 +376,10 @@ class HTMLConverter(PDFConverter):
|
|||
##
|
||||
class XMLConverter(PDFConverter):
|
||||
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, outdir=None):
|
||||
def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1,
|
||||
laparams=None, imagewriter=None):
|
||||
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
|
||||
self.outdir = outdir
|
||||
self.imagewriter = imagewriter
|
||||
self.write_header()
|
||||
return
|
||||
|
||||
|
@ -479,8 +456,8 @@ class XMLConverter(PDFConverter):
|
|||
elif isinstance(item, LTText):
|
||||
self.outfp.write('<text>%s</text>\n' % item.get_text())
|
||||
elif isinstance(item, LTImage):
|
||||
if self.outdir:
|
||||
name = self.write_image(item)
|
||||
if self.imagewriter is not None:
|
||||
name = self.imagewriter.export_image(item)
|
||||
self.outfp.write('<image src="%s" width="%d" height="%d" />\n' %
|
||||
(enc(name), item.width, item.height))
|
||||
else:
|
||||
|
|
|
@ -4,6 +4,7 @@ import zlib
|
|||
from lzw import lzwdecode
|
||||
from ascii85 import ascii85decode, asciihexdecode
|
||||
from runlength import rldecode
|
||||
from ccitt import ccittfaxdecode
|
||||
from psparser import PSException, PSObject
|
||||
from psparser import LIT, KWD, STRICT
|
||||
from utils import apply_png_predictor
|
||||
|
@ -206,6 +207,7 @@ class PDFStream(PDFObject):
|
|||
self.rawdata = None
|
||||
return
|
||||
for f in filters:
|
||||
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
||||
if f in LITERALS_FLATE_DECODE:
|
||||
# will get errors if the document is encrypted.
|
||||
try:
|
||||
|
@ -223,15 +225,13 @@ class PDFStream(PDFObject):
|
|||
elif f in LITERALS_RUNLENGTH_DECODE:
|
||||
data = rldecode(data)
|
||||
elif f in LITERALS_CCITTFAX_DECODE:
|
||||
#data = ccittfaxdecode(data)
|
||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||
data = ccittfaxdecode(data, params)
|
||||
elif f == LITERAL_CRYPT:
|
||||
# not yet..
|
||||
raise PDFNotImplementedError('/Crypt filter is unsupported')
|
||||
else:
|
||||
raise PDFNotImplementedError('Unsupported filter: %r' % f)
|
||||
# apply predictors
|
||||
params = self.get_any(('DP', 'DecodeParms', 'FDecodeParms'), {})
|
||||
if 'Predictor' in params:
|
||||
pred = int_value(params['Predictor'])
|
||||
if pred == 1:
|
||||
|
|
|
@ -306,13 +306,3 @@ class Plane(object):
|
|||
obj.y1 <= y0 or y1 <= obj.y0): continue
|
||||
yield obj
|
||||
return
|
||||
|
||||
|
||||
# create_bmp
|
||||
def create_bmp(data, bits, width, height):
    """Prepend BMP file and info headers to raw pixel data.

    data: pixel data as a byte string; appended unchanged after the headers.
    bits: bits per pixel written into the info header.
    width, height: image dimensions written into the info header.

    Returns the 14-byte file header + 40-byte info header + data.

    XXX The pixel rows are not re-rasterized: data is copied verbatim.
    """
    # 40-byte info header: header size, width, height, planes=1, bits per
    # pixel, compression=0, image data size, and four zeroed trailing fields.
    info = struct.pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
    assert len(info) == 40, len(info)
    # 14-byte file header: 'BM' signature, total file size, two reserved
    # fields, and the offset of the pixel data.  The signature is given as
    # byte literals because the 'c' format needs one-byte byte strings.
    header = struct.pack('<ccIHHI', b'B', b'M', 14+40+len(data), 0, 0, 14+40)
    assert len(header) == 14, len(header)
    return header+info+data
|
||||
|
|
|
@ -6,6 +6,7 @@ from pdfminer.pdfdevice import PDFDevice, TagExtractor
|
|||
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
|
||||
from pdfminer.cmapdb import CMapDB
|
||||
from pdfminer.layout import LAParams
|
||||
from pdfminer.image import ImageWriter
|
||||
|
||||
# main
|
||||
def main(argv):
|
||||
|
@ -29,7 +30,7 @@ def main(argv):
|
|||
# output option
|
||||
outfile = None
|
||||
outtype = None
|
||||
outdir = None
|
||||
imagewriter = None
|
||||
layoutmode = 'normal'
|
||||
codec = 'utf-8'
|
||||
pageno = 1
|
||||
|
@ -52,7 +53,7 @@ def main(argv):
|
|||
elif k == '-W': laparams.word_margin = float(v)
|
||||
elif k == '-F': laparams.boxes_flow = float(v)
|
||||
elif k == '-Y': layoutmode = v
|
||||
elif k == '-O': outdir = v
|
||||
elif k == '-O': imagewriter = ImageWriter(v)
|
||||
elif k == '-t': outtype = v
|
||||
elif k == '-c': codec = v
|
||||
elif k == '-s': scale = float(v)
|
||||
|
@ -81,10 +82,12 @@ def main(argv):
|
|||
if outtype == 'text':
|
||||
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
|
||||
elif outtype == 'xml':
|
||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
|
||||
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
|
||||
imagewriter=imagewriter)
|
||||
elif outtype == 'html':
|
||||
device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
|
||||
layoutmode=layoutmode, laparams=laparams, outdir=outdir)
|
||||
layoutmode=layoutmode, laparams=laparams,
|
||||
imagewriter=imagewriter)
|
||||
elif outtype == 'tag':
|
||||
device = TagExtractor(rsrcmgr, outfp, codec=codec)
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue