image handling addition (untested)

git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@202 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2010-04-10 11:05:02 +00:00
parent eafdc8830d
commit c81142aa44
5 changed files with 70 additions and 59 deletions

View File

@ -3,11 +3,12 @@ import sys, os.path
from pdfdevice import PDFDevice, PDFTextDevice from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE from pdftypes import LITERALS_DCT_DECODE
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from layout import LayoutContainer from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
from utils import apply_matrix_pt, mult_matrix from utils import apply_matrix_pt, mult_matrix
from utils import enc, bbox2str from utils import enc, bbox2str, create_bmp
## PDFPageAggregator ## PDFPageAggregator
@ -50,17 +51,9 @@ class PDFPageAggregator(PDFTextDevice):
def render_image(self, name, stream): def render_image(self, name, stream):
assert isinstance(self.cur_item, LTFigure) assert isinstance(self.cur_item, LTFigure)
ismask = stream.get_any(('IM', 'ImageMask')) item = LTImage(name, stream,
bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
csp = stream.get_any(('CS', 'ColorSpace'))
if not isinstance(csp, list):
csp = [csp]
item = LTImage(name, stream.get_any(('F', 'Filter')),
(stream.get_any(('W', 'Width')),
stream.get_any(('H', 'Height'))),
(self.cur_item.x0, self.cur_item.y0, (self.cur_item.x0, self.cur_item.y0,
self.cur_item.x1, self.cur_item.y1), self.cur_item.x1, self.cur_item.y1))
stream.get_rawdata())
self.cur_item.add(item) self.cur_item.add(item)
return return
@ -115,6 +108,29 @@ class PDFConverter(PDFPageAggregator):
self.outfp.write(enc(text, self.codec)) self.outfp.write(enc(text, self.codec))
return return
def write_image(self, image):
    """Save an image to a file under self.outdir and return its file name.

    image -- an LTImage; its .stream, .name, .bits, .colorspace and
             .srcsize attributes are read.

    A stream whose only filter is DCTDecode is written out unchanged as
    a .jpg file.  DeviceRGB and DeviceGray images are decoded and wrapped
    by create_bmp() into a .bmp file.  Anything else is dumped as decoded
    data with an .img extension.

    Returns the file name (image.name plus the chosen extension),
    relative to self.outdir.
    """
    stream = image.stream
    filters = stream.get_filters()
    # LTImage.__init__ keeps the bit depth and colour space on the image
    # object, and always stores the colour space wrapped in a list, so the
    # checks below read from the image and use membership tests.
    # NOTE(review): srcsize holds the stream's Width/Height entries; it is
    # assumed here that those are the pixel dimensions of the sample data,
    # which is what the BMP header has to describe -- confirm.
    (width, height) = image.srcsize
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif LITERAL_DEVICE_RGB in image.colorspace:
        ext = '.bmp'
        # Three components per pixel.
        data = create_bmp(stream.get_data(), image.bits*3, width, height)
    elif LITERAL_DEVICE_GRAY in image.colorspace:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), image.bits, width, height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(self.outdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(data)
    finally:
        # Release the handle even when the write fails.
        fp.close()
    return name
## TextConverter ## TextConverter
## ##
@ -180,23 +196,6 @@ class HTMLConverter(PDFConverter):
self.outfp.write('</span>\n') self.outfp.write('</span>\n')
return return
def write_image(self, image):
if image.type in LITERALS_DCT_DECODE:
ext = '.jpg'
else:
return
name = image.name+ext
path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' %
(enc(name),
image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
image.width*self.scale, image.height*self.scale))
return
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
@ -228,8 +227,14 @@ class HTMLConverter(PDFConverter):
for child in item: for child in item:
render(child) render(child)
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
name = ''
if self.outdir: if self.outdir:
self.write_image(item) name = self.write_image(item)
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
'width="%d" height="%d" />\n' %
(enc(name),
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
item.width*self.scale, item.height*self.scale))
return return
page = PDFConverter.end_page(self, page) page = PDFConverter.end_page(self, page)
render(page) render(page)
@ -262,18 +267,6 @@ class XMLConverter(PDFConverter):
self.outfp.write('<pages>\n') self.outfp.write('<pages>\n')
return return
def write_image(self, image):
if image.type in LITERALS_DCT_DECODE:
ext = '.jpg'
else:
return None
name = image.name+ext
path = os.path.join(self.outdir, name)
fp = file(path, 'wb')
fp.write(image.data)
fp.close()
return name
def end_page(self, page): def end_page(self, page):
def render(item): def render(item):
if isinstance(item, LTPage): if isinstance(item, LTPage):
@ -308,21 +301,22 @@ class XMLConverter(PDFConverter):
render(child) render(child)
self.outfp.write('</textbox>\n') self.outfp.write('</textbox>\n')
elif isinstance(item, LTChar): elif isinstance(item, LTChar):
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' % vertical = ''
(enc(item.font.fontname), item.is_vertical(), if item.is_vertical():
vertical = 'vertical="true" '
self.outfp.write('<text font="%s" %sbbox="%s" size="%.3f">' %
(enc(item.font.fontname), vertical,
bbox2str(item.bbox), item.get_size())) bbox2str(item.bbox), item.get_size()))
self.write(item.text) self.write(item.text)
self.outfp.write('</text>\n') self.outfp.write('</text>\n')
elif isinstance(item, LTText): elif isinstance(item, LTText):
self.outfp.write('<text>%s</text>\n' % item.text) self.outfp.write('<text>%s</text>\n' % item.text)
elif isinstance(item, LTImage): elif isinstance(item, LTImage):
x = '' name = ''
if self.outdir: if self.outdir:
name = self.write_image(item) name = self.write_image(item)
if name: self.outfp.write('<image name="%s" width="%d" height="%d" />\n' %
x = 'name="%s" ' % enc(name) (enc(name), item.width, item.height))
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' %
(x, item.type, item.width, item.height))
else: else:
assert 0, item assert 0, item
return return

View File

@ -198,17 +198,22 @@ class LTRect(LTPolygon):
## ##
class LTImage(LayoutItem): class LTImage(LayoutItem):
def __init__(self, name, type, srcsize, bbox, data): def __init__(self, name, stream, bbox):
LayoutItem.__init__(self, bbox) LayoutItem.__init__(self, bbox)
self.name = name self.name = name
self.type = type self.stream = stream
self.srcsize = srcsize self.srcsize = (stream.get_any(('W', 'Width')),
self.data = data stream.get_any(('H', 'Height')))
self.imagemask = stream.get_any(('IM', 'ImageMask'))
self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
self.colorspace = stream.get_any(('CS', 'ColorSpace'))
if not isinstance(self.colorspace, list):
self.colorspace = [colorspace]
return return
def __repr__(self): def __repr__(self):
(w,h) = self.srcsize (w,h) = self.srcsize
return '<image %s %s %dx%d>' % (self.name, self.type, w, h) return '<image %s %dx%d>' % (self.name, w, h)
## LTText ## LTText

View File

@ -457,8 +457,7 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec: if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
else: else:
descriptor = {'FontName':spec.get('Name'), descriptor = {'Ascent':0, 'Descent':0,
'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']} 'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec) PDFSimpleFont.__init__(self, descriptor, widths, spec)
self.matrix = tuple(list_value(spec.get('FontMatrix'))) self.matrix = tuple(list_value(spec.get('FontMatrix')))

View File

@ -187,6 +187,12 @@ class PDFStream(PDFObject):
return self.attrs[name] return self.attrs[name]
return default return default
def get_filters(self):
    """Return the value of the 'F'/'Filter' entry as a list.

    A missing or empty entry gives an empty list, a list value is
    returned as is, and any other value is wrapped in a one-element list.
    """
    found = self.get_any(('F', 'Filter'))
    if found:
        if isinstance(found, list):
            return found
        return [found]
    return []
def decomp(self,data): def decomp(self,data):
buf = data buf = data
# some FlateDecode streams have garbage (newlines, etc) appended to the # some FlateDecode streams have garbage (newlines, etc) appended to the
@ -206,13 +212,11 @@ class PDFStream(PDFObject):
if self.decipher: if self.decipher:
# Handle encryption # Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data)
filters = self.get_any(('F', 'Filter')) filters = self.get_filters()
if not filters: if not filters:
self.data = data self.data = data
self.rawdata = None self.rawdata = None
return return
if not isinstance(filters, list):
filters = [ filters ]
for f in filters: for f in filters:
if f in LITERALS_FLATE_DECODE: if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted. # will get errors if the document is encrypted.

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
from struct import unpack from struct import pack, unpack
## Matrix operations ## Matrix operations
@ -165,3 +165,12 @@ class ObjIdRange(object):
def get_nobjs(self): def get_nobjs(self):
return self.nobjs return self.nobjs
# create_bmp
def create_bmp(data, bits, width, height):
    """Wrap raw image samples in an uncompressed BMP container.

    data   -- byte string of packed samples, one row after another.
              Each row is assumed to occupy a whole number of bytes
              ((width*bits+7)//8) -- confirm against the caller.
    bits   -- bits per pixel, written to the header as given.
    width  -- image width in pixels.
    height -- image height in pixels.

    Returns the 14-byte file header, the 40-byte info header and the
    pixel data joined into a single byte string.
    """
    # BMP scanlines must start on 4-byte boundaries.  Re-rasterize the
    # rows with zero padding when the source row length is not already a
    # multiple of four; aligned input is passed through untouched.
    src_stride = (width * bits + 7) // 8
    dst_stride = (src_stride + 3) & ~3
    if src_stride != dst_stride:
        data = b''.join(
            data[off:off + src_stride].ljust(dst_stride, b'\x00')
            for off in range(0, src_stride * abs(height), src_stride))
    info = pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0, len(data), 0, 0, 0, 0)
    assert len(info) == 40, len(info)
    # Byte literals keep the 'c' format codes valid where str is not bytes.
    header = pack('<ccIHHI', b'B', b'M', 14+40+len(data), 0, 0, 14+40)
    # NOTE(review): no colour table is written and rows/channels are not
    # reordered; viewers may need both for a faithful picture -- verify.
    return header+info+data