From c81142aa44a73091246b290b944cd8fb6842b6eb Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 10 Apr 2010 11:05:02 +0000 Subject: [PATCH] image handling addition (untested) git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@202 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdfminer/converter.py | 90 ++++++++++++++++++++----------------------- pdfminer/layout.py | 15 +++++--- pdfminer/pdffont.py | 3 +- pdfminer/pdftypes.py | 10 +++-- pdfminer/utils.py | 11 +++++- 5 files changed, 70 insertions(+), 59 deletions(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index b4d8a76..c1d4acb 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -3,11 +3,12 @@ import sys, os.path from pdfdevice import PDFDevice, PDFTextDevice from pdffont import PDFUnicodeNotDefined from pdftypes import LITERALS_DCT_DECODE +from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB from layout import LayoutContainer from layout import LTPage, LTText, LTLine, LTRect, LTPolygon from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup from utils import apply_matrix_pt, mult_matrix -from utils import enc, bbox2str +from utils import enc, bbox2str, create_bmp ## PDFPageAggregator @@ -50,17 +51,9 @@ class PDFPageAggregator(PDFTextDevice): def render_image(self, name, stream): assert isinstance(self.cur_item, LTFigure) - ismask = stream.get_any(('IM', 'ImageMask')) - bits = stream.get_any(('BPC', 'BitsPerComponent'), 1) - csp = stream.get_any(('CS', 'ColorSpace')) - if not isinstance(csp, list): - csp = [csp] - item = LTImage(name, stream.get_any(('F', 'Filter')), - (stream.get_any(('W', 'Width')), - stream.get_any(('H', 'Height'))), + item = LTImage(name, stream, (self.cur_item.x0, self.cur_item.y0, - self.cur_item.x1, self.cur_item.y1), - stream.get_rawdata()) + self.cur_item.x1, self.cur_item.y1)) self.cur_item.add(item) return @@ -115,6 +108,29 @@ class PDFConverter(PDFPageAggregator): self.outfp.write(enc(text, 
self.codec)) return + def write_image(self, image): + stream = image.stream + filters = stream.get_filters() + if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE: + ext = '.jpg' + data = stream.get_rawdata() + elif stream.colorspace is LITERAL_DEVICE_RGB: + ext = '.bmp' + data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height) + elif stream.colorspace is LITERAL_DEVICE_GRAY: + ext = '.bmp' + data = create_bmp(stream.get_data(), stream.bits, image.width, image.height) + else: + ext = '.img' + data = stream.get_data() + name = image.name+ext + path = os.path.join(self.outdir, name) + fp = file(path, 'wb') + fp.write(data) + fp.close() + return name + return + ## TextConverter ## @@ -180,23 +196,6 @@ class HTMLConverter(PDFConverter): self.outfp.write('\n') return - def write_image(self, image): - if image.type in LITERALS_DCT_DECODE: - ext = '.jpg' - else: - return - name = image.name+ext - path = os.path.join(self.outdir, name) - fp = file(path, 'wb') - fp.write(image.data) - fp.close() - self.outfp.write('\n' % - (enc(name), - image.x0*self.scale, (self.yoffset-image.y1)*self.scale, - image.width*self.scale, image.height*self.scale)) - return - def end_page(self, page): def render(item): if isinstance(item, LTPage): @@ -228,8 +227,14 @@ class HTMLConverter(PDFConverter): for child in item: render(child) elif isinstance(item, LTImage): + name = '' if self.outdir: - self.write_image(item) + name = self.write_image(item) + self.outfp.write('\n' % + (enc(name), + item.x0*self.scale, (self.yoffset-item.y1)*self.scale, + item.width*self.scale, item.height*self.scale)) return page = PDFConverter.end_page(self, page) render(page) @@ -262,18 +267,6 @@ class XMLConverter(PDFConverter): self.outfp.write('\n') return - def write_image(self, image): - if image.type in LITERALS_DCT_DECODE: - ext = '.jpg' - else: - return None - name = image.name+ext - path = os.path.join(self.outdir, name) - fp = file(path, 'wb') - fp.write(image.data) - fp.close() 
- return name - def end_page(self, page): def render(item): if isinstance(item, LTPage): @@ -308,21 +301,22 @@ class XMLConverter(PDFConverter): render(child) self.outfp.write('\n') elif isinstance(item, LTChar): - self.outfp.write('' % - (enc(item.font.fontname), item.is_vertical(), + vertical = '' + if item.is_vertical(): + vertical = 'vertical="true" ' + self.outfp.write('' % + (enc(item.font.fontname), vertical, bbox2str(item.bbox), item.get_size())) self.write(item.text) self.outfp.write('\n') elif isinstance(item, LTText): self.outfp.write('%s\n' % item.text) elif isinstance(item, LTImage): - x = '' + name = '' if self.outdir: name = self.write_image(item) - if name: - x = 'name="%s" ' % enc(name) - self.outfp.write('\n' % - (x, item.type, item.width, item.height)) + self.outfp.write('\n' % + (enc(name), item.width, item.height)) else: assert 0, item return diff --git a/pdfminer/layout.py b/pdfminer/layout.py index bed7152..943fae3 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -198,17 +198,22 @@ class LTRect(LTPolygon): ## class LTImage(LayoutItem): - def __init__(self, name, type, srcsize, bbox, data): + def __init__(self, name, stream, bbox): LayoutItem.__init__(self, bbox) self.name = name - self.type = type - self.srcsize = srcsize - self.data = data + self.stream = stream + self.srcsize = (stream.get_any(('W', 'Width')), + stream.get_any(('H', 'Height'))) + self.imagemask = stream.get_any(('IM', 'ImageMask')) + self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1) + self.colorspace = stream.get_any(('CS', 'ColorSpace')) + if not isinstance(self.colorspace, list): + self.colorspace = [colorspace] return def __repr__(self): (w,h) = self.srcsize - return '' % (self.name, self.type, w, h) + return '' % (self.name, w, h) ## LTText diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index a62aa17..01ac3b3 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -457,8 +457,7 @@ class PDFType3Font(PDFSimpleFont): if 'FontDescriptor' 
in spec: descriptor = dict_value(spec['FontDescriptor']) else: - descriptor = {'FontName':spec.get('Name'), - 'Ascent':0, 'Descent':0, + descriptor = {'Ascent':0, 'Descent':0, 'FontBBox':spec['FontBBox']} PDFSimpleFont.__init__(self, descriptor, widths, spec) self.matrix = tuple(list_value(spec.get('FontMatrix'))) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 74721a4..64714c4 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -187,6 +187,12 @@ class PDFStream(PDFObject): return self.attrs[name] return default + def get_filters(self): + filters = self.get_any(('F', 'Filter')) + if not filters: return [] + if isinstance(filters, list): return filters + return [ filters ] + def decomp(self,data): buf = data # some FlateDecode streams have garbage (newlines, etc) appended to the @@ -206,13 +212,11 @@ class PDFStream(PDFObject): if self.decipher: # Handle encryption data = self.decipher(self.objid, self.genno, data) - filters = self.get_any(('F', 'Filter')) + filters = self.get_filters() if not filters: self.data = data self.rawdata = None return - if not isinstance(filters, list): - filters = [ filters ] for f in filters: if f in LITERALS_FLATE_DECODE: # will get errors if the document is encrypted. diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 6347657..9ff18ab 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from struct import unpack +from struct import pack, unpack ## Matrix operations @@ -165,3 +165,12 @@ class ObjIdRange(object): def get_nobjs(self): return self.nobjs + + +# create_bmp +def create_bmp(data, bits, width, height): + info = pack('