diff --git a/pdfminer/converter.py b/pdfminer/converter.py
index b4d8a76..c1d4acb 100644
--- a/pdfminer/converter.py
+++ b/pdfminer/converter.py
@@ -3,11 +3,12 @@ import sys, os.path
from pdfdevice import PDFDevice, PDFTextDevice
from pdffont import PDFUnicodeNotDefined
from pdftypes import LITERALS_DCT_DECODE
+from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
from layout import LayoutContainer
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
from utils import apply_matrix_pt, mult_matrix
-from utils import enc, bbox2str
+from utils import enc, bbox2str, create_bmp
## PDFPageAggregator
@@ -50,17 +51,9 @@ class PDFPageAggregator(PDFTextDevice):
def render_image(self, name, stream):
assert isinstance(self.cur_item, LTFigure)
- ismask = stream.get_any(('IM', 'ImageMask'))
- bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
- csp = stream.get_any(('CS', 'ColorSpace'))
- if not isinstance(csp, list):
- csp = [csp]
- item = LTImage(name, stream.get_any(('F', 'Filter')),
- (stream.get_any(('W', 'Width')),
- stream.get_any(('H', 'Height'))),
+ item = LTImage(name, stream,  # LTImage itself now reads size, bit depth and colorspace from the stream
(self.cur_item.x0, self.cur_item.y0,
- self.cur_item.x1, self.cur_item.y1),
- stream.get_rawdata())
+ self.cur_item.x1, self.cur_item.y1))
self.cur_item.add(item)
return
@@ -115,6 +108,29 @@ class PDFConverter(PDFPageAggregator):
self.outfp.write(enc(text, self.codec))
return
+    def write_image(self, image):
+        """Write the data of an LTImage to a file inside self.outdir.
+
+        A stream whose only filter is DCTDecode is saved verbatim as '.jpg'.
+        DeviceRGB and DeviceGray images are wrapped by create_bmp() and
+        saved as '.bmp'.  Anything else is dumped as decoded bytes with an
+        '.img' suffix.
+
+        Returns the name of the file that was written (image.name plus the
+        chosen extension), relative to self.outdir.
+        """
+        stream = image.stream
+        filters = stream.get_filters()
+        # Bit depth, colorspace and pixel size are parsed by LTImage and
+        # stored on the image object, not on the stream.  image.width and
+        # image.height are layout extents, so the pixel size comes from
+        # image.srcsize instead.
+        (width, height) = image.srcsize
+        # LTImage always stores the colorspace as a list; only its first
+        # entry names the colorspace family, so compare against that entry
+        # rather than against the list itself.
+        primary = None
+        if image.colorspace:
+            primary = image.colorspace[0]
+        if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
+            ext = '.jpg'
+            data = stream.get_rawdata()
+        elif primary is LITERAL_DEVICE_RGB:
+            ext = '.bmp'
+            data = create_bmp(stream.get_data(), image.bits*3, width, height)
+        elif primary is LITERAL_DEVICE_GRAY:
+            ext = '.bmp'
+            data = create_bmp(stream.get_data(), image.bits, width, height)
+        else:
+            ext = '.img'
+            data = stream.get_data()
+        name = image.name+ext
+        path = os.path.join(self.outdir, name)
+        fp = open(path, 'wb')
+        try:
+            fp.write(data)
+        finally:
+            # Close the handle even when the write fails.
+            fp.close()
+        return name
+
## TextConverter
##
@@ -180,23 +196,6 @@ class HTMLConverter(PDFConverter):
self.outfp.write('\n')
return
- def write_image(self, image):
- if image.type in LITERALS_DCT_DECODE:
- ext = '.jpg'
- else:
- return
- name = image.name+ext
- path = os.path.join(self.outdir, name)
- fp = file(path, 'wb')
- fp.write(image.data)
- fp.close()
- self.outfp.write('\n' %
- (enc(name),
- image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
- image.width*self.scale, image.height*self.scale))
- return
-
def end_page(self, page):
def render(item):
if isinstance(item, LTPage):
@@ -228,8 +227,14 @@ class HTMLConverter(PDFConverter):
for child in item:
render(child)
elif isinstance(item, LTImage):
+ name = ''
if self.outdir:
- self.write_image(item)
+ name = self.write_image(item)
+ self.outfp.write('\n' %
+ (enc(name),
+ item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
+ item.width*self.scale, item.height*self.scale))
return
page = PDFConverter.end_page(self, page)
render(page)
@@ -262,18 +267,6 @@ class XMLConverter(PDFConverter):
self.outfp.write('\n')
return
- def write_image(self, image):
- if image.type in LITERALS_DCT_DECODE:
- ext = '.jpg'
- else:
- return None
- name = image.name+ext
- path = os.path.join(self.outdir, name)
- fp = file(path, 'wb')
- fp.write(image.data)
- fp.close()
- return name
-
def end_page(self, page):
def render(item):
if isinstance(item, LTPage):
@@ -308,21 +301,22 @@ class XMLConverter(PDFConverter):
render(child)
self.outfp.write('\n')
elif isinstance(item, LTChar):
- self.outfp.write('' %
- (enc(item.font.fontname), item.is_vertical(),
+ vertical = ''
+ if item.is_vertical():
+ vertical = 'vertical="true" '
+ self.outfp.write('' %
+ (enc(item.font.fontname), vertical,
bbox2str(item.bbox), item.get_size()))
self.write(item.text)
self.outfp.write('\n')
elif isinstance(item, LTText):
self.outfp.write('%s\n' % item.text)
elif isinstance(item, LTImage):
- x = ''
+ name = ''
if self.outdir:
name = self.write_image(item)
- if name:
- x = 'name="%s" ' % enc(name)
- self.outfp.write('\n' %
- (x, item.type, item.width, item.height))
+ self.outfp.write('\n' %
+ (enc(name), item.width, item.height))
else:
assert 0, item
return
diff --git a/pdfminer/layout.py b/pdfminer/layout.py
index bed7152..943fae3 100644
--- a/pdfminer/layout.py
+++ b/pdfminer/layout.py
@@ -198,17 +198,22 @@ class LTRect(LTPolygon):
##
class LTImage(LayoutItem):
- def __init__(self, name, type, srcsize, bbox, data):
+ def __init__(self, name, stream, bbox):
+ # Keep the source stream and cache the image attributes read from it;
+ # each attribute is looked up under either its short or its long key.
LayoutItem.__init__(self, bbox)
self.name = name
- self.type = type
- self.srcsize = srcsize
- self.data = data
+ self.stream = stream
+ self.srcsize = (stream.get_any(('W', 'Width')),
+ stream.get_any(('H', 'Height')))
+ self.imagemask = stream.get_any(('IM', 'ImageMask'))
+ self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
+ self.colorspace = stream.get_any(('CS', 'ColorSpace'))
+ if not isinstance(self.colorspace, list):
+ # Normalise a single entry to a one-element list (the bare name
+ # `colorspace` is undefined in this scope and raised NameError).
+ self.colorspace = [self.colorspace]
return
def __repr__(self):
(w,h) = self.srcsize
- return '' % (self.name, self.type, w, h)
+ return '' % (self.name, w, h)
## LTText
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
index a62aa17..01ac3b3 100644
--- a/pdfminer/pdffont.py
+++ b/pdfminer/pdffont.py
@@ -457,8 +457,7 @@ class PDFType3Font(PDFSimpleFont):
if 'FontDescriptor' in spec:
descriptor = dict_value(spec['FontDescriptor'])
else:
- descriptor = {'FontName':spec.get('Name'),
- 'Ascent':0, 'Descent':0,
+ descriptor = {'Ascent':0, 'Descent':0,
'FontBBox':spec['FontBBox']}
PDFSimpleFont.__init__(self, descriptor, widths, spec)
self.matrix = tuple(list_value(spec.get('FontMatrix')))
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
index 74721a4..64714c4 100644
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -187,6 +187,12 @@ class PDFStream(PDFObject):
return self.attrs[name]
return default
+    def get_filters(self):
+        """Return the stream's filter entries as a list (empty if none)."""
+        found = self.get_any(('F', 'Filter'))
+        if not found:
+            return []
+        if not isinstance(found, list):
+            # A single filter entry is wrapped so callers can always iterate.
+            found = [ found ]
+        return found
+
def decomp(self,data):
buf = data
# some FlateDecode streams have garbage (newlines, etc) appended to the
@@ -206,13 +212,11 @@ class PDFStream(PDFObject):
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
- filters = self.get_any(('F', 'Filter'))
+ filters = self.get_filters()
if not filters:
self.data = data
self.rawdata = None
return
- if not isinstance(filters, list):
- filters = [ filters ]
for f in filters:
if f in LITERALS_FLATE_DECODE:
# will get errors if the document is encrypted.
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index 6347657..9ff18ab 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
-from struct import unpack
+from struct import pack, unpack
## Matrix operations
@@ -165,3 +165,12 @@ class ObjIdRange(object):
def get_nobjs(self):
return self.nobjs
+
+
+# create_bmp
+def create_bmp(data, bits, width, height):
+ info = pack('