image handling addition (untested)
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@202 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
eafdc8830d
commit
c81142aa44
|
@ -3,11 +3,12 @@ import sys, os.path
|
||||||
from pdfdevice import PDFDevice, PDFTextDevice
|
from pdfdevice import PDFDevice, PDFTextDevice
|
||||||
from pdffont import PDFUnicodeNotDefined
|
from pdffont import PDFUnicodeNotDefined
|
||||||
from pdftypes import LITERALS_DCT_DECODE
|
from pdftypes import LITERALS_DCT_DECODE
|
||||||
|
from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
|
||||||
from layout import LayoutContainer
|
from layout import LayoutContainer
|
||||||
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
from layout import LTPage, LTText, LTLine, LTRect, LTPolygon
|
||||||
from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
|
from layout import LTFigure, LTImage, LTChar, LTTextLine, LTTextBox, LTTextGroup
|
||||||
from utils import apply_matrix_pt, mult_matrix
|
from utils import apply_matrix_pt, mult_matrix
|
||||||
from utils import enc, bbox2str
|
from utils import enc, bbox2str, create_bmp
|
||||||
|
|
||||||
|
|
||||||
## PDFPageAggregator
|
## PDFPageAggregator
|
||||||
|
@ -50,17 +51,9 @@ class PDFPageAggregator(PDFTextDevice):
|
||||||
|
|
||||||
def render_image(self, name, stream):
    """Attach an image to the figure that is currently being built.

    name   -- name passed through to the LTImage.
    stream -- stream object passed through to the LTImage.

    The image is given the bounding box (x0, y0, x1, y1) of the current
    item, which must be an LTFigure.
    """
    figure = self.cur_item
    assert isinstance(figure, LTFigure)
    bbox = (figure.x0, figure.y0, figure.x1, figure.y1)
    figure.add(LTImage(name, stream, bbox))
    return
|
||||||
|
|
||||||
|
@ -115,6 +108,29 @@ class PDFConverter(PDFPageAggregator):
|
||||||
self.outfp.write(enc(text, self.codec))
|
self.outfp.write(enc(text, self.codec))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def write_image(self, image):
    """Write the data of an LTImage into self.outdir.

    image -- an LTImage; its stream, name, bits, colorspace and srcsize
             attributes are read.

    The output format is chosen as follows:
      * a single DCTDecode filter: the raw (still encoded) stream data is
        written unchanged as '.jpg';
      * DeviceRGB / DeviceGray colour space: the decoded data is wrapped
        by create_bmp() and written as '.bmp';
      * anything else: the decoded data is dumped as '.img'.

    Returns the file name (relative to self.outdir) that was written.
    """
    stream = image.stream
    filters = stream.get_filters()
    # The colour space and bit depth are stored on the LTImage, not on the
    # stream. LTImage keeps the colour space as a list, so test membership
    # instead of identity; a bare literal is tolerated as well.
    colorspace = image.colorspace
    if not isinstance(colorspace, list):
        colorspace = [colorspace]
    # The BMP header needs the pixel dimensions taken from the stream
    # dictionary (srcsize).
    # NOTE(review): image.width/height appear to be layout dimensions
    # (they are scaled for HTML output) -- confirm against LayoutItem.
    (width, height) = image.srcsize
    if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
        ext = '.jpg'
        data = stream.get_rawdata()
    elif LITERAL_DEVICE_RGB in colorspace:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), image.bits*3, width, height)
    elif LITERAL_DEVICE_GRAY in colorspace:
        ext = '.bmp'
        data = create_bmp(stream.get_data(), image.bits, width, height)
    else:
        ext = '.img'
        data = stream.get_data()
    name = image.name+ext
    path = os.path.join(self.outdir, name)
    fp = open(path, 'wb')
    try:
        fp.write(data)
    finally:
        # Close the file even when the write fails.
        fp.close()
    return name
|
||||||
|
|
||||||
|
|
||||||
## TextConverter
|
## TextConverter
|
||||||
##
|
##
|
||||||
|
@ -180,23 +196,6 @@ class HTMLConverter(PDFConverter):
|
||||||
self.outfp.write('</span>\n')
|
self.outfp.write('</span>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_image(self, image):
|
|
||||||
if image.type in LITERALS_DCT_DECODE:
|
|
||||||
ext = '.jpg'
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
name = image.name+ext
|
|
||||||
path = os.path.join(self.outdir, name)
|
|
||||||
fp = file(path, 'wb')
|
|
||||||
fp.write(image.data)
|
|
||||||
fp.close()
|
|
||||||
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
|
||||||
'width="%d" height="%d" />\n' %
|
|
||||||
(enc(name),
|
|
||||||
image.x0*self.scale, (self.yoffset-image.y1)*self.scale,
|
|
||||||
image.width*self.scale, image.height*self.scale))
|
|
||||||
return
|
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
|
@ -228,8 +227,14 @@ class HTMLConverter(PDFConverter):
|
||||||
for child in item:
|
for child in item:
|
||||||
render(child)
|
render(child)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
|
name = ''
|
||||||
if self.outdir:
|
if self.outdir:
|
||||||
self.write_image(item)
|
name = self.write_image(item)
|
||||||
|
self.outfp.write('<img src="%s" style="position:absolute; left:%dpx; top:%dpx;" '
|
||||||
|
'width="%d" height="%d" />\n' %
|
||||||
|
(enc(name),
|
||||||
|
item.x0*self.scale, (self.yoffset-item.y1)*self.scale,
|
||||||
|
item.width*self.scale, item.height*self.scale))
|
||||||
return
|
return
|
||||||
page = PDFConverter.end_page(self, page)
|
page = PDFConverter.end_page(self, page)
|
||||||
render(page)
|
render(page)
|
||||||
|
@ -262,18 +267,6 @@ class XMLConverter(PDFConverter):
|
||||||
self.outfp.write('<pages>\n')
|
self.outfp.write('<pages>\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def write_image(self, image):
|
|
||||||
if image.type in LITERALS_DCT_DECODE:
|
|
||||||
ext = '.jpg'
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
name = image.name+ext
|
|
||||||
path = os.path.join(self.outdir, name)
|
|
||||||
fp = file(path, 'wb')
|
|
||||||
fp.write(image.data)
|
|
||||||
fp.close()
|
|
||||||
return name
|
|
||||||
|
|
||||||
def end_page(self, page):
|
def end_page(self, page):
|
||||||
def render(item):
|
def render(item):
|
||||||
if isinstance(item, LTPage):
|
if isinstance(item, LTPage):
|
||||||
|
@ -308,21 +301,22 @@ class XMLConverter(PDFConverter):
|
||||||
render(child)
|
render(child)
|
||||||
self.outfp.write('</textbox>\n')
|
self.outfp.write('</textbox>\n')
|
||||||
elif isinstance(item, LTChar):
|
elif isinstance(item, LTChar):
|
||||||
self.outfp.write('<text font="%s" vertical="%s" bbox="%s" size="%.3f">' %
|
vertical = ''
|
||||||
(enc(item.font.fontname), item.is_vertical(),
|
if item.is_vertical():
|
||||||
|
vertical = 'vertical="true" '
|
||||||
|
self.outfp.write('<text font="%s" %sbbox="%s" size="%.3f">' %
|
||||||
|
(enc(item.font.fontname), vertical,
|
||||||
bbox2str(item.bbox), item.get_size()))
|
bbox2str(item.bbox), item.get_size()))
|
||||||
self.write(item.text)
|
self.write(item.text)
|
||||||
self.outfp.write('</text>\n')
|
self.outfp.write('</text>\n')
|
||||||
elif isinstance(item, LTText):
|
elif isinstance(item, LTText):
|
||||||
self.outfp.write('<text>%s</text>\n' % item.text)
|
self.outfp.write('<text>%s</text>\n' % item.text)
|
||||||
elif isinstance(item, LTImage):
|
elif isinstance(item, LTImage):
|
||||||
x = ''
|
name = ''
|
||||||
if self.outdir:
|
if self.outdir:
|
||||||
name = self.write_image(item)
|
name = self.write_image(item)
|
||||||
if name:
|
self.outfp.write('<image name="%s" width="%d" height="%d" />\n' %
|
||||||
x = 'name="%s" ' % enc(name)
|
(enc(name), item.width, item.height))
|
||||||
self.outfp.write('<image %stype="%s" width="%d" height="%d" />\n' %
|
|
||||||
(x, item.type, item.width, item.height))
|
|
||||||
else:
|
else:
|
||||||
assert 0, item
|
assert 0, item
|
||||||
return
|
return
|
||||||
|
|
|
@ -198,17 +198,22 @@ class LTRect(LTPolygon):
|
||||||
##
|
##
|
||||||
class LTImage(LayoutItem):
    """Layout item for an image backed by a stream object.

    Attributes set by the constructor:
      name       -- the name given by the caller.
      stream     -- the stream object the image was built from.
      srcsize    -- (W/Width, H/Height) as found in the stream attributes.
      imagemask  -- value of IM/ImageMask (None when absent).
      bits       -- value of BPC/BitsPerComponent (1 when absent).
      colorspace -- value of CS/ColorSpace, always wrapped in a list.
    """

    def __init__(self, name, stream, bbox):
        LayoutItem.__init__(self, bbox)
        self.name = name
        self.stream = stream
        self.srcsize = (stream.get_any(('W', 'Width')),
                        stream.get_any(('H', 'Height')))
        self.imagemask = stream.get_any(('IM', 'ImageMask'))
        self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
        self.colorspace = stream.get_any(('CS', 'ColorSpace'))
        if not isinstance(self.colorspace, list):
            # Normalise a single value to a one-element list. This must
            # read the attribute: there is no local named 'colorspace'.
            self.colorspace = [self.colorspace]
        return

    def __repr__(self):
        (w,h) = self.srcsize
        return '<image %s %dx%d>' % (self.name, w, h)
|
||||||
|
|
||||||
|
|
||||||
## LTText
|
## LTText
|
||||||
|
|
|
@ -457,8 +457,7 @@ class PDFType3Font(PDFSimpleFont):
|
||||||
if 'FontDescriptor' in spec:
|
if 'FontDescriptor' in spec:
|
||||||
descriptor = dict_value(spec['FontDescriptor'])
|
descriptor = dict_value(spec['FontDescriptor'])
|
||||||
else:
|
else:
|
||||||
descriptor = {'FontName':spec.get('Name'),
|
descriptor = {'Ascent':0, 'Descent':0,
|
||||||
'Ascent':0, 'Descent':0,
|
|
||||||
'FontBBox':spec['FontBBox']}
|
'FontBBox':spec['FontBBox']}
|
||||||
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
||||||
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
self.matrix = tuple(list_value(spec.get('FontMatrix')))
|
||||||
|
|
|
@ -187,6 +187,12 @@ class PDFStream(PDFObject):
|
||||||
return self.attrs[name]
|
return self.attrs[name]
|
||||||
return default
|
return default
|
||||||
|
|
||||||
|
def get_filters(self):
    """Return the F/Filter entry of this stream as a list.

    A missing or empty entry gives a new empty list, a list entry is
    returned as-is, and any other value is wrapped in a one-element list.
    """
    entry = self.get_any(('F', 'Filter'))
    if not entry:
        result = []
    elif isinstance(entry, list):
        result = entry
    else:
        result = [entry]
    return result
|
||||||
|
|
||||||
def decomp(self,data):
|
def decomp(self,data):
|
||||||
buf = data
|
buf = data
|
||||||
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
# some FlateDecode streams have garbage (newlines, etc) appended to the
|
||||||
|
@ -206,13 +212,11 @@ class PDFStream(PDFObject):
|
||||||
if self.decipher:
|
if self.decipher:
|
||||||
# Handle encryption
|
# Handle encryption
|
||||||
data = self.decipher(self.objid, self.genno, data)
|
data = self.decipher(self.objid, self.genno, data)
|
||||||
filters = self.get_any(('F', 'Filter'))
|
filters = self.get_filters()
|
||||||
if not filters:
|
if not filters:
|
||||||
self.data = data
|
self.data = data
|
||||||
self.rawdata = None
|
self.rawdata = None
|
||||||
return
|
return
|
||||||
if not isinstance(filters, list):
|
|
||||||
filters = [ filters ]
|
|
||||||
for f in filters:
|
for f in filters:
|
||||||
if f in LITERALS_FLATE_DECODE:
|
if f in LITERALS_FLATE_DECODE:
|
||||||
# will get errors if the document is encrypted.
|
# will get errors if the document is encrypted.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from struct import unpack
|
from struct import pack, unpack
|
||||||
|
|
||||||
|
|
||||||
## Matrix operations
|
## Matrix operations
|
||||||
|
@ -165,3 +165,12 @@ class ObjIdRange(object):
|
||||||
|
|
||||||
def get_nobjs(self):
|
def get_nobjs(self):
|
||||||
return self.nobjs
|
return self.nobjs
|
||||||
|
|
||||||
|
|
||||||
|
# create_bmp
|
||||||
|
def create_bmp(data, bits, width, height):
    """Build the contents of an uncompressed Windows BMP file.

    data   -- raw raster bytes, rows ordered top to bottom, each row
              occupying (width*bits+7)//8 bytes.
    bits   -- bits per pixel (e.g. 8 for grayscale, 24 for RGB).
    width  -- image width in pixels.
    height -- image height in pixels.

    Returns the complete file image: the 14-byte file header, the 40-byte
    info header, an optional colour table and the pixel rows.

    The rows are re-rasterized as the BMP format requires: each row is
    zero-padded to a multiple of four bytes and the rows are stored bottom
    to top.  Missing trailing data is zero-filled and surplus data is
    ignored.  For depths of 8 bits or less a linear grayscale colour table
    is written, which matches the DeviceGray use of this function.

    NOTE(review): 24-bit BMP pixels are stored blue-green-red; the sample
    bytes are copied unchanged here -- confirm whether callers passing RGB
    data need the channels swapped.
    """
    empty = data[:0]  # empty string of the same type as the input
    src_stride = (width * bits + 7) // 8
    dst_stride = (src_stride + 3) // 4 * 4
    rows = []
    for y in range(height-1, -1, -1):
        row = data[y*src_stride:(y+1)*src_stride]
        # 'x' pad bytes give zero filling without needing a bytes literal.
        rows.append(row + pack('%dx' % (dst_stride - len(row))))
    pixels = empty.join(rows)
    # Indexed formats need a colour table between the headers and pixels.
    ncolors = 0
    palette = empty
    if 1 <= bits <= 8:
        ncolors = 1 << bits
        levels = [i * 255 // (ncolors - 1) for i in range(ncolors)]
        palette = empty.join([pack('<BBBx', v, v, v) for v in levels])
    offset = 14 + 40 + len(palette)
    info = pack('<IiiHHIIIIII', 40, width, height, 1, bits, 0,
                len(pixels), 0, 0, ncolors, 0)
    # 0x42 0x4D is the 'BM' signature, packed as integers so that the
    # call does not depend on the string type of the interpreter.
    header = pack('<BBIHHI', 0x42, 0x4D, offset + len(pixels), 0, 0, offset)
    return header + info + palette + pixels
|
||||||
|
|
Loading…
Reference in New Issue