simple image handling.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@23 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-27 09:34:48 +00:00
parent ecaf68efed
commit f296ed3999
4 changed files with 40 additions and 25 deletions

View File

@ -1,8 +1,8 @@
# Makefile for pdfminer
PACKAGE=pdfminer
VERSION=20080107
TAR=tar
VERSION=20080427
GNUTAR=tar
SVN=svn
PYTHON=python
@ -24,7 +24,7 @@ clean:
pack: clean
$(SVN) cleanup
$(SVN) export . $(WORKDIR)/$(DISTNAME)
$(TAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
rm -rf $(WORKDIR)/$(DISTNAME)
pychecker:

View File

@ -19,7 +19,7 @@ def esc(s):
# dumpxml
def dumpxml(out, obj):
def dumpxml(out, obj, codec=None):
if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems():
@ -43,16 +43,12 @@ def dumpxml(out, obj):
return
if isinstance(obj, PDFStream):
props = obj.dic.copy()
if 'Filter' in props:
del props['Filter']
if 'DecodeParms' in props:
del props['DecodeParms']
out.write('<stream>\n<props>\n')
dumpxml(out, props)
data = obj.get_data()
dumpxml(out, obj.dic)
out.write('\n</props>\n')
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
if codec:
data = obj.get_data()
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
out.write('</stream>')
return
@ -101,17 +97,17 @@ def dumpallobjs(out, doc):
# dumppdf
def dumppdf(outfp, fname, objids, pageids,
dumpall=False, binary=False, debug=0):
dumpall=False, codec=None, debug=0):
doc = PDFDocument(debug=debug)
fp = file(fname)
parser = PDFParser(doc, fp, debug=debug)
if objids:
for objid in objids:
obj = doc.getobj(objid)
if binary and isinstance(obj, PDFStream):
if codec == 'binary' and isinstance(obj, PDFStream):
outfp.write(obj.get_data())
else:
dumpxml(outfp, obj)
dumpxml(outfp, obj, codec=codec)
if pageids:
for page in doc.get_pages():
if page.pageid in pageids:
@ -129,17 +125,17 @@ def dumppdf(outfp, fname, objids, pageids,
def main(argv):
import getopt
def usage():
print 'usage: %s [-d] [-a] [-b] [-p pageid] [-i objid] file ...' % argv[0]
print 'usage: %s [-d] [-a] [-c|-b] [-p pageid] [-i objid] file ...' % argv[0]
return 100
try:
(opts, args) = getopt.getopt(argv[1:], 'dabi:p:')
(opts, args) = getopt.getopt(argv[1:], 'dacbi:p:')
except getopt.GetoptError:
return usage()
if not args: return usage()
debug = 0
objids = []
pageids = set()
binary = False
codec = None
dumpall = False
outfp = stdout
for (k, v) in opts:
@ -147,12 +143,13 @@ def main(argv):
elif k == '-i': objids.append(int(v))
elif k == '-p': pageids.add(int(v))
elif k == '-a': dumpall = True
elif k == '-b': binary = True
elif k == '-b': codec = 'binary'
elif k == '-c': codec = 'text'
elif k == '-o': outfp = file(v, 'w')
#
for fname in args:
dumppdf(outfp, fname, objids, pageids,
dumpall=dumpall, binary=binary, debug=debug)
dumpall=dumpall, codec=codec, debug=debug)
return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -46,7 +46,8 @@ class FigureItem(PageItem):
return ('<figure id=%r bbox="%s">' % (self.id, bbox))
def dump(self, outfp, codec):
outfp.write(repr(self)+'\n')
bbox = '%d,%d,%d,%d' % self.bbox
outfp.write('<figure id="%s" bbox="%s">\n' % (self.id, bbox))
for obj in self.objs:
obj.dump(outfp, codec)
outfp.write('</figure>\n')
@ -126,6 +127,9 @@ class TextConverter(PDFDevice):
self.context.add(fig)
return
def render_image(self, stream, size, matrix):
return
def handle_undefined_char(self, cidcoding, cid):
if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)

View File

@ -41,6 +41,7 @@ LITERAL_PDF = PSLiteralTable.intern('PDF')
LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_IMAGE = PSLiteralTable.intern('Image')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
@ -484,6 +485,8 @@ class PDFDevice:
def render_string(self, textstate, textmatrix, size, seq):
raise NotImplementedError
def render_image(self, stream, size, matrix):
raise NotImplementedError
## PDFContentParser
@ -942,9 +945,10 @@ class PDFPageInterpreter:
if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return
if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj
if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj
subtype = xobj.dic.get('Subtype')
if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
interpreter = PDFPageInterpreter(self.rsrc, self.device)
(x0,y0,x1,y1) = xobj.dic['BBox']
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
@ -954,6 +958,16 @@ class PDFPageInterpreter:
self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
self.device.end_figure(xobjid)
elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
(x0,y0) = apply_matrix(self.ctm, (0,0))
(x1,y1) = apply_matrix(self.ctm, (1,1))
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
self.device.render_image(xobj, (w,h), self.ctm)
self.device.end_figure(xobjid)
else:
# unsupported xobject type.
pass
return
def process_page(self, page):