simple image handling.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@23 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-27 09:34:48 +00:00
parent ecaf68efed
commit f296ed3999
4 changed files with 40 additions and 25 deletions

View File

@ -1,8 +1,8 @@
# Makefile for pdfminer # Makefile for pdfminer
PACKAGE=pdfminer PACKAGE=pdfminer
VERSION=20080107 VERSION=20080427
TAR=tar GNUTAR=tar
SVN=svn SVN=svn
PYTHON=python PYTHON=python
@ -24,7 +24,7 @@ clean:
pack: clean pack: clean
$(SVN) cleanup $(SVN) cleanup
$(SVN) export . $(WORKDIR)/$(DISTNAME) $(SVN) export . $(WORKDIR)/$(DISTNAME)
$(TAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner $(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
rm -rf $(WORKDIR)/$(DISTNAME) rm -rf $(WORKDIR)/$(DISTNAME)
pychecker: pychecker:

View File

@ -19,7 +19,7 @@ def esc(s):
# dumpxml # dumpxml
def dumpxml(out, obj): def dumpxml(out, obj, codec=None):
if isinstance(obj, dict): if isinstance(obj, dict):
out.write('<dict size="%d">\n' % len(obj)) out.write('<dict size="%d">\n' % len(obj))
for (k,v) in obj.iteritems(): for (k,v) in obj.iteritems():
@ -43,16 +43,12 @@ def dumpxml(out, obj):
return return
if isinstance(obj, PDFStream): if isinstance(obj, PDFStream):
props = obj.dic.copy()
if 'Filter' in props:
del props['Filter']
if 'DecodeParms' in props:
del props['DecodeParms']
out.write('<stream>\n<props>\n') out.write('<stream>\n<props>\n')
dumpxml(out, props) dumpxml(out, obj.dic)
data = obj.get_data()
out.write('\n</props>\n') out.write('\n</props>\n')
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data))) if codec:
data = obj.get_data()
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
out.write('</stream>') out.write('</stream>')
return return
@ -101,17 +97,17 @@ def dumpallobjs(out, doc):
# dumppdf # dumppdf
def dumppdf(outfp, fname, objids, pageids, def dumppdf(outfp, fname, objids, pageids,
dumpall=False, binary=False, debug=0): dumpall=False, codec=None, debug=0):
doc = PDFDocument(debug=debug) doc = PDFDocument(debug=debug)
fp = file(fname) fp = file(fname)
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp, debug=debug)
if objids: if objids:
for objid in objids: for objid in objids:
obj = doc.getobj(objid) obj = doc.getobj(objid)
if binary and isinstance(obj, PDFStream): if codec == 'binary' and isinstance(obj, PDFStream):
outfp.write(obj.get_data()) outfp.write(obj.get_data())
else: else:
dumpxml(outfp, obj) dumpxml(outfp, obj, codec=codec)
if pageids: if pageids:
for page in doc.get_pages(): for page in doc.get_pages():
if page.pageid in pageids: if page.pageid in pageids:
@ -129,17 +125,17 @@ def dumppdf(outfp, fname, objids, pageids,
def main(argv): def main(argv):
import getopt import getopt
def usage(): def usage():
print 'usage: %s [-d] [-a] [-b] [-p pageid] [-i objid] file ...' % argv[0] print 'usage: %s [-d] [-a] [-c|-b] [-p pageid] [-i objid] file ...' % argv[0]
return 100 return 100
try: try:
(opts, args) = getopt.getopt(argv[1:], 'dabi:p:') (opts, args) = getopt.getopt(argv[1:], 'dacbi:p:')
except getopt.GetoptError: except getopt.GetoptError:
return usage() return usage()
if not args: return usage() if not args: return usage()
debug = 0 debug = 0
objids = [] objids = []
pageids = set() pageids = set()
binary = False codec = None
dumpall = False dumpall = False
outfp = stdout outfp = stdout
for (k, v) in opts: for (k, v) in opts:
@ -147,12 +143,13 @@ def main(argv):
elif k == '-i': objids.append(int(v)) elif k == '-i': objids.append(int(v))
elif k == '-p': pageids.add(int(v)) elif k == '-p': pageids.add(int(v))
elif k == '-a': dumpall = True elif k == '-a': dumpall = True
elif k == '-b': binary = True elif k == '-b': codec = 'binary'
elif k == '-c': codec = 'text'
elif k == '-o': outfp = file(v, 'w') elif k == '-o': outfp = file(v, 'w')
# #
for fname in args: for fname in args:
dumppdf(outfp, fname, objids, pageids, dumppdf(outfp, fname, objids, pageids,
dumpall=dumpall, binary=binary, debug=debug) dumpall=dumpall, codec=codec, debug=debug)
return return
if __name__ == '__main__': sys.exit(main(sys.argv)) if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -46,7 +46,8 @@ class FigureItem(PageItem):
return ('<figure id=%r bbox="%s">' % (self.id, bbox)) return ('<figure id=%r bbox="%s">' % (self.id, bbox))
def dump(self, outfp, codec): def dump(self, outfp, codec):
outfp.write(repr(self)+'\n') bbox = '%d,%d,%d,%d' % self.bbox
outfp.write('<figure id="%s" bbox="%s">\n' % (self.id, bbox))
for obj in self.objs: for obj in self.objs:
obj.dump(outfp, codec) obj.dump(outfp, codec)
outfp.write('</figure>\n') outfp.write('</figure>\n')
@ -126,6 +127,9 @@ class TextConverter(PDFDevice):
self.context.add(fig) self.context.add(fig)
return return
def render_image(self, stream, size, matrix):
return
def handle_undefined_char(self, cidcoding, cid): def handle_undefined_char(self, cidcoding, cid):
if self.debug: if self.debug:
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid) print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
@ -155,7 +159,7 @@ class TextConverter(PDFDevice):
font, textstate.fontsize, size, text) font, textstate.fontsize, size, text)
self.context.add(item) self.context.add(item)
return return
def dump(self, outfp, codec): def dump(self, outfp, codec):
for page in self.pages: for page in self.pages:
page.dump(outfp, codec) page.dump(outfp, codec)

View File

@ -41,6 +41,7 @@ LITERAL_PDF = PSLiteralTable.intern('PDF')
LITERAL_TEXT = PSLiteralTable.intern('Text') LITERAL_TEXT = PSLiteralTable.intern('Text')
LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_FONT = PSLiteralTable.intern('Font')
LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_FORM = PSLiteralTable.intern('Form')
LITERAL_IMAGE = PSLiteralTable.intern('Image')
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
@ -484,6 +485,8 @@ class PDFDevice:
def render_string(self, textstate, textmatrix, size, seq): def render_string(self, textstate, textmatrix, size, seq):
raise NotImplementedError raise NotImplementedError
def render_image(self, stream, size, matrix):
raise NotImplementedError
## PDFContentParser ## PDFContentParser
@ -942,9 +945,10 @@ class PDFPageInterpreter:
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return return
if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic: if 1 <= self.debug:
if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj
print >>stderr, 'Processing xobj: %r' % xobj subtype = xobj.dic.get('Subtype')
if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
interpreter = PDFPageInterpreter(self.rsrc, self.device) interpreter = PDFPageInterpreter(self.rsrc, self.device)
(x0,y0,x1,y1) = xobj.dic['BBox'] (x0,y0,x1,y1) = xobj.dic['BBox']
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm) ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
@ -954,6 +958,16 @@ class PDFPageInterpreter:
self.device.begin_figure(xobjid, bbox) self.device.begin_figure(xobjid, bbox)
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm) interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
self.device.end_figure(xobjid) self.device.end_figure(xobjid)
elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
(x0,y0) = apply_matrix(self.ctm, (0,0))
(x1,y1) = apply_matrix(self.ctm, (1,1))
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
self.device.render_image(xobj, (w,h), self.ctm)
self.device.end_figure(xobjid)
else:
# unsupported xobject type.
pass
return return
def process_page(self, page): def process_page(self, page):