simple image handling.
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@23 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
ecaf68efed
commit
f296ed3999
6
Makefile
6
Makefile
|
@ -1,8 +1,8 @@
|
||||||
# Makefile for pdfminer
|
# Makefile for pdfminer
|
||||||
|
|
||||||
PACKAGE=pdfminer
|
PACKAGE=pdfminer
|
||||||
VERSION=20080107
|
VERSION=20080427
|
||||||
TAR=tar
|
GNUTAR=tar
|
||||||
SVN=svn
|
SVN=svn
|
||||||
PYTHON=python
|
PYTHON=python
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ clean:
|
||||||
pack: clean
|
pack: clean
|
||||||
$(SVN) cleanup
|
$(SVN) cleanup
|
||||||
$(SVN) export . $(WORKDIR)/$(DISTNAME)
|
$(SVN) export . $(WORKDIR)/$(DISTNAME)
|
||||||
$(TAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
|
||||||
rm -rf $(WORKDIR)/$(DISTNAME)
|
rm -rf $(WORKDIR)/$(DISTNAME)
|
||||||
|
|
||||||
pychecker:
|
pychecker:
|
||||||
|
|
29
dumppdf.py
29
dumppdf.py
|
@ -19,7 +19,7 @@ def esc(s):
|
||||||
|
|
||||||
|
|
||||||
# dumpxml
|
# dumpxml
|
||||||
def dumpxml(out, obj):
|
def dumpxml(out, obj, codec=None):
|
||||||
if isinstance(obj, dict):
|
if isinstance(obj, dict):
|
||||||
out.write('<dict size="%d">\n' % len(obj))
|
out.write('<dict size="%d">\n' % len(obj))
|
||||||
for (k,v) in obj.iteritems():
|
for (k,v) in obj.iteritems():
|
||||||
|
@ -43,15 +43,11 @@ def dumpxml(out, obj):
|
||||||
return
|
return
|
||||||
|
|
||||||
if isinstance(obj, PDFStream):
|
if isinstance(obj, PDFStream):
|
||||||
props = obj.dic.copy()
|
|
||||||
if 'Filter' in props:
|
|
||||||
del props['Filter']
|
|
||||||
if 'DecodeParms' in props:
|
|
||||||
del props['DecodeParms']
|
|
||||||
out.write('<stream>\n<props>\n')
|
out.write('<stream>\n<props>\n')
|
||||||
dumpxml(out, props)
|
dumpxml(out, obj.dic)
|
||||||
data = obj.get_data()
|
|
||||||
out.write('\n</props>\n')
|
out.write('\n</props>\n')
|
||||||
|
if codec:
|
||||||
|
data = obj.get_data()
|
||||||
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
|
||||||
out.write('</stream>')
|
out.write('</stream>')
|
||||||
return
|
return
|
||||||
|
@ -101,17 +97,17 @@ def dumpallobjs(out, doc):
|
||||||
|
|
||||||
# dumppdf
|
# dumppdf
|
||||||
def dumppdf(outfp, fname, objids, pageids,
|
def dumppdf(outfp, fname, objids, pageids,
|
||||||
dumpall=False, binary=False, debug=0):
|
dumpall=False, codec=None, debug=0):
|
||||||
doc = PDFDocument(debug=debug)
|
doc = PDFDocument(debug=debug)
|
||||||
fp = file(fname)
|
fp = file(fname)
|
||||||
parser = PDFParser(doc, fp, debug=debug)
|
parser = PDFParser(doc, fp, debug=debug)
|
||||||
if objids:
|
if objids:
|
||||||
for objid in objids:
|
for objid in objids:
|
||||||
obj = doc.getobj(objid)
|
obj = doc.getobj(objid)
|
||||||
if binary and isinstance(obj, PDFStream):
|
if codec == 'binary' and isinstance(obj, PDFStream):
|
||||||
outfp.write(obj.get_data())
|
outfp.write(obj.get_data())
|
||||||
else:
|
else:
|
||||||
dumpxml(outfp, obj)
|
dumpxml(outfp, obj, codec=codec)
|
||||||
if pageids:
|
if pageids:
|
||||||
for page in doc.get_pages():
|
for page in doc.get_pages():
|
||||||
if page.pageid in pageids:
|
if page.pageid in pageids:
|
||||||
|
@ -129,17 +125,17 @@ def dumppdf(outfp, fname, objids, pageids,
|
||||||
def main(argv):
|
def main(argv):
|
||||||
import getopt
|
import getopt
|
||||||
def usage():
|
def usage():
|
||||||
print 'usage: %s [-d] [-a] [-b] [-p pageid] [-i objid] file ...' % argv[0]
|
print 'usage: %s [-d] [-a] [-c|-b] [-p pageid] [-i objid] file ...' % argv[0]
|
||||||
return 100
|
return 100
|
||||||
try:
|
try:
|
||||||
(opts, args) = getopt.getopt(argv[1:], 'dabi:p:')
|
(opts, args) = getopt.getopt(argv[1:], 'dacbi:p:')
|
||||||
except getopt.GetoptError:
|
except getopt.GetoptError:
|
||||||
return usage()
|
return usage()
|
||||||
if not args: return usage()
|
if not args: return usage()
|
||||||
debug = 0
|
debug = 0
|
||||||
objids = []
|
objids = []
|
||||||
pageids = set()
|
pageids = set()
|
||||||
binary = False
|
codec = None
|
||||||
dumpall = False
|
dumpall = False
|
||||||
outfp = stdout
|
outfp = stdout
|
||||||
for (k, v) in opts:
|
for (k, v) in opts:
|
||||||
|
@ -147,12 +143,13 @@ def main(argv):
|
||||||
elif k == '-i': objids.append(int(v))
|
elif k == '-i': objids.append(int(v))
|
||||||
elif k == '-p': pageids.add(int(v))
|
elif k == '-p': pageids.add(int(v))
|
||||||
elif k == '-a': dumpall = True
|
elif k == '-a': dumpall = True
|
||||||
elif k == '-b': binary = True
|
elif k == '-b': codec = 'binary'
|
||||||
|
elif k == '-c': codec = 'text'
|
||||||
elif k == '-o': outfp = file(v, 'w')
|
elif k == '-o': outfp = file(v, 'w')
|
||||||
#
|
#
|
||||||
for fname in args:
|
for fname in args:
|
||||||
dumppdf(outfp, fname, objids, pageids,
|
dumppdf(outfp, fname, objids, pageids,
|
||||||
dumpall=dumpall, binary=binary, debug=debug)
|
dumpall=dumpall, codec=codec, debug=debug)
|
||||||
return
|
return
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main(sys.argv))
|
if __name__ == '__main__': sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -46,7 +46,8 @@ class FigureItem(PageItem):
|
||||||
return ('<figure id=%r bbox="%s">' % (self.id, bbox))
|
return ('<figure id=%r bbox="%s">' % (self.id, bbox))
|
||||||
|
|
||||||
def dump(self, outfp, codec):
|
def dump(self, outfp, codec):
|
||||||
outfp.write(repr(self)+'\n')
|
bbox = '%d,%d,%d,%d' % self.bbox
|
||||||
|
outfp.write('<figure id="%s" bbox="%s">\n' % (self.id, bbox))
|
||||||
for obj in self.objs:
|
for obj in self.objs:
|
||||||
obj.dump(outfp, codec)
|
obj.dump(outfp, codec)
|
||||||
outfp.write('</figure>\n')
|
outfp.write('</figure>\n')
|
||||||
|
@ -126,6 +127,9 @@ class TextConverter(PDFDevice):
|
||||||
self.context.add(fig)
|
self.context.add(fig)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def render_image(self, stream, size, matrix):
|
||||||
|
return
|
||||||
|
|
||||||
def handle_undefined_char(self, cidcoding, cid):
|
def handle_undefined_char(self, cidcoding, cid):
|
||||||
if self.debug:
|
if self.debug:
|
||||||
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
|
||||||
|
|
16
pdfinterp.py
16
pdfinterp.py
|
@ -41,6 +41,7 @@ LITERAL_PDF = PSLiteralTable.intern('PDF')
|
||||||
LITERAL_TEXT = PSLiteralTable.intern('Text')
|
LITERAL_TEXT = PSLiteralTable.intern('Text')
|
||||||
LITERAL_FONT = PSLiteralTable.intern('Font')
|
LITERAL_FONT = PSLiteralTable.intern('Font')
|
||||||
LITERAL_FORM = PSLiteralTable.intern('Form')
|
LITERAL_FORM = PSLiteralTable.intern('Form')
|
||||||
|
LITERAL_IMAGE = PSLiteralTable.intern('Image')
|
||||||
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
|
||||||
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
|
||||||
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
|
||||||
|
@ -484,6 +485,8 @@ class PDFDevice:
|
||||||
|
|
||||||
def render_string(self, textstate, textmatrix, size, seq):
|
def render_string(self, textstate, textmatrix, size, seq):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
def render_image(self, stream, size, matrix):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
## PDFContentParser
|
## PDFContentParser
|
||||||
|
@ -942,9 +945,10 @@ class PDFPageInterpreter:
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
|
||||||
return
|
return
|
||||||
if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
|
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'Processing xobj: %r' % xobj
|
print >>stderr, 'Processing xobj: %r' % xobj
|
||||||
|
subtype = xobj.dic.get('Subtype')
|
||||||
|
if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
|
||||||
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
interpreter = PDFPageInterpreter(self.rsrc, self.device)
|
||||||
(x0,y0,x1,y1) = xobj.dic['BBox']
|
(x0,y0,x1,y1) = xobj.dic['BBox']
|
||||||
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
|
||||||
|
@ -954,6 +958,16 @@ class PDFPageInterpreter:
|
||||||
self.device.begin_figure(xobjid, bbox)
|
self.device.begin_figure(xobjid, bbox)
|
||||||
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
|
interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
|
||||||
self.device.end_figure(xobjid)
|
self.device.end_figure(xobjid)
|
||||||
|
elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
|
||||||
|
(x0,y0) = apply_matrix(self.ctm, (0,0))
|
||||||
|
(x1,y1) = apply_matrix(self.ctm, (1,1))
|
||||||
|
self.device.begin_figure(xobjid, (x0,y0,x1,y1))
|
||||||
|
(w,h) = (xobj.dic['Width'], xobj.dic['Height'])
|
||||||
|
self.device.render_image(xobj, (w,h), self.ctm)
|
||||||
|
self.device.end_figure(xobjid)
|
||||||
|
else:
|
||||||
|
# unsupported xobject type.
|
||||||
|
pass
|
||||||
return
|
return
|
||||||
|
|
||||||
def process_page(self, page):
|
def process_page(self, page):
|
||||||
|
|
Loading…
Reference in New Issue