simple image handling.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@23 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-04-27 09:34:48 +00:00 · 2008-04-27 09:34:48 +00:00 · f296ed3999
parent ecaf68efed
commit f296ed3999
4 changed files with 40 additions and 25 deletions
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 # Makefile for pdfminer

 PACKAGE=pdfminer
-VERSION=20080107
-TAR=tar
+VERSION=20080427
+GNUTAR=tar
 SVN=svn
 PYTHON=python

@@ -24,7 +24,7 @@ clean:
 pack: clean
 	$(SVN) cleanup
 	$(SVN) export . $(WORKDIR)/$(DISTNAME)
-	$(TAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
+	$(GNUTAR) c -z -C$(WORKDIR) -f $(WORKDIR)/$(DISTFILE) $(DISTNAME) --dereference --numeric-owner
 	rm -rf $(WORKDIR)/$(DISTNAME)

 pychecker:
--- a/dumppdf.py
+++ b/dumppdf.py
@@ -19,7 +19,7 @@ def esc(s):


 # dumpxml
-def dumpxml(out, obj):
+def dumpxml(out, obj, codec=None):
  if isinstance(obj, dict):
    out.write('<dict size="%d">\n' % len(obj))
    for (k,v) in obj.iteritems():
@@ -43,16 +43,12 @@ def dumpxml(out, obj):
    return
  
  if isinstance(obj, PDFStream):
-    props = obj.dic.copy()
-    if 'Filter' in props:
-      del props['Filter']
-    if 'DecodeParms' in props:
-      del props['DecodeParms']
    out.write('<stream>\n<props>\n')
-    dumpxml(out, props)
-    data = obj.get_data()
+    dumpxml(out, obj.dic)
    out.write('\n</props>\n')
-    out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
+    if codec:
+      data = obj.get_data()
+      out.write('<data size="%d">%s</data>\n' % (len(data), esc(data)))
    out.write('</stream>')
    return
  
@@ -101,17 +97,17 @@ def dumpallobjs(out, doc):

 # dumppdf
 def dumppdf(outfp, fname, objids, pageids,
-            dumpall=False, binary=False, debug=0):
+            dumpall=False, codec=None, debug=0):
  doc = PDFDocument(debug=debug)
  fp = file(fname)
  parser = PDFParser(doc, fp, debug=debug)
  if objids:
    for objid in objids:
      obj = doc.getobj(objid)
-      if binary and isinstance(obj, PDFStream):
+      if codec == 'binary' and isinstance(obj, PDFStream):
        outfp.write(obj.get_data())
      else:
-        dumpxml(outfp, obj)
+        dumpxml(outfp, obj, codec=codec)
  if pageids:
    for page in doc.get_pages():
      if page.pageid in pageids:
@@ -129,17 +125,17 @@ def dumppdf(outfp, fname, objids, pageids,
 def main(argv):
  import getopt
  def usage():
-    print 'usage: %s [-d] [-a] [-b] [-p pageid] [-i objid] file ...' % argv[0]
+    print 'usage: %s [-d] [-a] [-c|-b] [-p pageid] [-i objid] file ...' % argv[0]
    return 100
  try:
-    (opts, args) = getopt.getopt(argv[1:], 'dabi:p:')
+    (opts, args) = getopt.getopt(argv[1:], 'dacbi:p:')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
  debug = 0
  objids = []
  pageids = set()
-  binary = False
+  codec = None
  dumpall = False
  outfp = stdout
  for (k, v) in opts:
@@ -147,12 +143,13 @@ def main(argv):
    elif k == '-i': objids.append(int(v))
    elif k == '-p': pageids.add(int(v))
    elif k == '-a': dumpall = True
-    elif k == '-b': binary = True
+    elif k == '-b': codec = 'binary'
+    elif k == '-c': codec = 'text'
    elif k == '-o': outfp = file(v, 'w')
  #
  for fname in args:
    dumppdf(outfp, fname, objids, pageids,
-            dumpall=dumpall, binary=binary, debug=debug)
+            dumpall=dumpall, codec=codec, debug=debug)
  return

 if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdf2txt.py
+++ b/pdf2txt.py
@@ -46,7 +46,8 @@ class FigureItem(PageItem):
    return ('<figure id=%r bbox="%s">' % (self.id, bbox))
  
  def dump(self, outfp, codec):
-    outfp.write(repr(self)+'\n')
+    bbox = '%d,%d,%d,%d' % self.bbox
+    outfp.write('<figure id="%s" bbox="%s">\n' % (self.id, bbox))
    for obj in self.objs:
      obj.dump(outfp, codec)
    outfp.write('</figure>\n')
@@ -126,6 +127,9 @@ class TextConverter(PDFDevice):
    self.context.add(fig)
    return

+  def render_image(self, stream, size, matrix):
+    return
+
  def handle_undefined_char(self, cidcoding, cid):
    if self.debug:
      print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
--- a/pdfinterp.py
+++ b/pdfinterp.py
@@ -41,6 +41,7 @@ LITERAL_PDF = PSLiteralTable.intern('PDF')
 LITERAL_TEXT = PSLiteralTable.intern('Text')
 LITERAL_FONT = PSLiteralTable.intern('Font')
 LITERAL_FORM = PSLiteralTable.intern('Form')
+LITERAL_IMAGE = PSLiteralTable.intern('Image')
 LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding')
 LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray')
 LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB')
@@ -484,6 +485,8 @@ class PDFDevice:
  
  def render_string(self, textstate, textmatrix, size, seq):
    raise NotImplementedError
+  def render_image(self, stream, size, matrix):
+    raise NotImplementedError


 ##  PDFContentParser
@@ -942,9 +945,10 @@ class PDFPageInterpreter:
      if STRICT:
        raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
      return
-    if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
-      if 1 <= self.debug:
-        print >>stderr, 'Processing xobj: %r' % xobj
+    if 1 <= self.debug:
+      print >>stderr, 'Processing xobj: %r' % xobj
+    subtype = xobj.dic.get('Subtype')
+    if subtype == LITERAL_FORM and 'BBox' in xobj.dic:
      interpreter = PDFPageInterpreter(self.rsrc, self.device)
      (x0,y0,x1,y1) = xobj.dic['BBox']
      ctm = mult_matrix(xobj.dic.get('Matrix', MATRIX_IDENTITY), self.ctm)
@@ -954,6 +958,16 @@ class PDFPageInterpreter:
      self.device.begin_figure(xobjid, bbox)
      interpreter.render_contents(xobj.dic.get('Resources'), [xobj], ctm=ctm)
      self.device.end_figure(xobjid)
+    elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic:
+      (x0,y0) = apply_matrix(self.ctm, (0,0))
+      (x1,y1) = apply_matrix(self.ctm, (1,1))
+      self.device.begin_figure(xobjid, (x0,y0,x1,y1))
+      (w,h) = (xobj.dic['Width'], xobj.dic['Height'])
+      self.device.render_image(xobj, (w,h), self.ctm)
+      self.device.end_figure(xobjid)
+    else:
+      # unsupported xobject type.
+      pass
    return

  def process_page(self, page):