basic encryption support added.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@19 1aa58f4a-7d42-0410-adbc-911cccaed67c
2008-04-26 06:47:56 +00:00 · 2008-04-26 06:47:56 +00:00 · 1300046181
parent 5c1aa960f5
commit 1300046181
10 changed files with 311 additions and 68 deletions
--- a/9
+++ b/9
@ -0,0 +1,9 @@
+TODO:
+  - Code Documentation.
+  - Error handling for invalid type.
+
+  - Outlines.
+  - Named Objects. (pages)
+  - Writers.
+  - Linearized PDF.
+  - Encryption?
--- a/arcfour.py
+++ b/arcfour.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+#
+#  Arcfour implementation
+#  * public domain *
+#
+
+class Arcfour:
+  '''
+  Arcfour stream cipher.
+
+  The constructor runs the key schedule over KEY; process() then XORs
+  its argument with the keystream.  The stream position (i, j) is kept
+  between calls, so one message may be fed in several pieces.  Since
+  the operation is a plain XOR with the keystream, the same call is
+  used for both encryption and decryption.
+
+  KEY must be a non-empty string: the key bytes are indexed modulo
+  len(KEY), so an empty key raises ZeroDivisionError.
+  '''
+  
+  def __init__(self, key):
+    # Key schedule: start from the identity permutation and shuffle
+    # it under control of the cyclically repeated key bytes.
+    s = list(range(256))
+    j = 0
+    klen = len(key)
+    for i in range(256):
+      j = (j + s[i] + ord(key[i % klen])) % 256
+      (s[i], s[j]) = (s[j], s[i])
+    self.s = s
+    (self.i, self.j) = (0, 0)
+    return
+
+  def process(self, data):
+    '''
+    Return DATA XORed with the next len(DATA) keystream bytes
+    and advance the cipher state accordingly.
+    '''
+    (i, j) = (self.i, self.j)
+    s = self.s
+    out = []
+    for c in data:
+      i = (i+1) % 256
+      j = (j+s[i]) % 256
+      (s[i], s[j]) = (s[j], s[i])
+      k = s[(s[i]+s[j]) % 256]
+      out.append(chr(ord(c) ^ k))
+    (self.i, self.j) = (i, j)
+    # Join once at the end rather than growing a string per byte.
+    return ''.join(out)
+
+if __name__ == '__main__':
+  # Self-test: encrypt three key/plaintext pairs and compare the
+  # uppercase-hex ciphertext with the expected value.
+  def doit(key, data):
+    encrypted = Arcfour(key).process(data)
+    return ''.join([ '%02X' % ord(ch) for ch in encrypted ])
+  vectors = [
+    ("Key", "Plaintext", 'BBF316E8D940AF0AD3'),
+    ("Wiki", "pedia", '1021BF0420'),
+    ("Secret", "Attack at dawn", '45A01F645FC35B383552544B9BF5'),
+  ]
+  for (key, data, expected) in vectors:
+    assert doit(key, data) == expected
+  print 'test succeeded'
--- a/extent.py
+++ b/extent.py
@ -20,6 +20,9 @@ class Rect:
      self.y1 = y0+h
    return

+  def __repr__(self):
+    # Debug form '<Rect: (x0,y0)-(x1,y1)>'; %d prints each coordinate as an integer.
+    corners = (self.x0, self.y0, self.x1, self.y1)
+    return '<Rect: (%d,%d)-(%d,%d)>' % corners
+
  def overlap(self, rect):
    return not (rect.x1 <= self.x0 or self.x1 <= rect.x0 or
                rect.y1 <= self.y0 or self.y1 <= rect.y0)
@ -31,7 +34,7 @@ class ExtSet:
  
  def __init__(self, gridsize):
    self.gridsize = gridsize
-    self.grid = []
+    self.grid = {}
    return
  
  def cells(self, x0, x1):
@ -45,12 +48,18 @@ class ExtSet:
  
  def add(self, x0, x1, obj):
    for i in self.cells(x0, x1):
-      self.grid[i].append(obj)
+      if i not in self.grid:
+        a = []
+        self.grid[i] = a
+      else:
+        a = self.grid[i]
+      a.append(obj)
    return
  
  def get(self, x0, x1):
    objs = set()
    for i in self.cells(x0, x1):
+      if i in self.grid:
        objs.update(self.grid[i])
    return objs

@ -78,12 +87,13 @@ class ExtGrid:
    self.vext = ExtSet(gridsize)
    return
  
-  def add(self, rect):
-    self.hext.add(rect.x0, rect.x1, rect)
-    self.vext.add(rect.y0, rect.y1, rect)
+  def add(self, rect, obj):
+    self.hext.add(rect.x0, rect.x1, obj)
+    self.vext.add(rect.y0, rect.y1, obj)
    return
  
-  def get(self, rect):
-    rects = self.hext.get(rect.x0, rect.x1)
-    rects.update_intersect(self.vext.get(rect.y0, rect.y1))
-    return rects
+  def get(self, rect, getrect):
+    objs = self.hext.get(rect.x0, rect.x1)
+    objs.intersection_update(self.vext.get(rect.y0, rect.y1))
+    objs = [ obj for obj in objs if rect.overlap(getrect(obj)) ]
+    return objs
--- a/pdf2txt.py
+++ b/pdf2txt.py
@ -7,86 +7,183 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
     PDFPageInterpreter, PDFUnicodeNotDefined, \
     mult_matrix, apply_matrix
 from cmap import CMapDB
+from extent import Rect, ExtSet, ExtGrid
+
+
+##  PageItem
+##
+class PageItem:
+  '''
+  Container for the objects found on one page.
+
+  Holds the page id, its bounding box (as a Rect) and its rotation.
+  Objects passed to add() are kept in insertion order in self.objs
+  and are also registered in an ExtGrid under their own bbox.
+  '''
+  
+  GRID_SIZE = 20
+  
+  def __init__(self, id, (x0,y0,x1,y1), rotate=0):
+    self.id = id
+    self.rotate = rotate
+    # The box arrives as two corners; Rect is given x, y, width, height.
+    (w, h) = (x1-x0, y1-y0)
+    self.bbox = Rect(x0, y0, w, h)
+    self.objs = []
+    self.grid = ExtGrid(self.GRID_SIZE)
+    return
+  
+  def __repr__(self):
+    box = self.bbox
+    fields = (self.id, box.x0, box.y0, box.x1, box.y1, self.rotate)
+    return '<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' % fields
+  
+  def add(self, obj):
+    # OBJ must provide a bbox attribute; it is used as the grid key.
+    self.objs.append(obj)
+    self.grid.add(obj.bbox, obj)
+    return
+  
+  def dump(self, outfp, codec):
+    # Opening tag, then every contained object in insertion order,
+    # then the closing tag.
+    opening = repr(self)+'\n'
+    outfp.write(opening)
+    for child in self.objs:
+      child.dump(outfp, codec)
+    outfp.write('</page>\n')
+    return
+
+  def fuse(self):
+    # Looks up, for every object, the other objects overlapping each
+    # of its search rectangles.  The result is not used yet.
+    getbbox = (lambda obj: obj.bbox)
+    for obj1 in self.objs:
+      for rect in obj1.search_range():
+        found = self.grid.get(rect, getbbox)
+        neighbors = [ obj2 for obj2 in found if obj2 is not obj1 ]
+        #print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ]
+    return
+
+
+##  FigureItem
+##
+class FigureItem(PageItem):
+  '''
+  A figure.  Constructed and filled like a PageItem, but written
+  out as a <figure> element and without a rotate attribute.
+  '''
+  
+  def __repr__(self):
+    box = self.bbox
+    fields = (self.id, box.x0, box.y0, box.x1, box.y1)
+    return '<figure id=%r bbox="%d,%d,%d,%d">' % fields
+  
+  def dump(self, outfp, codec):
+    # Opening tag, contained objects in insertion order, closing tag.
+    opening = repr(self)+'\n'
+    outfp.write(opening)
+    for child in self.objs:
+      child.dump(outfp, codec)
+    outfp.write('</figure>\n')
+    return
+
+  def search_range(self):
+    # A figure contributes no search rectangles.
+    return []
+
+
+##  TextItem
+##
+class TextItem:
+  '''
+  One run of text placed with a single matrix and font.
+
+  MATRIX is a 6-tuple (a,b,c,d,tx,ty); SIZE and WIDTH are mapped
+  through its linear part (a,b,c,d) before being stored.  TEXT is a
+  sequence of (displacement, character) pairs; the characters are
+  joined into self.text.  self.direction is 1 when the font is not
+  vertical and 2 when it is.
+  '''
+  
+  def __init__(self, matrix, font, size, width, text):
+    self.matrix = matrix
+    self.font = font
+    (a,b,c,d,tx,ty) = self.matrix
+    (self.width, self.size) = apply_matrix((a,b,c,d,0,0), (width,size))
+    self.width = abs(self.width)
+    self.origin = (tx,ty)
+    if not self.font.is_vertical():
+      self.direction = 1
+      (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
+      self.bbox = Rect(tx, ty+descent, self.width, self.size)
+    else:
+      self.direction = 2
+      # An empty run has no displacement to take the minimum of;
+      # use 0 so that min() is never called on an empty sequence.
+      # (The loop names are chosen not to clobber the matrix entry d.)
+      disps = [ disp for (disp,_ch) in text ]
+      if disps:
+        mindisp = min(disps)
+      else:
+        mindisp = 0
+      (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
+      self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width)
+    self.text = ''.join( c for (_,c) in text )
+    return
+  
+  def __repr__(self):
+    return ('<text matrix=%r font=%r size=%r width=%r text=%r>' %
+            (self.matrix, self.font, self.size, self.width, self.text))
+  
+  def dump(self, outfp, codec):
+    # Writes one <text> element; the text is encoded with CODEC and
+    # unencodable characters become XML character references.
+    (a,b,c,d,tx,ty) = self.matrix
+    outfp.write('<text x="%.3f" y="%.3f" font=%r size="%.3f" width="%.3f">' %
+                (tx, ty, self.font.fontname, self.size, self.width))
+    outfp.write(self.text.encode(codec, 'xmlcharrefreplace'))
+    outfp.write('</text>\n')
+    return
+
+  def search_range(self):
+    # One size-by-size square: starting at the bbox's x1 edge for
+    # direction 1, starting SIZE below its y0 edge otherwise.
+    if self.direction == 1:
+      return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ]
+    else:
+      return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ]


 ##  TextConverter
 ##
 class TextConverter(PDFDevice):

-  def __init__(self, outfp, rsrc, codec, debug=0):
+  def __init__(self, rsrc, debug=0):
    PDFDevice.__init__(self, rsrc, debug=debug)
-    self.outfp = outfp
-    self.codec = codec
-    return
-
-  def close(self):
-    self.outfp.write('\n')
+    self.pages = []
+    self.stack = []
    return

  def begin_page(self, page):
-    (x0,y0,x1,y1) = page.mediabox
-    self.outfp.write('<page id="%d" mediabox="%d,%d,%d,%d" rotate="%d">' %
-                     (page.pageid, x0,y0,x1,y1, page.rotate))
+    self.context = PageItem(str(page.pageid), page.mediabox, page.rotate)
    return
  def end_page(self, _):
-    self.outfp.write('</page>\n')
+    assert not self.stack
+    self.pages.append(self.context)
    return

  def begin_figure(self, name, bbox):
-    (x0,y0,x1,y1) = bbox
-    self.outfp.write('<figure name="%s" bbox="%d,%d,%d,%d">\n' %
-                     (name, x0,y0,x1,y1))
+    self.stack.append(self.context)
+    self.context = FigureItem(name, bbox)
    return
  def end_figure(self, _):
-    self.outfp.write('</figure>\n')
+    fig = self.context
+    self.context = self.stack.pop()
+    self.context.add(fig)
    return

  def handle_undefined_char(self, cidcoding, cid):
    if self.debug:
      print >>stderr, 'undefined: %r, %r' % (cidcoding, cid)
    #return unichr(cid)
-    #return unichr(cid+32)
-    return
+    return None

  def render_string(self, textstate, textmatrix, size, seq):
    font = textstate.font
    spwidth = int(-font.char_width(32) * 0.6) # space width
-    buf = ''
+    text = []
    for x in seq:
      if isinstance(x, int) or isinstance(x, float):
        if not font.is_vertical() and x <= spwidth:
-          buf += ' '
+          text.append((0, ' '))
      else:
        chars = font.decode(x)
        for cid in chars:
          try:
            char = font.to_unicode(cid)
-            buf += char
+            text.append((font.char_disp(cid), char))
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
            s = self.handle_undefined_char(cidcoding, cid)
            if s:
-              buf += s
-    (a,b,c,d,tx,ty) = mult_matrix(textmatrix, self.ctm)
-    if font.is_vertical():
-      size = -size
-      tag = 'vtext'
-    else:
-      tag = 'htext'
-    if (b != 0 or c != 0 or a <= 0 or d <= 0):
-      tag += ' skewed'
-    s = buf.encode(self.codec, 'xmlcharrefreplace')
-    (w,fs) = apply_matrix((a,b,c,d,0,0), (size,textstate.fontsize))
-    def f(x): return '%.03f' % x
-    self.outfp.write('<%s font="%s" size="%s" x="%s" y="%s" w="%s">%s</%s>\n' %
-                     (tag, font.fontname, f(fs), f(tx), f(ty), f(w), s, tag))
+              text.append(s)
+    item = TextItem(mult_matrix(textmatrix, self.ctm),
+                    font, textstate.fontsize, size, text)
+    self.context.add(item)
+    return
+
+  def dump(self, outfp, codec):
+    outfp.write('<document>\n')
+    for page in self.pages:
+      #page.fuse()
+      page.dump(outfp, codec)
+    outfp.write('</document>\n')
    return


 # pdf2txt
 def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
-  device = TextConverter(outfp, rsrc, codec, debug=debug)
-  outfp.write('<document>\n')
+  device = TextConverter(rsrc, debug=debug)
  doc = PDFDocument(debug=debug)
  fp = file(fname)
  parser = PDFParser(doc, fp, debug=debug)
@ -95,7 +192,7 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
    if pages and (i not in pages): continue
    interpreter.process_page(page)
  fp.close()
-  outfp.write('</document>\n')
+  device.dump(outfp, codec)
  device.close()
  return

--- a/pdfparser.py
+++ b/pdfparser.py
@ -4,30 +4,30 @@
 #  ver 0.1, Dec 24 2004-
 #  ver 0.2, Dec 24 2007

-# TODO:
-#   - Code Documentation.
-#   - Error handling for invalid type.
-
-#   - Outlines.
-#   - Named Objects. (pages)
-#   - Writers.
-#   - Linearized PDF.
-#   - Encryption?
-
 import sys
+import md5, struct
 stderr = sys.stderr
 from utils import choplist, nunpack
+from arcfour import Arcfour
 from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
     literal_name, keyword_name, \
     PSStackParser, STRICT


+def decrypt_rc4(key, objid, genno, data):
+  '''
+  Decipher DATA with Arcfour under a key derived for one object.
+
+  KEY is extended with the low three bytes of OBJID and the low two
+  bytes of GENNO (both little-endian).  The MD5 digest of that string,
+  cut to at most 16 bytes (len(KEY)+5 if that is smaller), is the
+  Arcfour key.
+  '''
+  objkey = key + struct.pack('<L',objid)[:3] + struct.pack('<L',genno)[:2]
+  nbytes = min(len(objkey), 16)
+  digest = md5.md5(objkey).digest()
+  return Arcfour(digest[:nbytes]).process(data)
+
+
 ##  PDF Exceptions
 ##
 class PDFException(PSException): pass
 class PDFSyntaxError(PDFException): pass
-class PDFEncrypted(PDFException): pass
+class PDFEncryptionError(PDFException): pass
+class PDFPasswordIncorrect(PDFEncryptionError): pass
 class PDFTypeError(PDFException): pass
 class PDFValueError(PDFException): pass

@ -38,6 +38,7 @@ LITERAL_XREF = PSLiteralTable.intern('XRef')
 LITERAL_PAGE = PSLiteralTable.intern('Page')
 LITERAL_PAGES = PSLiteralTable.intern('Pages')
 LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
+LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
 LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
 KEYWORD_R = PSKeywordTable.intern('R')
 KEYWORD_OBJ = PSKeywordTable.intern('obj')
@ -45,6 +46,7 @@ KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
 KEYWORD_STREAM = PSKeywordTable.intern('stream')
 KEYWORD_XREF = PSKeywordTable.intern('xref')
 KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
+PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'


 ##  PDFObjRef
@ -77,7 +79,7 @@ def resolve1(x):
    x = x.resolve()
  return x

-def resolveall(x):
+def resolve_all(x):
  '''
  Recursively resolve X and all the internals.
  Make sure there is no indirect reference within the nested object.
@ -86,10 +88,23 @@ def resolveall(x):
  while isinstance(x, PDFObjRef):
    x = x.resolve()
  if isinstance(x, list):
-    x = [ resolveall(v) for v in x ]
+    x = [ resolve_all(v) for v in x ]
  elif isinstance(x, dict):
    for (k,v) in x.iteritems():
-      x[k] = resolveall(v)
+      x[k] = resolve_all(v)
+  return x
+
+def decipher_all(decipher, objid, genno, x):
+  '''
+  Recursively decipher X.
+
+  Every str found in X (directly, or nested in lists and dicts) is
+  replaced by decipher(OBJID, GENNO, s); anything else is returned
+  as is.  Lists and dicts are rebuilt rather than modified, so X
+  itself is left untouched and may safely be passed in again.
+  '''
+  if isinstance(x, str):
+    return decipher(objid, genno, x)
+  if isinstance(x, list):
+    x = [ decipher_all(decipher, objid, genno, v) for v in x ]
+  elif isinstance(x, dict):
+    # Build a new dict: writing the results back into the argument
+    # would make a second pass over the same object decipher the
+    # already-deciphered strings again.
+    x = dict( (k, decipher_all(decipher, objid, genno, v))
+              for (k,v) in x.items() )
  return x

 # Type checking
@ -159,6 +174,13 @@ class PDFStream:
    self.rawdata = rawdata
    self.decipher = decipher
    self.data = None
+    self.objid = None
+    self.genno = None
+    return
+
+  def set_objid(self, objid, genno):
+    # Record the object id and generation number of this stream;
+    # both are handed to self.decipher when the data is deciphered.
+    (self.objid, self.genno) = (objid, genno)
    return
  
  def __repr__(self):
@ -168,7 +190,7 @@ class PDFStream:
    assert self.data == None and self.rawdata != None
    data = self.rawdata
    if self.decipher:
-      data = self.decipher(data)
+      data = self.decipher(self.objid, self.genno, data)
    if 'Filter' not in self.dic:
      self.data = data
      self.rawdata = None
@ -201,6 +223,8 @@ class PDFStream:
              buf += ent1
              ent0 = ent1
            data = buf
+      if f == LITERAL_CRYPT:
+        raise PDFEncryptionError
      else:
        if STRICT:
          raise PDFValueError('Invalid filter spec: %r' % f)
@ -338,10 +362,11 @@ class PDFDocument:
    self.xrefs = []
    self.objs = {}
    self.parsed_objs = {}
-    self.decipher = None
    self.root = None
    self.catalog = None
    self.parser = None
+    self.encryption = None
+    self.decipher = None
    return

  def set_parser(self, parser):
@ -351,20 +376,74 @@ class PDFDocument:
    for xref in self.xrefs:
      trailer = xref.trailer
      if 'Encrypt' in trailer:
-        raise PDFEncrypted
-        param = dict_value(trailer['Encrypt'])
-        self.decipher = DECRYPTOR(param)
-        self.parser.strfilter = self.decipher
+        self.encryption = (list_value(trailer['ID']),
+                           dict_value(trailer['Encrypt']))
      if 'Root' in trailer:
        self.set_root(dict_value(trailer['Root']))
        break
    else:
      raise PDFValueError('no /Root object!')
+    if self.encryption:
+      self.prepare_cipher()
+    return
+
+  def prepare_cipher(self, password=''):
+    '''
+    Derive the document key from PASSWORD, check it against the /U
+    entry, and install self.decipher.
+
+    self.encryption must hold (document ID list, /Encrypt dict).
+    Only the 'Standard' filter with V 1 or 2 and revision 2 or 3 is
+    handled.
+
+    Raises PDFEncryptionError for an unknown filter, algorithm or
+    revision, NotImplementedError for revision 4, and
+    PDFPasswordIncorrect when the computed /U value does not match.
+    '''
+    (docid, param) = self.encryption
+    if literal_name(param['Filter']) != 'Standard':
+      raise PDFEncryptionError('unknown filter: param=%r' % param)
+    V = int_value(param.get('V', 0))
+    if not (V == 1 or V == 2):
+      raise PDFEncryptionError('unknown algorithm: param=%r' % param)
+    length = int_value(param.get('Length', 40)) # Key length (bits)
+    O = str_value(param['O'])
+    R = int_value(param['R']) # Revision
+    if 5 <= R:
+      raise PDFEncryptionError('unknown revision: %r' % R)
+    U = str_value(param['U'])
+    P = int_value(param['P'])
+    # Permission bits in P (not enforced here):
+    # 4 = printable, 8 = modifiable, 16 = extractable.
+    # Algorithm 3.2
+    password = (password+PASSWORD_PADDING)[:32] # 1
+    hash = md5.md5(password) # 2
+    hash.update(O) # 3
+    # P is a signed value and is usually negative; mask it so that it
+    # is packed as the unsigned 4-byte little-endian integer.
+    hash.update(struct.pack('<L', P & 0xffffffff)) # 4
+    hash.update(docid[0]) # 5
+    if 4 <= R:
+      raise NotImplementedError # 6
+    if 3 <= R:
+      # 8
+      for _ in xrange(50):
+        hash = md5.md5(hash.digest()[:length//8])
+    key = hash.digest()[:length//8]
+    if R == 2:
+      # Algorithm 3.4: the padding string itself is encrypted,
+      # not the padded password.
+      u1 = Arcfour(key).process(PASSWORD_PADDING)
+    elif R == 3:
+      # Algorithm 3.5
+      hash = md5.md5(PASSWORD_PADDING) # 2
+      hash.update(docid[0]) # 3
+      x = Arcfour(key).process(hash.digest()[:16]) # 4
+      for i in xrange(1,19+1):
+        # key is a string, so each byte needs ord() before the XOR.
+        k = ''.join( chr(ord(c) ^ i) for c in key )
+        x = Arcfour(k).process(x)
+      u1 = x+x # 32bytes total
+    else:
+      raise PDFEncryptionError('unknown revision: %r' % R)
+    if R == 2:
+      is_authenticated = (u1 == U)
+    else:
+      # Only the first 16 bytes are compared for revision 3.
+      is_authenticated = (u1[:16] == U[:16])
+    if not is_authenticated:
+      raise PDFPasswordIncorrect
+    self.decipher = (lambda objid,genno,data: decrypt_rc4(key, objid, genno, data))
+    return

  def getobj(self, objid):
    #assert self.xrefs
    if objid in self.objs:
+      genno = 0
      obj = self.objs[objid]
    else:
      for xref in self.xrefs:
@ -400,18 +479,26 @@ class PDFDocument:
          except PSEOF:
            pass
          self.parsed_objs[stream] = objs
+        genno = 0
        obj = objs[stream.dic['N']*2+index]
+        if isinstance(obj, PDFStream):
+          obj.set_objid(objid, 0)
      else:
        self.parser.seek(index)
        (_,objid1) = self.parser.nextobject() # objid
-        (_,genno1) = self.parser.nextobject() # genno
+        (_,genno) = self.parser.nextobject() # genno
+        assert objid1 == objid
        (_,kwd) = self.parser.nextobject()
        if kwd != KEYWORD_OBJ:
          raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
        (_,obj) = self.parser.nextobject()
+        if isinstance(obj, PDFStream):
+          obj.set_objid(objid, genno)
      if 2 <= self.debug:
        print >>stderr, 'register: objid=%r: %r' % (objid, obj)
      self.objs[objid] = obj
+    if self.decipher:
+      obj = decipher_all(self.decipher, objid, genno, obj)
    return obj
  
  def get_pages(self, debug=0):
--- a/samples/dmca.pdf
+++ b/samples/dmca.pdf
--- a/samples/f1040nr.pdf
+++ b/samples/f1040nr.pdf
--- a/samples/i1040nr.pdf
+++ b/samples/i1040nr.pdf
--- a/samples/kampo.pdf
+++ b/samples/kampo.pdf
--- a/samples/nlp2004slides.pdf
+++ b/samples/nlp2004slides.pdf