tmp commit

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@70 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-02-23 14:00:38 +00:00 · 2009-02-23 14:00:38 +00:00 · 13a6603151
parent 2694de9521
commit 13a6603151
9 changed files with 267 additions and 138 deletions
--- a/pdflib/cmap.py
+++ b/pdflib/cmap.py
@ -2,8 +2,8 @@
 import sys
 stderr = sys.stderr
 from struct import pack, unpack
-from pdflib.utils import choplist, nunpack
-from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
+from utils import choplist, nunpack
+from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
     PSLiteral, PSKeyword, literal_name, keyword_name, \
     PSStackParser
 try:
--- a/pdflib/pdf2txt.py
+++ b/pdflib/pdf2txt.py
@ -2,11 +2,11 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
-from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdflib.pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
-from pdflib.pdffont import PDFUnicodeNotDefined
-from pdflib.cmap import CMapDB
+from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
+from pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
+from pdffont import PDFUnicodeNotDefined
+from cmap import CMapDB


 def enc(x, codec):
@ -121,7 +121,7 @@ class TagExtractor(PDFDevice):
  def render_image(self, stream, size, matrix):
+    # No-op: the arguments are ignored and nothing is returned.
    return

-  def render_string(self, textstate, textmatrix, size, seq):
+  def render_string(self, textstate, textmatrix, seq):
    font = textstate.font
    text = ''
    for x in seq:
--- a/pdflib/pdfcolor.py
+++ b/pdflib/pdfcolor.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python
 import sys
 stderr = sys.stderr
-from pdflib.psparser import PSLiteralTable
+from psparser import PSLiteralTable


 ##  ColorSpace
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@ -2,8 +2,8 @@
 import sys
 stdout = sys.stdout
 stderr = sys.stderr
-from pdflib.pdffont import PDFUnicodeNotDefined
-from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
+from pdffont import PDFUnicodeNotDefined
+from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix


 ##  PDFDevice
--- a/pdflib/pdffont.py
+++ b/pdflib/pdffont.py
@ -6,13 +6,175 @@ try:
  from cStringIO import StringIO
 except ImportError:
  from StringIO import StringIO
-from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
+from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
     literal_name, keyword_name, STRICT
-from pdflib.pdftypes import PDFException, \
+from pdftypes import PDFException, \
     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
-from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
-from utils import apply_matrix_norm
+from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
+from utils import apply_matrix_norm, nunpack
+
+
+NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')
+def getnum(fp):
+  # Read one encoded number from the file-like object fp and return it as an
+  # int or a float.  The first byte (b0) selects the encoding.
+  # NOTE(review): the encodings look like the DICT operand formats of the CFF
+  # specification cited in the CFFFont header comment -- confirm.
+  b0 = ord(fp.read(1))
+  if b0 == 30:
+    # Real number: each following byte holds two 4-bit codes that index
+    # NIBBLES.  A code of 15 ends the number; the byte in which it appears is
+    # still processed to its end.
+    s = ''
+    loop = True
+    while loop:
+      b = ord(fp.read(1))
+      for n in (b >> 4, b & 15):
+        if n == 15:
+          loop = False
+        else:
+          s += NIBBLES[n]
+    return float(s)
+  # Single-byte integer.
+  if 32 <= b0 and b0 <= 246:
+    return b0-139
+  b1 = ord(fp.read(1))
+  # Two-byte positive integer.
+  if 247 <= b0 and b0 <= 250:
+    return ((b0-247)<<8)+b1+108
+  # Two-byte negative integer.
+  if 251 <= b0 and b0 <= 254:
+    return -((b0-251)<<8)-b1-108
+  b2 = ord(fp.read(1))
+  # Treat b1 as the signed high byte of the multi-byte forms below.
+  if 128 <= b1: b1 -= 256
+  if b0 == 28:
+    # Three-byte form: signed 16-bit big-endian value.
+    return b1<<8 | b2
+  # Every remaining b0 value falls through to here and is decoded as a signed
+  # 32-bit big-endian value (b1, b2 and two more bytes).
+  return b1<<24 | b2<<16 | unpack('>H',fp.read(2))[0]
+#assert getnum(StringIO('\x8b')) == 0
+#assert getnum(StringIO('\xef')) == 100
+#assert getnum(StringIO('\x27')) == -100
+#assert getnum(StringIO('\xfa\x7c')) == 1000
+#assert getnum(StringIO('\xfe\x7c')) == -1000
+#assert getnum(StringIO('\x1c\x27\x10')) == 10000
+#assert getnum(StringIO('\x1c\xd8\xf0')) == -10000
+#assert getnum(StringIO('\x1d\x00\x01\x86\xa0')) == 100000
+#assert getnum(StringIO('\x1d\xff\xfe\x79\x60')) == -100000
+#assert getnum(StringIO('\x1e\xe2\xa2\x5f')) == -2.25
+#assert getnum(StringIO('\x1e\x0a\x14\x05\x41\xc3\xff')) == 0.140541e-3
+
+
+##  CFFFont
+##  (Format specified in Adobe Technical Note: #5176
+##   "The Compact Font Format Specification")
+##
+class CFFFont(object):
+  """Reader for the leading structures of a CFF font program.
+
+  The constructor parses the header and the four INDEX structures that
+  follow it (Name, Top DICT, String, Global Subr), leaving fp positioned
+  after the last one.  Encodings and charsets are not parsed yet.
+  """
+
+  class INDEX(object):
+    """One CFF INDEX: an object count, an offset array and the object data.
+
+    Only the count and the offset array are read eagerly; the data of an
+    item is read from fp on demand by __getitem__, which moves the file
+    position.
+    """
+
+    def __init__(self, fp):
+      """Parse the INDEX starting at the current position of fp.
+
+      On return fp is positioned just past the end of the INDEX.
+      """
+      self.fp = fp
+      (count,) = unpack('>H', self.fp.read(2))
+      if count == 0:
+        # An empty INDEX is the 2-byte count alone: it has no offSize byte
+        # and no offset array, so no further bytes may be consumed here.
+        # A single sentinel offset of 1 keeps __len__ (== 0) and the seek
+        # below consistent with the non-empty case.
+        self.offsets = [1]
+      else:
+        offsize = ord(self.fp.read(1))
+        self.offsets = [ nunpack(self.fp.read(offsize))
+                         for _ in xrange(count+1) ]
+      # base is the byte just before the object data, so stored offsets
+      # can be added to it directly.
+      self.base = self.fp.tell()-1
+      # The last offset marks the end of the data: skip over all of it.
+      self.fp.seek(self.base+self.offsets[-1])
+      return
+
+    def __repr__(self):
+      return '<INDEX: size=%d>' % len(self)
+
+    def __len__(self):
+      # count+1 offsets are stored for count objects.
+      return len(self.offsets)-1
+
+    def __getitem__(self, i):
+      """Return the raw bytes of the i-th object (raises IndexError past the end)."""
+      self.fp.seek(self.base+self.offsets[i])
+      return self.fp.read(self.offsets[i+1]-self.offsets[i])
+
+  def __init__(self, name, fp):
+    """Parse the header and the four leading INDEX structures from fp."""
+    self.name = name
+    self.fp = fp
+    # Header
+    (_major,_minor,hdrsize,self.offsize) = unpack('BBBB', fp.read(4))
+    # Skip any header bytes beyond the four fixed fields.
+    self.fp.read(hdrsize-4)
+    # Name INDEX
+    self.name_index = self.INDEX(self.fp)
+    # Top DICT INDEX
+    self.dict_index = self.INDEX(self.fp)
+    # String INDEX
+    self.string_index = self.INDEX(self.fp)
+    # Global Subr INDEX
+    self.subr_index = self.INDEX(self.fp)
+    # Encodings
+    # Charsets
+    return
+
+  
+  
+##  TrueTypeFont
+##
+class TrueTypeFont(object):
+  """Reads the table directory of a TrueType font file and can build a
+  CMap from its 'cmap' table.
+  """
+
+  class CMapNotFound(Exception): pass
+
+  def __init__(self, name, fp):
+    """Read the table directory from fp into self.tables (tag -> (offset, length))."""
+    self.name = name
+    self.fp = fp
+    self.tables = {}
+    # The leading 4-byte font type tag is read and ignored.
+    fp.read(4)
+    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+    for _ in xrange(ntables):
+      # Directory entry: 4-byte tag, checksum (unused), offset, length.
+      (tag, _tsum, offset, length) = unpack('>4sLLL', fp.read(16))
+      self.tables[tag] = (offset, length)
+    return
+
+  def create_cmap(self):
+    """Build a CMap from the font's 'cmap' table.
+
+    Character-to-glyph mappings are collected from every subtable of
+    format 0, 2 or 4; other formats are ignored.  Raises
+    TrueTypeFont.CMapNotFound when the font has no 'cmap' table.
+    Returns whatever CMap().update(char2gid, gid2char) returns.
+    """
+    if 'cmap' not in self.tables:
+      raise TrueTypeFont.CMapNotFound
+    (base_offset, length) = self.tables['cmap']
+    fp = self.fp
+    fp.seek(base_offset)
+    (version, nsubtables) = unpack('>HH', fp.read(4))
+    subtables = [ unpack('>HHL', fp.read(8)) for _ in xrange(nsubtables) ]
+    char2gid = {}
+    # Only supports subtable type 0, 2 and 4.
+    for (_1, _2, st_offset) in subtables:
+      fp.seek(base_offset+st_offset)
+      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
+      if fmttype == 0:
+        # Byte encoding table: one glyph id per character code 0..255.
+        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
+      elif fmttype == 2:
+        # High-byte mapping: each key is a sub-header index times 8.
+        subheaderkeys = unpack('>256H', fp.read(512))
+        firstbytes = [0]*8192
+        for (i,k) in enumerate(subheaderkeys):
+          firstbytes[k//8] = i
+        nhdrs = max(subheaderkeys)//8 + 1
+        hdrs = []
+        for i in xrange(nhdrs):
+          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
+          # offset is counted from its own field, i.e. the last 2 bytes
+          # of the sub-header that was just read.
+          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
+        for (i,firstcode,entcount,delta,pos) in hdrs:
+          if not entcount: continue
+          first = firstcode + (firstbytes[i] << 8)
+          fp.seek(pos)
+          for c in xrange(entcount):
+            # unpack() returns a 1-tuple; take the integer out of it so
+            # that the delta arithmetic below works on a number.
+            (gid,) = unpack('>H', fp.read(2))
+            if gid:
+              # Delta arithmetic wraps at 16 bits, as in the format 4 branch.
+              gid = (gid + delta) & 0xffff
+            char2gid[first+c] = gid
+      elif fmttype == 4:
+        # Segment mapping: parallel arrays of end codes, start codes,
+        # deltas and range offsets, one entry per segment.
+        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
+        segcount //= 2
+        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        fp.read(2)  # skip the 2 bytes between the end-code and start-code arrays
+        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
+        pos = fp.tell()
+        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
+        for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
+          if idr:
+            # NOTE(review): every segment seeks relative to the start of
+            # the range-offset array (pos); the OpenType spec measures the
+            # offset from that segment's own entry -- confirm.
+            fp.seek(pos+idr)
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
+          else:
+            for c in xrange(sc, ec+1):
+              char2gid[c] = (c + idd) & 0xffff
+    # Reverse mapping: glyph id -> character code as a 2-byte big-endian string.
+    gid2char = dict( (gid, pack('>H', char))
+                     for (char,gid) in char2gid.iteritems() )
+    return CMap().update(char2gid, gid2char)


 ##  Fonts
@ -96,17 +258,15 @@ class PDFSimpleFont(PDFFont):
    return

  def to_unicode(self, cid):
+    # Map a character id to a unicode string.  The ucs2 cmap (when present)
+    # is tried first; if it is absent or yields no code for cid, the font's
+    # encoding table is used instead.  Raises PDFUnicodeNotDefined when the
+    # encoding table has no entry for cid.
-    if not self.ucs2_cmap:
+    if self.ucs2_cmap:
+      code = self.ucs2_cmap.tocode(cid)
+      if code:
+        # code is unpacked as a sequence of big-endian 16-bit units.
+        chars = unpack('>%dH' % (len(code)/2), code)
+        return ''.join( unichr(c) for c in chars )
    try:
      return self.encoding[cid]
    except KeyError:
      raise PDFUnicodeNotDefined(None, cid)
-    code = self.ucs2_cmap.tocode(cid)
-    if not code:
-      raise PDFUnicodeNotDefined(None, cid)
-    chars = unpack('>%dH' % (len(code)/2), code)
-    return ''.join( unichr(c) for c in chars )
-

 # PDFType1Font
 class PDFType1Font(PDFSimpleFont):
@ -171,81 +331,6 @@ class PDFType3Font(PDFSimpleFont):


 # PDFCIDFont
-
-##  TrueTypeFont
-##
-class TrueTypeFont(object):
-
-  class CMapNotFound(Exception): pass
-  
-  def __init__(self, name, fp):
-    self.name = name
-    self.fp = fp
-    self.tables = {}
-    fonttype = fp.read(4)
-    (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
-    for i in xrange(ntables):
-      (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
-      self.tables[name] = (offset, length)
-    return
-
-  def create_cmap(self):
-    if 'cmap' not in self.tables:
-      raise TrueTypeFont.CMapNotFound
-    (base_offset, length) = self.tables['cmap']
-    fp = self.fp
-    fp.seek(base_offset)
-    (version, nsubtables) = unpack('>HH', fp.read(4))
-    subtables = []
-    for i in xrange(nsubtables):
-      subtables.append(unpack('>HHL', fp.read(8)))
-    char2gid = {}
-    # Only supports subtable type 0, 2 and 4.
-    for (_1, _2, st_offset) in subtables:
-      fp.seek(base_offset+st_offset)
-      (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
-      if fmttype == 0:
-        char2gid.update(enumerate(unpack('>256B', fp.read(256))))
-      elif fmttype == 2:
-        subheaderkeys = unpack('>256H', fp.read(512))
-        firstbytes = [0]*8192
-        for (i,k) in enumerate(subheaderkeys):
-          firstbytes[k/8] = i
-        nhdrs = max(subheaderkeys)/8 + 1
-        hdrs = []
-        for i in xrange(nhdrs):
-          (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
-          hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
-        for (i,firstcode,entcount,delta,pos) in hdrs:
-          if not entcount: continue
-          first = firstcode + (firstbytes[i] << 8)
-          fp.seek(pos)
-          for c in xrange(entcount):
-            gid = unpack('>H', fp.read(2))
-            if gid:
-              gid += delta
-            char2gid[first+c] = gid
-      elif fmttype == 4:
-        (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8))
-        segcount /= 2
-        ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        fp.read(2)
-        scs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        idds = unpack('>%dh' % segcount, fp.read(2*segcount))
-        pos = fp.tell()
-        idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
-        for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs):
-          if idr:
-            fp.seek(pos+idr)
-            for c in xrange(sc, ec+1):
-              char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff
-          else:
-            for c in xrange(sc, ec+1):
-              char2gid[c] = (c + idd) & 0xffff
-    gid2char = dict( (gid, pack('>H', char))
-                     for (char,gid) in char2gid.iteritems() )
-    return CMap().update(char2gid, gid2char)
-
 class PDFCIDFont(PDFFont):
  
  def __init__(self, rsrc, spec):
@ -358,3 +443,13 @@ class PDFCIDFont(PDFFont):

  def space_width(self):
+    # Always reports a width of 0.
    return 0
+
+
+# main
+def main(argv):
+  # Debug entry point: parse every file named on the command line as a CFF
+  # font.  Returns None, so sys.exit() reports success when no parse raises.
+  for fname in argv[1:]:
+    fp = file(fname, 'rb')
+    try:
+      CFFFont(fname, fp)
+    finally:
+      # Close the file even when parsing raises.
+      fp.close()
+  return
+if __name__ == '__main__': sys.exit(main(sys.argv))
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -6,17 +6,17 @@ try:
  from cStringIO import StringIO
 except ImportError:
  from StringIO import StringIO
-from pdflib.psparser import PSException, PSTypeError, PSEOF, \
+from psparser import PSException, PSTypeError, PSEOF, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
     PSStackParser, PSKeyword, STRICT
-from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
+from pdftypes import PDFException, PDFStream, PDFObjRef, \
     resolve1, int_value, float_value, num_value, \
     str_value, list_value, dict_value, stream_value
-from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
-from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
-from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
+from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
+from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
+from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
     LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
-from pdflib.cmap import CMapDB
+from cmap import CMapDB


 ##  Exceptions
@ -391,27 +391,27 @@ class PDFPageInterpreter(object):
    return
  # setgray-stroking
  def do_G(self, gray):
-    self.do_CS(LITERAL_DEVICE_GRAY)
+    # NOTE(review): with the do_CS call commented out this method ignores
+    # its argument and does nothing -- confirm this is intended.
+    #self.do_CS(LITERAL_DEVICE_GRAY)
    return
  # setgray-non-stroking
  def do_g(self, gray):
-    self.do_cs(LITERAL_DEVICE_GRAY)
+    # NOTE(review): with the do_cs call commented out this method ignores
+    # its argument and does nothing -- confirm this is intended.
+    #self.do_cs(LITERAL_DEVICE_GRAY)
    return
  # setrgb-stroking
  def do_RG(self, r, g, b):
-    self.do_CS(LITERAL_DEVICE_RGB)
+    # NOTE(review): with the do_CS call commented out this method ignores
+    # its arguments and does nothing -- confirm this is intended.
+    #self.do_CS(LITERAL_DEVICE_RGB)
    return
  # setrgb-non-stroking
  def do_rg(self, r, g, b):
-    self.do_cs(LITERAL_DEVICE_RGB)
+    # NOTE(review): with the do_cs call commented out this method ignores
+    # its arguments and does nothing -- confirm this is intended.
+    #self.do_cs(LITERAL_DEVICE_RGB)
    return
  # setcmyk-stroking
  def do_K(self, c, m, y, k):
-    self.do_CS(LITERAL_DEVICE_CMYK)
+    # NOTE(review): with the do_CS call commented out this method ignores
+    # its arguments and does nothing -- confirm this is intended.
+    #self.do_CS(LITERAL_DEVICE_CMYK)
    return
  # setcmyk-non-stroking
  def do_k(self, c, m, y, k):
-    self.do_cs(LITERAL_DEVICE_CMYK)
+    # NOTE(review): with the do_cs call commented out this method ignores
+    # its arguments and does nothing -- confirm this is intended.
+    #self.do_cs(LITERAL_DEVICE_CMYK)
    return

  # setcolor
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -7,12 +7,12 @@
 import sys, re
 import md5, struct
 stderr = sys.stderr
-from pdflib.utils import choplist, nunpack, decode_text
-from pdflib.arcfour import Arcfour
-from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
+from utils import choplist, nunpack, decode_text
+from arcfour import Arcfour
+from psparser import PSStackParser, PSSyntaxError, PSEOF, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
     STRICT
-from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
+from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
     PDFStream, PDFObjRef, resolve1, decipher_all, \
     int_value, float_value, num_value, str_value, list_value, dict_value, stream_value

@ -34,23 +34,50 @@ LITERAL_CATALOG = PSLiteralTable.intern('Catalog')

 ##  XRefs
 ##
+class XRefObjRange(object):
+  # A contiguous run of object ids: nobjs ids beginning at start.
+  def __init__(self, start, nobjs):
+    self.start = start
+    self.nobjs = nobjs
+    return
+
+  def __repr__(self):
+    return '<XRefObjRange: %d-%d>' % (self.get_start_id(), self.get_end_id())
+
+  def get_start_id(self):
+    # First object id in the range.
+    return self.start
+
+  def get_end_id(self):
+    # Last object id in the range (inclusive).
+    return self.start + self.nobjs - 1
+
+  def get_nobjs(self):
+    # Number of object ids in the range.
+    return self.nobjs
+
+class PDFBaseXRef(object):
+  # Shared base of the xref classes: holds the object-id ranges and
+  # provides iteration over every object id they cover.
+  def __init__(self):
+    # Both attributes start as None; the subclasses' load() methods assign
+    # objid_ranges a list of XRefObjRange.  objid_list is not referenced
+    # anywhere else in the visible code.
+    self.objid_ranges = None
+    self.objid_list = None
+    return
+
+  def objids(self):
+    # Generator yielding each object id of each range, in range order.
+    # Iterating it while objid_ranges is still None raises TypeError.
+    for objid_range in self.objid_ranges:
+      for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1):
+        yield objid
+    return

 ##  PDFXRef
 ##
-class PDFXRef(object):
-
+class PDFXRef(PDFBaseXRef):
  def __init__(self):
+    # Initialise the shared range bookkeeping, then this class's own
+    # offsets attribute; load() replaces the None with a dict.
+    PDFBaseXRef.__init__(self)
    self.offsets = None
    return

  def __repr__(self):
+    # Reports the number of entries in offsets; offsets is None until load()
+    # runs, in which case len() raises TypeError.
    return '<PDFXRef: objs=%d>' % len(self.offsets)

-  def objids(self):
-    return self.offsets.iterkeys()
-
  def load(self, parser, debug=0):
    self.offsets = {}
+    self.objid_ranges = []
    while 1:
      try:
        (pos, line) = parser.nextline()
@ -68,6 +95,8 @@ class PDFXRef(object):
        (start, nobjs) = map(long, f)
      except ValueError:
        raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
+      self.newoffsets = {}
+      self.objid_ranges.append(XRefObjRange(start, nobjs))
      for objid in xrange(start, start+nobjs):
        try:
          (_, line) = parser.nextline()
@ -108,11 +137,10 @@ class PDFXRef(object):

 ##  PDFXRefStream
 ##
-class PDFXRefStream(object):
+class PDFXRefStream(PDFBaseXRef):

  def __init__(self):
-    self.objid_first = None
-    self.objid_last = None
+    PDFBaseXRef.__init__(self)
    self.data = None
    self.entlen = None
    self.fl1 = self.fl2 = self.fl3 = None
@ -121,9 +149,6 @@ class PDFXRefStream(object):
  def __repr__(self):
-    return '<PDFXRef: objid=%d-%d>' % (self.objid_first, self.objid_last)
+    # objid_first/objid_last are no longer set by __init__ or load(), so
+    # describe the object-id ranges instead.  objid_ranges is None until
+    # load() runs, hence the fallback to an empty tuple.
+    return '<PDFXRef: objid=%s>' % ', '.join(map(repr, self.objid_ranges or ()))

-  def objids(self):
-    return xrange(self.objid_first, self.objid_last+1)
-
  def load(self, parser, debug=0):
    (_,objid) = parser.nexttoken() # ignored
    (_,genno) = parser.nexttoken() # ignored
@ -132,22 +157,31 @@ class PDFXRefStream(object):
    if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
      raise PDFNoValidXRef('Invalid PDF stream spec.')
    size = stream.dic['Size']
-    (start, nobjs) = stream.dic.get('Index', (0,size))
-    self.objid_first = start
-    self.objid_last = start+nobjs-1
+    index_array = stream.dic.get('Index', (0,size))
+    if len(index_array) % 2 != 0:
+      raise PDFSyntaxError('Invalid index number')
+    self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ]
    (self.fl1, self.fl2, self.fl3) = stream.dic['W']
    self.data = stream.get_data()
    self.entlen = self.fl1+self.fl2+self.fl3
    self.trailer = stream.dic
    if debug:
-      print >>stderr, ('xref stream: objid=%d-%d, fields=%d,%d,%d' %
-                       (self.objid_first, self.objid_last, self.fl1, self.fl2, self.fl3))
+      # join() takes only the iterable of range reprs; the three field
+      # widths are separate arguments of the format tuple.
+      print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
+                       (', '.join(map(repr, self.objid_ranges)), self.fl1, self.fl2, self.fl3))
    return

  def getpos(self, objid):
-    if objid < self.objid_first or self.objid_last < objid:
-      raise KeyError(objid)
-    i = self.entlen * (objid-self.objid_first)
+    offset = 0
+    found = False
+    for objid_range in self.objid_ranges:
+      if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id():
+        offset += objid - objid_range.get_start_id()
+        found = True
+        break
+      else:
+        offset += objid_range.get_nobjs()
+    if not found: raise KeyError(objid)
+    i = self.entlen * offset
    ent = self.data[i:i+self.entlen]
    f1 = nunpack(ent[:self.fl1], 1)
    if f1 == 1:
--- a/pdflib/pdftypes.py
+++ b/pdflib/pdftypes.py
@ -1,8 +1,8 @@
 #!/usr/bin/env python
 import sys, zlib
 stderr = sys.stderr
-from pdflib.lzw import LZWDecoder
-from pdflib.psparser import PSException, PSObject, \
+from lzw import LZWDecoder
+from psparser import PSException, PSObject, \
     PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
     literal_name, keyword_name, STRICT

--- a/pdflib/psparser.py
+++ b/pdflib/psparser.py
@ -2,7 +2,7 @@
 import sys, re
 stderr = sys.stderr

-from pdflib.utils import choplist
+from utils import choplist

 STRICT = 0