several bugfixes. 20090201 release.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@64 1aa58f4a-7d42-0410-adbc-911cccaed67c
2009-02-01 15:01:32 +00:00 · 2009-02-01 15:01:32 +00:00 · af55d4675c
parent f8564fa45b
commit af55d4675c
7 changed files with 75 additions and 36 deletions
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 # Makefile for pdfminer

 PACKAGE=pdfminer
-VERSION=20090117
+VERSION=20090201
 GNUTAR=tar
 SVN=svn
 PYTHON=python
--- a/README.html
+++ b/README.html
@ -14,7 +14,7 @@ Python PDF parser and analyzer

 <div align=right class=lastmod>
 <!-- hhmts start -->
-Last Modified: Sun Jan 18 01:31:16 JST 2009
+Last Modified: Mon Feb  2 00:01:01 JST 2009
 <!-- hhmts end -->
 </div>

@ -53,8 +53,8 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
 <a name="source"></a>
 <p>
 <strong>Download (source):</strong><br>
-<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz">
-http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz
+<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz">
+http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz
 </a>
 (1.8Mbytes)

@ -250,6 +250,7 @@ no stream header is displayed for the ease of saving it to a file.
 <hr noshade>
 <h2>Changes</h2>
 <ul>
+<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
 <li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
 <li> 2009/01/10: Handling Type3 font metrics correctly.
 <li> 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@ -98,19 +98,18 @@ class TextItem(object):
      w = 0
      dx = 0
      prev = ' '
-      for t in text:
-        if isinstance(t, tuple):
+      for (char,cid,t) in text:
+        if char:
          if prev != ' ' and spwidth < dx:
            self.text += ' '
-          (_,char) = t
-          self.text += char
          prev = char
+          self.text += char
          dx = 0
-          w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
+          w += (font.char_width(cid) * fontsize + charspace) * scaling
        else:
          t *= .001
          dx -= t
-          w += t * fontsize * scaling
+          w -= t * fontsize * scaling
      (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
      ty += descent
      (w,h) = apply_matrix_norm(self.matrix, (w,size))
@ -121,18 +120,16 @@ class TextItem(object):
      self.direction = 2
      disp = 0
      h = 0
-      for t in text:
-        if isinstance(t, tuple):
-          (disp,char) = t
-          (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
-          self.text += char
-          h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
-          break
-      for t in text:
-        if isinstance(t, tuple):
-          (_,char) = t
-          self.text += char
-          h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
+      for (char,cid,disp) in text:
+        if not char: continue
+        (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
+        self.text += font.to_unicode(cid)
+        h += (font.char_width(cid) * fontsize + charspace) * scaling
+        break
+      for (char,cid,_) in text:
+        if not char: continue
+        self.text += font.to_unicode(cid)
+        h += (font.char_width(cid) * fontsize + charspace) * scaling
      (w,h) = apply_matrix_norm(self.matrix, (size,h))
      tx -= w/2
      ty += disp
@ -189,18 +186,16 @@ class PDFPageAggregator(PDFDevice):
    textmatrix = mult_matrix(textmatrix, self.ctm)
    for x in seq:
      if isinstance(x, int) or isinstance(x, float):
-        text.append(x)
+        text.append((None, None, x))
      else:
        chars = font.decode(x)
        for cid in chars:
          try:
            char = font.to_unicode(cid)
-            text.append((font.char_disp(cid), char))
          except PDFUnicodeNotDefined, e:
            (cidcoding, cid) = e.args
-            unc = self.handle_undefined_char(cidcoding, cid)
-            if unc:
-              text.append(unc)
+            char = self.handle_undefined_char(cidcoding, cid)
+          text.append((char, cid, font.char_disp(cid)))
          if cid == 32 and not font.is_multibyte():
            if text:
              item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python
-import sys
+import sys, re
 stderr = sys.stderr
 from struct import pack, unpack
 try:
@ -143,17 +143,19 @@ class PDFContentParser(PSStackParser):
    self.charpos = 0
    return

-  def get_inline_data(self, pos, target='EI '):
+  def get_inline_data(self, pos, target='EI'):
    self.seek(pos)
    i = 0
    data = ''
-    while i < len(target):
+    while i <= len(target):
      self.fillbuf()
      if i:
        c = self.buf[self.charpos]
        data += c
        self.charpos += 1
-        if c == target[i]:
+        if i >= len(target) and c.isspace():
+          i += 1
+        elif c == target[i]:
          i += 1
        else:
          i = 0
@ -161,13 +163,14 @@ class PDFContentParser(PSStackParser):
        try:
          j = self.buf.index(target[0], self.charpos)
          #print 'found', (0, self.buf[j:j+10])
-          data += self.buf[self.charpos:j]
+          data += self.buf[self.charpos:j+1]
          self.charpos = j+1
          i = 1
        except ValueError:
          data += self.buf[self.charpos:]
          self.charpos = len(self.buf)
-    data = data[:-len(target)] # strip the last part
+    data = data[:-(len(target)+1)] # strip the last part
+    data = re.sub(r'(\x0d\x0a|[\x0d\x0a])', '', data)
    return (pos, data)

  def flush(self):
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@ -7,7 +7,7 @@
 import sys, re
 import md5, struct
 stderr = sys.stderr
-from pdflib.utils import choplist, nunpack
+from pdflib.utils import choplist, nunpack, decode_text
 from pdflib.arcfour import Arcfour
 from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
     PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
@ -430,7 +430,7 @@ class PDFDocument(object):
      entry = dict_value(entry)
      if 'Title' in entry:
        if 'A' in entry or 'Dest' in entry:
-          title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
+          title = decode_text(str_value(entry['Title']))
          dest = entry.get('Dest')
          action = entry.get('A')
          se = entry.get('SE')
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@ -51,3 +51,43 @@ def nunpack(s, default=0):
    return unpack('>L', s)[0]
  else:
    return TypeError('invalid length: %d' % l)
+
+PDFDocEncoding = ''.join( unichr(x) for x in (  # byte value (index 0x00-0xff) -> Unicode character
+  0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+  0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
+  0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,
+  0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
+  0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+  0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
+  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+  0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
+  0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+  0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
+  0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+  0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
+  0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+  0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
+  0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
+  0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
+  0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
+  0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
+  0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
+  0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
+  0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
+  0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
+  0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+  0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+  0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
+  0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
+  0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
+  0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
+  0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
+  0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
+))
+def decode_text(s):  # decode the byte string s into a unicode string
+  if s.startswith('\xfe\xff'):  # a leading FE FF marks the remainder as UTF-16BE
+    return unicode(s[2:], 'utf-16be', 'ignore')  # 'ignore' drops undecodable bytes
+  else:
+    return ''.join( PDFDocEncoding[ord(c)] for c in s )  # otherwise map each byte through the table
--- a/samples/simple1.pdf
+++ b/samples/simple1.pdf
@ -38,7 +38,7 @@ BT
 /F1 24 Tf
 100 600 Td
 0 Tw
-( Hello World ) Tj
+[ ( Hello ) 1000 ( World ) ] TJ
 0 100 Td
 100 Tw
 ( Hello World ) Tj