diff --git a/Makefile b/Makefile
index 15af1ed..d418bf2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Makefile for pdfminer
PACKAGE=pdfminer
-VERSION=20090117
+VERSION=20090201
GNUTAR=tar
SVN=svn
PYTHON=python
diff --git a/README.html b/README.html
index 7c70e5f..1e85876 100644
--- a/README.html
+++ b/README.html
@@ -14,7 +14,7 @@ Python PDF parser and analyzer
-Last Modified: Sun Jan 18 01:31:16 JST 2009
+Last Modified: Mon Feb 2 00:01:01 JST 2009
@@ -53,8 +53,8 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
Download (source):
-
-http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz
+
+http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz
(1.8Mbytes)
@@ -250,6 +250,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+- 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
- 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
- 2009/01/10: Handling Type3 font metrics correctly.
- 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.
diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py
index 18954cc..2cc8f37 100644
--- a/pdflib/pdfdevice.py
+++ b/pdflib/pdfdevice.py
@@ -98,19 +98,18 @@ class TextItem(object):
w = 0
dx = 0
prev = ' '
- for t in text:
- if isinstance(t, tuple):
+ for (char,cid,t) in text:
+ if char:
if prev != ' ' and spwidth < dx:
self.text += ' '
- (_,char) = t
- self.text += char
prev = char
+ self.text += char
dx = 0
- w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
+ w += (font.char_width(cid) * fontsize + charspace) * scaling
else:
t *= .001
dx -= t
- w += t * fontsize * scaling
+ w -= t * fontsize * scaling
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent
(w,h) = apply_matrix_norm(self.matrix, (w,size))
@@ -121,18 +120,16 @@ class TextItem(object):
self.direction = 2
disp = 0
h = 0
- for t in text:
- if isinstance(t, tuple):
- (disp,char) = t
- (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
- self.text += char
- h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
- break
- for t in text:
- if isinstance(t, tuple):
- (_,char) = t
- self.text += char
- h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
+ for (char,cid,disp) in text:
+ if not char: continue
+ (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
+ self.text += font.to_unicode(cid)
+ h += (font.char_width(cid) * fontsize + charspace) * scaling
+ break
+ for (char,cid,_) in text:
+ if not char: continue
+ self.text += font.to_unicode(cid)
+ h += (font.char_width(cid) * fontsize + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (size,h))
tx -= w/2
ty += disp
@@ -189,18 +186,16 @@ class PDFPageAggregator(PDFDevice):
textmatrix = mult_matrix(textmatrix, self.ctm)
for x in seq:
if isinstance(x, int) or isinstance(x, float):
- text.append(x)
+ text.append((None, None, x))
else:
chars = font.decode(x)
for cid in chars:
try:
char = font.to_unicode(cid)
- text.append((font.char_disp(cid), char))
except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args
- unc = self.handle_undefined_char(cidcoding, cid)
- if unc:
- text.append(unc)
+ char = self.handle_undefined_char(cidcoding, cid)
+ text.append((char, cid, font.char_disp(cid)))
if cid == 32 and not font.is_multibyte():
if text:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py
index 90ca1f8..b165a69 100644
--- a/pdflib/pdfinterp.py
+++ b/pdflib/pdfinterp.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
-import sys
+import sys, re
stderr = sys.stderr
from struct import pack, unpack
try:
@@ -143,17 +143,19 @@ class PDFContentParser(PSStackParser):
self.charpos = 0
return
- def get_inline_data(self, pos, target='EI '):
+ def get_inline_data(self, pos, target='EI'):
self.seek(pos)
i = 0
data = ''
- while i < len(target):
+ while i <= len(target):
self.fillbuf()
if i:
c = self.buf[self.charpos]
data += c
self.charpos += 1
- if c == target[i]:
+ if i >= len(target) and c.isspace():
+ i += 1
+ elif c == target[i]:
i += 1
else:
i = 0
@@ -161,13 +163,14 @@ class PDFContentParser(PSStackParser):
try:
j = self.buf.index(target[0], self.charpos)
#print 'found', (0, self.buf[j:j+10])
- data += self.buf[self.charpos:j]
+ data += self.buf[self.charpos:j+1]
self.charpos = j+1
i = 1
except ValueError:
data += self.buf[self.charpos:]
self.charpos = len(self.buf)
- data = data[:-len(target)] # strip the last part
+ data = data[:-(len(target)+1)] # strip the last part
+ data = re.sub(r'(\x0d\x0a|[\x0d\x0a])', '', data)
return (pos, data)
def flush(self):
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py
index ab96e16..c2d5b0b 100755
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -7,7 +7,7 @@
import sys, re
import md5, struct
stderr = sys.stderr
-from pdflib.utils import choplist, nunpack
+from pdflib.utils import choplist, nunpack, decode_text
from pdflib.arcfour import Arcfour
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
@@ -430,7 +430,7 @@ class PDFDocument(object):
entry = dict_value(entry)
if 'Title' in entry:
if 'A' in entry or 'Dest' in entry:
- title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
+ title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest')
action = entry.get('A')
se = entry.get('SE')
diff --git a/pdflib/utils.py b/pdflib/utils.py
index 6d96a9d..e2849a7 100644
--- a/pdflib/utils.py
+++ b/pdflib/utils.py
@@ -51,3 +51,43 @@ def nunpack(s, default=0):
return unpack('>L', s)[0]
else:
return TypeError('invalid length: %d' % l)
+
+PDFDocEncoding = ''.join( unichr(x) for x in (  # 256-entry table: index = PDFDocEncoding byte value, item = Unicode code point
+ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+ 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
+ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,  # 0x16 is U+0016 (SYNCHRONOUS IDLE), distinct from 0x17
+ 0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,  # 0x18-0x1f: spacing diacritics, not control characters
+ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
+ 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
+ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
+ 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
+ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+ 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
+ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
+ 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
+ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
+ 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,  # 0x7f is mapped to U+0000 (placeholder entry)
+ 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
+ 0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
+ 0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
+ 0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,  # 0x9f is mapped to U+0000 (placeholder entry)
+ 0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
+ 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,  # 0xad is mapped to U+0000 (placeholder entry)
+ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
+ 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
+ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+ 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
+ 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
+ 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
+ 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
+ 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
+ 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
+))
+def decode_text(s):
+  """Decode a PDF text string: UTF-16BE (undecodable data ignored) when it starts with the bytes FE FF, otherwise byte-by-byte via PDFDocEncoding."""
+  if not s.startswith('\xfe\xff'):
+    return ''.join( PDFDocEncoding[ord(byte)] for byte in s )
+  return unicode(s[2:], 'utf-16be', 'ignore')
diff --git a/samples/simple1.pdf b/samples/simple1.pdf
index fd0e4db..a51a1cc 100644
--- a/samples/simple1.pdf
+++ b/samples/simple1.pdf
@@ -38,7 +38,7 @@ BT
/F1 24 Tf
100 600 Td
0 Tw
-( Hello World ) Tj
+[ ( Hello ) 1000 ( World ) ] TJ
0 100 Td
100 Tw
( Hello World ) Tj