several bugfixes. 20090201 release.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@64 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-02-01 15:01:32 +00:00
parent f8564fa45b
commit af55d4675c
7 changed files with 75 additions and 36 deletions

View File

@ -1,7 +1,7 @@
# Makefile for pdfminer # Makefile for pdfminer
PACKAGE=pdfminer PACKAGE=pdfminer
VERSION=20090117 VERSION=20090201
GNUTAR=tar GNUTAR=tar
SVN=svn SVN=svn
PYTHON=python PYTHON=python

View File

@ -14,7 +14,7 @@ Python PDF parser and analyzer
<div align=right class=lastmod> <div align=right class=lastmod>
<!-- hhmts start --> <!-- hhmts start -->
Last Modified: Sun Jan 18 01:31:16 JST 2009 Last Modified: Mon Feb 2 00:01:01 JST 2009
<!-- hhmts end --> <!-- hhmts end -->
</div> </div>
@ -53,8 +53,8 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
<a name="source"></a> <a name="source"></a>
<p> <p>
<strong>Download (source):</strong><br> <strong>Download (source):</strong><br>
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz"> <a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz">
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz
</a> </a>
(1.8Mbytes) (1.8Mbytes)
@ -250,6 +250,7 @@ no stream header is displayed for the ease of saving it to a file.
<hr noshade> <hr noshade>
<h2>Changes</h2> <h2>Changes</h2>
<ul> <ul>
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries. <li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
<li> 2009/01/10: Handling Type3 font metrics correctly. <li> 2009/01/10: Handling Type3 font metrics correctly.
<li> 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich. <li> 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.

View File

@ -98,19 +98,18 @@ class TextItem(object):
w = 0 w = 0
dx = 0 dx = 0
prev = ' ' prev = ' '
for t in text: for (char,cid,t) in text:
if isinstance(t, tuple): if char:
if prev != ' ' and spwidth < dx: if prev != ' ' and spwidth < dx:
self.text += ' ' self.text += ' '
(_,char) = t
self.text += char
prev = char prev = char
self.text += char
dx = 0 dx = 0
w += (font.char_width(ord(char)) * fontsize + charspace) * scaling w += (font.char_width(cid) * fontsize + charspace) * scaling
else: else:
t *= .001 t *= .001
dx -= t dx -= t
w += t * fontsize * scaling w -= t * fontsize * scaling
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize)) (_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
ty += descent ty += descent
(w,h) = apply_matrix_norm(self.matrix, (w,size)) (w,h) = apply_matrix_norm(self.matrix, (w,size))
@ -121,18 +120,16 @@ class TextItem(object):
self.direction = 2 self.direction = 2
disp = 0 disp = 0
h = 0 h = 0
for t in text: for (char,cid,disp) in text:
if isinstance(t, tuple): if not char: continue
(disp,char) = t
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001)) (_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
self.text += char self.text += font.to_unicode(cid)
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling h += (font.char_width(cid) * fontsize + charspace) * scaling
break break
for t in text: for (char,cid,_) in text:
if isinstance(t, tuple): if not char: continue
(_,char) = t self.text += font.to_unicode(cid)
self.text += char h += (font.char_width(cid) * fontsize + charspace) * scaling
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
(w,h) = apply_matrix_norm(self.matrix, (size,h)) (w,h) = apply_matrix_norm(self.matrix, (size,h))
tx -= w/2 tx -= w/2
ty += disp ty += disp
@ -189,18 +186,16 @@ class PDFPageAggregator(PDFDevice):
textmatrix = mult_matrix(textmatrix, self.ctm) textmatrix = mult_matrix(textmatrix, self.ctm)
for x in seq: for x in seq:
if isinstance(x, int) or isinstance(x, float): if isinstance(x, int) or isinstance(x, float):
text.append(x) text.append((None, None, x))
else: else:
chars = font.decode(x) chars = font.decode(x)
for cid in chars: for cid in chars:
try: try:
char = font.to_unicode(cid) char = font.to_unicode(cid)
text.append((font.char_disp(cid), char))
except PDFUnicodeNotDefined, e: except PDFUnicodeNotDefined, e:
(cidcoding, cid) = e.args (cidcoding, cid) = e.args
unc = self.handle_undefined_char(cidcoding, cid) char = self.handle_undefined_char(cidcoding, cid)
if unc: text.append((char, cid, font.char_disp(cid)))
text.append(unc)
if cid == 32 and not font.is_multibyte(): if cid == 32 and not font.is_multibyte():
if text: if text:
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text) item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys import sys, re
stderr = sys.stderr stderr = sys.stderr
from struct import pack, unpack from struct import pack, unpack
try: try:
@ -147,13 +147,15 @@ class PDFContentParser(PSStackParser):
self.seek(pos) self.seek(pos)
i = 0 i = 0
data = '' data = ''
while i < len(target): while i <= len(target):
self.fillbuf() self.fillbuf()
if i: if i:
c = self.buf[self.charpos] c = self.buf[self.charpos]
data += c data += c
self.charpos += 1 self.charpos += 1
if c == target[i]: if i >= len(target) and c.isspace():
i += 1
elif c == target[i]:
i += 1 i += 1
else: else:
i = 0 i = 0
@ -161,13 +163,14 @@ class PDFContentParser(PSStackParser):
try: try:
j = self.buf.index(target[0], self.charpos) j = self.buf.index(target[0], self.charpos)
#print 'found', (0, self.buf[j:j+10]) #print 'found', (0, self.buf[j:j+10])
data += self.buf[self.charpos:j] data += self.buf[self.charpos:j+1]
self.charpos = j+1 self.charpos = j+1
i = 1 i = 1
except ValueError: except ValueError:
data += self.buf[self.charpos:] data += self.buf[self.charpos:]
self.charpos = len(self.buf) self.charpos = len(self.buf)
data = data[:-len(target)] # strip the last part data = data[:-(len(target)+1)] # strip the last part
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])', '', data)
return (pos, data) return (pos, data)
def flush(self): def flush(self):

View File

@ -7,7 +7,7 @@
import sys, re import sys, re
import md5, struct import md5, struct
stderr = sys.stderr stderr = sys.stderr
from pdflib.utils import choplist, nunpack from pdflib.utils import choplist, nunpack, decode_text
from pdflib.arcfour import Arcfour from pdflib.arcfour import Arcfour
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \ from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
@ -430,7 +430,7 @@ class PDFDocument(object):
entry = dict_value(entry) entry = dict_value(entry)
if 'Title' in entry: if 'Title' in entry:
if 'A' in entry or 'Dest' in entry: if 'A' in entry or 'Dest' in entry:
title = unicode(str_value(entry['Title']), 'utf-8', 'ignore') title = decode_text(str_value(entry['Title']))
dest = entry.get('Dest') dest = entry.get('Dest')
action = entry.get('A') action = entry.get('A')
se = entry.get('SE') se = entry.get('SE')

View File

@ -51,3 +51,43 @@ def nunpack(s, default=0):
return unpack('>L', s)[0] return unpack('>L', s)[0]
else: else:
return TypeError('invalid length: %d' % l) return TypeError('invalid length: %d' % l)
# PDFDocEncoding: a 256-character unicode string used as a lookup table.
# Indexing it with a byte value (0-255) yields the character for that byte;
# decode_text() uses it as PDFDocEncoding[ord(c)].
# Each row below lists eight consecutive code points, starting at index 0x00.
# The entries at indices 0x7f, 0x9f and 0xad are 0x0000 here -- presumably
# codes with no assigned character; confirm against the PDF Reference.
# NOTE(review): index 0x16 holds 0x0017, the same value as index 0x17 --
# confirm against the PDFDocEncoding table in the PDF Reference whether
# 0x0016 was intended.
PDFDocEncoding = ''.join( unichr(x) for x in (
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
))
def decode_text(s):
    """Decode a PDF text string.

    A string that begins with the two bytes FE FF is decoded as UTF-16BE
    after those two bytes are dropped, ignoring undecodable sequences.
    Any other string is translated one byte at a time through the
    PDFDocEncoding lookup table.
    """
    if not s.startswith('\xfe\xff'):
        # No FE FF prefix: map every byte through the PDFDocEncoding table.
        return ''.join([PDFDocEncoding[ord(byte)] for byte in s])
    # Skip the two-byte prefix before handing the rest to the UTF-16BE codec.
    return unicode(s[2:], 'utf-16be', 'ignore')

View File

@ -38,7 +38,7 @@ BT
/F1 24 Tf /F1 24 Tf
100 600 Td 100 600 Td
0 Tw 0 Tw
( Hello World ) Tj [ ( Hello ) 1000 ( World ) ] TJ
0 100 Td 0 100 Td
100 Tw 100 Tw
( Hello World ) Tj ( Hello World ) Tj