several bugfixes. 20090201 release.
git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@64 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
f8564fa45b
commit
af55d4675c
2
Makefile
2
Makefile
|
@ -1,7 +1,7 @@
|
|||
# Makefile for pdfminer
|
||||
|
||||
PACKAGE=pdfminer
|
||||
VERSION=20090117
|
||||
VERSION=20090201
|
||||
GNUTAR=tar
|
||||
SVN=svn
|
||||
PYTHON=python
|
||||
|
|
|
@ -14,7 +14,7 @@ Python PDF parser and analyzer
|
|||
|
||||
<div align=right class=lastmod>
|
||||
<!-- hhmts start -->
|
||||
Last Modified: Sun Jan 18 01:31:16 JST 2009
|
||||
Last Modified: Mon Feb 2 00:01:01 JST 2009
|
||||
<!-- hhmts end -->
|
||||
</div>
|
||||
|
||||
|
@ -53,8 +53,8 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
|
|||
<a name="source"></a>
|
||||
<p>
|
||||
<strong>Download (source):</strong><br>
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz
|
||||
<a href="http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz">
|
||||
http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090201.tar.gz
|
||||
</a>
|
||||
(1.8Mbytes)
|
||||
|
||||
|
@ -250,6 +250,7 @@ no stream header is displayed for the ease of saving it to a file.
|
|||
<hr noshade>
|
||||
<h2>Changes</h2>
|
||||
<ul>
|
||||
<li> 2009/02/01: Various bugfixes. Thanks to Hiroshi Manabe.
|
||||
<li> 2009/01/17: Handling a trailer correctly that contains both /XrefStm and /Prev entries.
|
||||
<li> 2009/01/10: Handling Type3 font metrics correctly.
|
||||
<li> 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.
|
||||
|
|
|
@ -98,19 +98,18 @@ class TextItem(object):
|
|||
w = 0
|
||||
dx = 0
|
||||
prev = ' '
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
for (char,cid,t) in text:
|
||||
if char:
|
||||
if prev != ' ' and spwidth < dx:
|
||||
self.text += ' '
|
||||
(_,char) = t
|
||||
self.text += char
|
||||
prev = char
|
||||
self.text += char
|
||||
dx = 0
|
||||
w += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||
w += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
else:
|
||||
t *= .001
|
||||
dx -= t
|
||||
w += t * fontsize * scaling
|
||||
w -= t * fontsize * scaling
|
||||
(_,descent) = apply_matrix_norm(self.matrix, (0,font.get_descent() * fontsize))
|
||||
ty += descent
|
||||
(w,h) = apply_matrix_norm(self.matrix, (w,size))
|
||||
|
@ -121,18 +120,16 @@ class TextItem(object):
|
|||
self.direction = 2
|
||||
disp = 0
|
||||
h = 0
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
(disp,char) = t
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||
self.text += char
|
||||
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||
break
|
||||
for t in text:
|
||||
if isinstance(t, tuple):
|
||||
(_,char) = t
|
||||
self.text += char
|
||||
h += (font.char_width(ord(char)) * fontsize + charspace) * scaling
|
||||
for (char,cid,disp) in text:
|
||||
if not char: continue
|
||||
(_,disp) = apply_matrix_norm(self.matrix, (0, (1000-disp)*fontsize*.001))
|
||||
self.text += font.to_unicode(cid)
|
||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
break
|
||||
for (char,cid,_) in text:
|
||||
if not char: continue
|
||||
self.text += font.to_unicode(cid)
|
||||
h += (font.char_width(cid) * fontsize + charspace) * scaling
|
||||
(w,h) = apply_matrix_norm(self.matrix, (size,h))
|
||||
tx -= w/2
|
||||
ty += disp
|
||||
|
@ -189,18 +186,16 @@ class PDFPageAggregator(PDFDevice):
|
|||
textmatrix = mult_matrix(textmatrix, self.ctm)
|
||||
for x in seq:
|
||||
if isinstance(x, int) or isinstance(x, float):
|
||||
text.append(x)
|
||||
text.append((None, None, x))
|
||||
else:
|
||||
chars = font.decode(x)
|
||||
for cid in chars:
|
||||
try:
|
||||
char = font.to_unicode(cid)
|
||||
text.append((font.char_disp(cid), char))
|
||||
except PDFUnicodeNotDefined, e:
|
||||
(cidcoding, cid) = e.args
|
||||
unc = self.handle_undefined_char(cidcoding, cid)
|
||||
if unc:
|
||||
text.append(unc)
|
||||
char = self.handle_undefined_char(cidcoding, cid)
|
||||
text.append((char, cid, font.char_disp(cid)))
|
||||
if cid == 32 and not font.is_multibyte():
|
||||
if text:
|
||||
item = TextItem(textmatrix, font, textstate.fontsize, textstate.charspace, textstate.scaling, text)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env python
|
||||
import sys
|
||||
import sys, re
|
||||
stderr = sys.stderr
|
||||
from struct import pack, unpack
|
||||
try:
|
||||
|
@ -143,17 +143,19 @@ class PDFContentParser(PSStackParser):
|
|||
self.charpos = 0
|
||||
return
|
||||
|
||||
def get_inline_data(self, pos, target='EI '):
|
||||
def get_inline_data(self, pos, target='EI'):
|
||||
self.seek(pos)
|
||||
i = 0
|
||||
data = ''
|
||||
while i < len(target):
|
||||
while i <= len(target):
|
||||
self.fillbuf()
|
||||
if i:
|
||||
c = self.buf[self.charpos]
|
||||
data += c
|
||||
self.charpos += 1
|
||||
if c == target[i]:
|
||||
if i >= len(target) and c.isspace():
|
||||
i += 1
|
||||
elif c == target[i]:
|
||||
i += 1
|
||||
else:
|
||||
i = 0
|
||||
|
@ -161,13 +163,14 @@ class PDFContentParser(PSStackParser):
|
|||
try:
|
||||
j = self.buf.index(target[0], self.charpos)
|
||||
#print 'found', (0, self.buf[j:j+10])
|
||||
data += self.buf[self.charpos:j]
|
||||
data += self.buf[self.charpos:j+1]
|
||||
self.charpos = j+1
|
||||
i = 1
|
||||
except ValueError:
|
||||
data += self.buf[self.charpos:]
|
||||
self.charpos = len(self.buf)
|
||||
data = data[:-len(target)] # strip the last part
|
||||
data = data[:-(len(target)+1)] # strip the last part
|
||||
data = re.sub(r'(\x0d\x0a|[\x0d\x0a])', '', data)
|
||||
return (pos, data)
|
||||
|
||||
def flush(self):
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
import sys, re
|
||||
import md5, struct
|
||||
stderr = sys.stderr
|
||||
from pdflib.utils import choplist, nunpack
|
||||
from pdflib.utils import choplist, nunpack, decode_text
|
||||
from pdflib.arcfour import Arcfour
|
||||
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
|
||||
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
|
||||
|
@ -430,7 +430,7 @@ class PDFDocument(object):
|
|||
entry = dict_value(entry)
|
||||
if 'Title' in entry:
|
||||
if 'A' in entry or 'Dest' in entry:
|
||||
title = unicode(str_value(entry['Title']), 'utf-8', 'ignore')
|
||||
title = decode_text(str_value(entry['Title']))
|
||||
dest = entry.get('Dest')
|
||||
action = entry.get('A')
|
||||
se = entry.get('SE')
|
||||
|
|
|
@ -51,3 +51,43 @@ def nunpack(s, default=0):
|
|||
return unpack('>L', s)[0]
|
||||
else:
|
||||
return TypeError('invalid length: %d' % l)
|
||||
|
||||
PDFDocEncoding = ''.join( unichr(x) for x in (
|
||||
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
|
||||
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
|
||||
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0017, 0x0017,
|
||||
0x02d8, 0x02c7, 0x02c6, 0x02d9, 0x02dd, 0x02db, 0x02da, 0x02dc,
|
||||
0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
|
||||
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
|
||||
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
|
||||
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
|
||||
0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
|
||||
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
|
||||
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
|
||||
0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
|
||||
0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
|
||||
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
|
||||
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
|
||||
0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x0000,
|
||||
0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044,
|
||||
0x2039, 0x203a, 0x2212, 0x2030, 0x201e, 0x201c, 0x201d, 0x2018,
|
||||
0x2019, 0x201a, 0x2122, 0xfb01, 0xfb02, 0x0141, 0x0152, 0x0160,
|
||||
0x0178, 0x017d, 0x0131, 0x0142, 0x0153, 0x0161, 0x017e, 0x0000,
|
||||
0x20ac, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
|
||||
0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x0000, 0x00ae, 0x00af,
|
||||
0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
|
||||
0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
|
||||
0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
|
||||
0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
|
||||
0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
|
||||
0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
|
||||
0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
|
||||
0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
|
||||
0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
|
||||
0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
|
||||
))
|
||||
def decode_text(s):
|
||||
if s.startswith('\xfe\xff'):
|
||||
return unicode(s[2:], 'utf-16be', 'ignore')
|
||||
else:
|
||||
return ''.join( PDFDocEncoding[ord(c)] for c in s )
|
||||
|
|
|
@ -38,7 +38,7 @@ BT
|
|||
/F1 24 Tf
|
||||
100 600 Td
|
||||
0 Tw
|
||||
( Hello World ) Tj
|
||||
[ ( Hello ) 1000 ( World ) ] TJ
|
||||
0 100 Td
|
||||
100 Tw
|
||||
( Hello World ) Tj
|
||||
|
|
Loading…
Reference in New Issue