diff --git a/Makefile b/Makefile
index 9b7c3f3..15af1ed 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Makefile for pdfminer
PACKAGE=pdfminer
-VERSION=20090110
+VERSION=20090117
GNUTAR=tar
SVN=svn
PYTHON=python
@@ -39,3 +39,8 @@ check:
commit: clean
$(SVN) commit
+
+WEBDIR=$$HOME/Site/unixuser.org/python/pdfminer
+publish: pack
+ cp $(WORKDIR)/$(DISTFILE) $(WEBDIR)
+ cp README.html $(WEBDIR)/index.html
diff --git a/README.html b/README.html
index ba8af44..7c70e5f 100644
--- a/README.html
+++ b/README.html
@@ -14,7 +14,7 @@ Python PDF parser and analyzer
-Last Modified: Sat Jan 10 20:18:36 JST 2009
+Last Modified: Sun Jan 18 01:31:16 JST 2009
@@ -53,8 +53,8 @@ http://www.unixuser.org/~euske/python/pdfminer/index.html
Download (source):
-
-http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090110.tar.gz
+
+http://www.unixuser.org/~euske/python/pdfminer/pdfminer-dist-20090117.tar.gz
(1.8Mbytes)
@@ -250,6 +250,7 @@ no stream header is displayed for the ease of saving it to a file.
Changes
+- 2009/01/17: Correctly handling a trailer that contains both /XRefStm and /Prev entries.
- 2009/01/10: Handling Type3 font metrics correctly.
- 2008/12/28: Better handling of word spacing. Thanks to Christian Nentwich.
- 2008/09/06: A sample pdf2html webapp added.
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py
index 85a0976..ab96e16 100755
--- a/pdflib/pdfparser.py
+++ b/pdflib/pdfparser.py
@@ -43,10 +43,14 @@ class PDFXRef(object):
self.offsets = None
return
+ def __repr__(self):
+ return '<PDFXRef: objs=%d>' % len(self.offsets)
+
def objids(self):
return self.offsets.iterkeys()
- def load(self, parser):
+ def load(self, parser, debug=0):
+ self.offsets = {}
while 1:
try:
(pos, line) = parser.nextline()
@@ -64,7 +68,6 @@ class PDFXRef(object):
(start, nobjs) = map(long, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
- self.offsets = {}
for objid in xrange(start, start+nobjs):
try:
(_, line) = parser.nextline()
@@ -74,7 +77,10 @@ class PDFXRef(object):
if len(f) != 3:
raise PDFNoValidXRef('Invalid XRef format: %r, line=%r' % (parser, line))
(pos, genno, use) = f
- self.offsets[objid] = (int(genno), long(pos), use)
+ if use != 'n': continue
+ self.offsets[objid] = (int(genno), long(pos))
+ if debug:
+ print >>stderr, 'xref objects:', self.offsets
self.load_trailer(parser)
return
@@ -94,12 +100,9 @@ class PDFXRef(object):
def getpos(self, objid):
try:
- (genno, pos, use) = self.offsets[objid]
+ (genno, pos) = self.offsets[objid]
except KeyError:
raise
- if use != 'n':
- if STRICT:
- raise PDFSyntaxError('Unused objid=%r' % objid)
return (None, pos)
@@ -108,17 +111,20 @@ class PDFXRef(object):
class PDFXRefStream(object):
def __init__(self):
- self.objid0 = None
- self.objid1 = None
+ self.objid_first = None
+ self.objid_last = None
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
return
- def objids(self):
- return xrange(self.objid0, self.objid1)
+ def __repr__(self):
+ return '<PDFXRefStream: objid=%d-%d>' % (self.objid_first, self.objid_last)
- def load(self, parser):
+ def objids(self):
+ return xrange(self.objid_first, self.objid_last+1)
+
+ def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken()
@@ -127,18 +133,21 @@ class PDFXRefStream(object):
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size))
- self.objid0 = start
- self.objid1 = start+nobjs
+ self.objid_first = start
+ self.objid_last = start+nobjs-1
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic
+ if debug:
+ print >>stderr, ('xref stream: objid=%d-%d, fields=%d,%d,%d' %
+ (self.objid_first, self.objid_last, self.fl1, self.fl2, self.fl3))
return
def getpos(self, objid):
- if objid < self.objid0 or self.objid1 <= objid:
+ if objid < self.objid_first or self.objid_last < objid:
raise KeyError(objid)
- i = self.entlen * (objid-self.objid0)
+ i = self.entlen * (objid-self.objid_first)
ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
if f1 == 1:
@@ -149,6 +158,8 @@ class PDFXRefStream(object):
objid = nunpack(ent[self.fl1:self.fl1+self.fl2])
index = nunpack(ent[self.fl1+self.fl2:])
return (objid, index)
+ # this is a free object
+ raise KeyError(objid)
## PDFPage
@@ -217,7 +228,7 @@ class PDFDocument(object):
self.ready = True
# Retrieve the information of each header that was appended
# (maybe multiple times) at the end of the document.
- self.xrefs = list(parser.read_xref())
+ self.xrefs = parser.read_xref()
for xref in self.xrefs:
trailer = xref.trailer
if not trailer: continue
@@ -340,7 +351,7 @@ class PDFDocument(object):
return None
if strmid:
stream = stream_value(self.getobj(strmid))
- if stream.dic['Type'] is not LITERAL_OBJSTM:
+ if stream.dic.get('Type') is not LITERAL_OBJSTM:
if STRICT:
raise PDFSyntaxError('Not a stream object: %r' % stream)
try:
@@ -362,7 +373,11 @@ class PDFDocument(object):
pass
self.parsed_objs[stream] = objs
genno = 0
- obj = objs[stream.dic['N']*2+index]
+ i = n*2+index
+ try:
+ obj = objs[i]
+ except IndexError:
+ raise PDFSyntaxError('Invalid object number: objid=%r' % (objid))
if isinstance(obj, PDFStream):
obj.set_objid(objid, 0)
else:
@@ -550,49 +565,50 @@ class PDFParser(PSStackParser):
raise PDFNoValidXRef('Unexpected EOF')
if 1 <= self.debug:
print >>stderr, 'xref found: pos=%r' % prev
- self.seek(long(prev))
- return
+ return long(prev)
+ # read xref table
+ def read_xref_from(self, start, xrefs):
+ self.seek(start)
+ self.reset()
+ try:
+ (pos, token) = self.nexttoken()
+ except PSEOF:
+ raise PDFNoValidXRef('Unexpected EOF')
+ if 2 <= self.debug:
+ print >>stderr, 'read_xref_from: start=%d, token=%r' % (start, token)
+ if isinstance(token, int):
+ # XRefStream: PDF-1.5
+ self.seek(pos)
+ self.reset()
+ xref = PDFXRefStream()
+ xref.load(self, debug=self.debug)
+ else:
+ if token is not self.KEYWORD_XREF:
+ raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
+ (pos, token))
+ self.nextline()
+ xref = PDFXRef()
+ xref.load(self, debug=self.debug)
+ xrefs.append(xref)
+ trailer = xref.trailer
+ if 1 <= self.debug:
+ print >>stderr, 'trailer: %r' % trailer
+ if 'XRefStm' in trailer:
+ pos = int_value(trailer['XRefStm'])
+ self.read_xref_from(pos, xrefs)
+ if 'Prev' in trailer:
+ # find previous xref
+ pos = int_value(trailer['Prev'])
+ self.read_xref_from(pos, xrefs)
+ return
+
# read xref tables and trailers
def read_xref(self):
+ xrefs = []
try:
- self.find_xref()
- while 1:
- # read xref table
- try:
- (pos, token) = self.nexttoken()
- except PSEOF:
- raise PDFNoValidXRef('Unexpected EOF')
- if 2 <= self.debug:
- print >>stderr, 'read_xref: %r' % token
- if isinstance(token, int):
- # XRefStream: PDF-1.5
- self.seek(pos)
- self.reset()
- xref = PDFXRefStream()
- xref.load(self)
- else:
- if token is not self.KEYWORD_XREF:
- raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
- (pos, token))
- self.nextline()
- xref = PDFXRef()
- xref.load(self)
- yield xref
- trailer = xref.trailer
- if not trailer: continue
- if 1 <= self.debug:
- print >>stderr, 'trailer: %r' % trailer
- if 'XRefStm' in trailer:
- self.seek(int_value(trailer['XRefStm']))
- if 'Prev' in trailer:
- # find previous xref
- pos = int_value(trailer['Prev'])
- self.seek(pos)
- if 1 <= self.debug:
- print >>stderr, 'prev trailer: pos=%d' % pos
- else:
- break
+ pos = self.find_xref()
+ self.read_xref_from(pos, xrefs)
except PDFNoValidXRef:
# fallback
if 1 <= self.debug:
@@ -610,17 +626,15 @@ class PDFParser(PSStackParser):
m = pat.match(line)
if not m: continue
(objid, genno) = m.groups()
- offsets[int(objid)] = (0, pos, 'f')
+ offsets[int(objid)] = (0, pos)
if not offsets: raise
xref.offsets = offsets
- xref.objid0 = min(offsets.iterkeys())
- xref.objid1 = max(offsets.iterkeys())
self.seek(pos)
xref.load_trailer(self)
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % xref.trailer
- yield xref
- return
+ xrefs.append(xref)
+ return xrefs
## PDFObjStrmParser
diff --git a/pdflib/pdftypes.py b/pdflib/pdftypes.py
index 69db608..f090c5c 100644
--- a/pdflib/pdftypes.py
+++ b/pdflib/pdftypes.py
@@ -190,7 +190,10 @@ class PDFStream(PDFObject):
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
- params = self.dic.get('DecodeParms', {})
+ if 'DP' in self.dic:
+ params = self.dic['DP']
+ else:
+ params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred: