added a fallback mechanism in case there's no xref.

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@30 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-06-21 17:22:44 +00:00
parent 3dba71b7d2
commit b86ed8be3c
6 changed files with 166 additions and 118 deletions

View File

@ -73,8 +73,8 @@ def dumpxml(out, obj, codec=None):
# dumptrailers # dumptrailers
def dumptrailers(out, doc): def dumptrailers(out, doc):
for xref in doc.xrefs: for xref in doc.xrefs:
out.write('<trailer objid0="%d" objid1="%d">\n' % out.write('<trailer objid="%d-%d">\n' %
(xref.objid0, xref.objid1)) (xref.objid0, xref.objid1-1))
dumpxml(out, xref.trailer) dumpxml(out, xref.trailer)
out.write('\n</trailer>\n\n') out.write('\n</trailer>\n\n')
return return

View File

@ -57,38 +57,38 @@ class FigureItem(PageItem):
## ##
class TextItem: class TextItem:
def __init__(self, matrix, font, size, width, text): def __init__(self, matrix, font, fontsize, width, text):
self.matrix = matrix self.matrix = matrix
self.font = font self.font = font
(a,b,c,d,tx,ty) = self.matrix (a,b,c,d,tx,ty) = self.matrix
(self.width, self.size) = apply_matrix((a,b,c,d,0,0), (width,size)) (self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
self.width = abs(self.width) self.width = abs(self.width)
self.origin = (tx,ty) self.origin = (tx,ty)
self.direction = 0 self.direction = 0
if not self.font.is_vertical(): if not self.font.is_vertical():
self.direction = 1 self.direction = 1
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001)) (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001)) (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
self.bbox = (tx, ty+descent, self.width, self.size) self.bbox = (tx, ty+descent, self.width, self.fontsize)
else: else:
self.direction = 2 self.direction = 2
mindisp = min( d for (d,_) in text ) mindisp = min( d for (d,_) in text )
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0)) (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
self.bbox = (tx-mindisp, ty+self.width, self.size, self.width) self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width)
self.text = ''.join( c for (_,c) in text ) self.text = ''.join( c for (_,c) in text )
return return
def __repr__(self): def __repr__(self):
return ('<text matrix=%r font=%r size=%r width=%r text=%r>' % return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' %
(self.matrix, self.font, self.size, self.width, self.text)) (self.matrix, self.font, self.fontsize, self.width, self.text))
def dump(self, outfp, codec): def dump(self, outfp, codec):
def e(x): def e(x):
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;') x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
return x.encode(codec, 'xmlcharrefreplace') return x.encode(codec, 'xmlcharrefreplace')
bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox
outfp.write('<text font="%s" direction="%s" bbox="%s" size="%.3f">' % outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
(e(self.font.fontname), self.direction, bbox, self.size)) (e(self.font.fontname), self.direction, bbox, self.fontsize))
outfp.write(e(self.text)) outfp.write(e(self.text))
outfp.write('</text>\n') outfp.write('</text>\n')
return return

View File

@ -593,12 +593,6 @@ class PDFPageInterpreter:
def __init__(self): def __init__(self):
self.font = None self.font = None
self.fontsize = 0 self.fontsize = 0
self.charspace = 0
self.wordspace = 0
self.scaling = 100
self.leading = 0
self.render = 0
self.rise = 0
self.reset() self.reset()
return return
def __repr__(self): def __repr__(self):
@ -609,6 +603,13 @@ class PDFPageInterpreter:
self.charspace, self.wordspace, self.scaling, self.leading, self.charspace, self.wordspace, self.scaling, self.leading,
self.render, self.rise)) self.render, self.rise))
def reset(self): def reset(self):
self.charspace = 0
self.wordspace = 0
self.scaling = 100
self.leading = 0
self.render = 0
self.rise = 0
#
self.matrix = MATRIX_IDENTITY self.matrix = MATRIX_IDENTITY
self.linematrix = (0, 0) self.linematrix = (0, 0)
return return

View File

@ -4,7 +4,7 @@
# ver 0.1, Dec 24 2004- # ver 0.1, Dec 24 2004-
# ver 0.2, Dec 24 2007 # ver 0.2, Dec 24 2007
import sys import sys, re
import md5, struct import md5, struct
stderr = sys.stderr stderr = sys.stderr
from utils import choplist, nunpack from utils import choplist, nunpack
@ -20,6 +20,7 @@ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
## ##
class PDFException(PSException): pass class PDFException(PSException): pass
class PDFSyntaxError(PDFException): pass class PDFSyntaxError(PDFException): pass
class PDFNoValidXRef(PDFSyntaxError): pass
class PDFEncryptionError(PDFException): pass class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFTypeError(PDFException): pass class PDFTypeError(PDFException): pass
@ -276,64 +277,62 @@ class PDFPage:
## PDFXRef ## PDFXRef
## ##
class PDFXRef: class PDFXRef(object):
def __init__(self, parser): def __init__(self):
self.offsets = None
return
def load(self, parser):
while 1: while 1:
try: try:
(pos, line) = parser.nextline() (pos, line) = parser.nextline()
except PSEOF: except PSEOF:
if STRICT: raise PDFNoValidXRef('Unexpected EOF')
raise PDFSyntaxError('Unexpected EOF')
break
if not line: if not line:
if STRICT: raise PDFNoValidXRef('premature eof: %r' % parser)
raise PDFSyntaxError('premature eof: %r' % parser)
break
if line.startswith('trailer'): if line.startswith('trailer'):
parser.seek(pos) parser.seek(pos)
break break
f = line.strip().split(' ') f = line.strip().split(' ')
if len(f) != 2: if len(f) != 2:
if STRICT: raise PDFNoValidXRef('trailer not found: %r: line=%r' % (parser, line))
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
continue
try: try:
(start, nobjs) = map(long, f) (start, nobjs) = map(long, f)
except ValueError: except ValueError:
if STRICT: raise PDFNoValidXRef('invalid line: %r: line=%r' % (parser, line))
raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line)) self.offsets = {}
continue
self.objid0 = start
self.offsets = []
for objid in xrange(start, start+nobjs): for objid in xrange(start, start+nobjs):
try: try:
(_, line) = parser.nextline() (_, line) = parser.nextline()
except PSEOF: except PSEOF:
break raise PDFNoValidXRef('Unexpected EOF')
f = line.strip().split(' ') f = line.strip().split(' ')
if len(f) != 3: if len(f) != 3:
if STRICT: raise PDFNoValidXRef('invalid xref format: %r, line=%r' % (parser, line))
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
continue
(pos, genno, use) = f (pos, genno, use) = f
self.offsets.append((int(genno), long(pos), use)) self.offsets[objid] = (int(genno), long(pos), use)
# read trailer self.load_trailer(parser)
return
def load_trailer(self, parser):
try: try:
(_,kwd) = parser.nexttoken() (_,kwd) = parser.nexttoken()
assert kwd == KEYWORD_TRAILER assert kwd == KEYWORD_TRAILER
(_,dic) = parser.nextobject() (_,dic) = parser.nextobject()
self.trailer = dict_value(dic)
except PSEOF: except PSEOF:
if STRICT: x = parser.pop(1)
raise PDFSyntaxError('Unexpected EOF') if not x:
self.trailer = None raise PDFNoValidXRef('Unexpected EOF')
(_,dic) = x[0]
self.trailer = dict_value(dic)
return return
def getpos(self, objid): def getpos(self, objid):
if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid: try:
raise IndexError(objid) (genno, pos, use) = self.offsets[objid]
(genno, pos, use) = self.offsets[objid-self.objid0] except KeyError:
raise PDFValueError('object not found: %r' % objid)
if use != 'n': if use != 'n':
if STRICT: if STRICT:
raise PDFValueError('unused objid=%r' % objid) raise PDFValueError('unused objid=%r' % objid)
@ -342,16 +341,23 @@ class PDFXRef:
## PDFXRefStream ## PDFXRefStream
## ##
class PDFXRefStream: class PDFXRefStream(object):
def __init__(self, parser): def __init__(self):
(_,objid) = parser.nexttoken() self.objid0 = None
(_,genno) = parser.nexttoken() self.objid1 = None
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
return
def load(self, parser):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
(_,kwd) = parser.nexttoken() (_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject() (_,stream) = parser.nextobject()
if STRICT:
if stream.dic['Type'] != LITERAL_XREF: if stream.dic['Type'] != LITERAL_XREF:
raise PDFSyntaxError('invalid stream spec.') raise PDFNoValidXRef('invalid stream spec.')
size = stream.dic['Size'] size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size)) (start, nobjs) = stream.dic.get('Index', (0,size))
self.objid0 = start self.objid0 = start
@ -380,7 +386,11 @@ class PDFXRefStream:
## PDFDocument ## PDFDocument
## ##
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate']) ## A PDFDocument object represents a PDF document.
## Since a PDF file is usually pretty big, it is normally not loaded
## all at once. Rather, it is parsed dynamically as processing proceeds.
## A PDF parser is associated with the document.
##
class PDFDocument: class PDFDocument:
def __init__(self, debug=0): def __init__(self, debug=0):
@ -393,18 +403,28 @@ class PDFDocument:
self.parser = None self.parser = None
self.encryption = None self.encryption = None
self.decipher = None self.decipher = None
self.initialized = False self.ready = False
return return
# set_parser(parser)
# Associates the document with an (already initialized) parser object.
def set_parser(self, parser): def set_parser(self, parser):
if self.parser: return if self.parser: return
self.initialized = True
self.parser = parser self.parser = parser
# The document is temporarily marked as ready while collecting
# all the basic information about the document, e.g.
# the header, the encryption information, and the access rights
# for the document.
self.ready = True
# Retrieve the information of each xref section and its trailer that
# was appended (maybe multiple times) at the end of the document.
self.xrefs = list(parser.read_xref()) self.xrefs = list(parser.read_xref())
for xref in self.xrefs: for xref in self.xrefs:
trailer = xref.trailer trailer = xref.trailer
if not trailer: continue if not trailer: continue
# If there's an encryption info, remember it.
if 'Encrypt' in trailer: if 'Encrypt' in trailer:
#assert not self.encryption
self.encryption = (list_value(trailer['ID']), self.encryption = (list_value(trailer['ID']),
dict_value(trailer['Encrypt'])) dict_value(trailer['Encrypt']))
if 'Root' in trailer: if 'Root' in trailer:
@ -412,9 +432,15 @@ class PDFDocument:
break break
else: else:
raise PDFValueError('no /Root object!') raise PDFValueError('no /Root object!')
self.initialized = False # The document is set to be non-ready again, until all the
# proper initialization (asking for the password key,
# verifying the access permissions, and so on) is finished.
self.ready = False
return return
# set_root(root)
# Set the Root dictionary of the document.
# Each PDF file must have exactly one /Root dictionary.
def set_root(self, root): def set_root(self, root):
self.root = root self.root = root
self.catalog = dict_value(self.root) self.catalog = dict_value(self.root)
@ -424,10 +450,14 @@ class PDFDocument:
self.outline = self.catalog.get('Outline') self.outline = self.catalog.get('Outline')
return return
# initialize(password='')
# Perform the initialization with a given password.
# This step is mandatory even if there's no password associated
# with the document.
def initialize(self, password=''): def initialize(self, password=''):
if not self.encryption: if not self.encryption:
self.is_printable = self.is_modifiable = self.is_extractable = True self.is_printable = self.is_modifiable = self.is_extractable = True
self.initialized = True self.ready = True
return return
(docid, param) = self.encryption (docid, param) = self.encryption
if literal_name(param['Filter']) != 'Standard': if literal_name(param['Filter']) != 'Standard':
@ -449,7 +479,7 @@ class PDFDocument:
password = (password+PASSWORD_PADDING)[:32] # 1 password = (password+PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2 hash = md5.md5(password) # 2
hash.update(O) # 3 hash.update(O) # 3
hash.update(struct.pack('<L', P)) # 4 hash.update(struct.pack('<l', P)) # 4
hash.update(docid[0]) # 5 hash.update(docid[0]) # 5
if 4 <= R: if 4 <= R:
# 6 # 6
@ -479,7 +509,7 @@ class PDFDocument:
raise PDFPasswordIncorrect raise PDFPasswordIncorrect
self.decrypt_key = key self.decrypt_key = key
self.decipher = self.decrypt_rc4 # XXX may be AES self.decipher = self.decrypt_rc4 # XXX may be AES
self.initialized = True self.ready = True
return return
def decrypt_rc4(self, objid, genno, data): def decrypt_rc4(self, objid, genno, data):
@ -489,9 +519,11 @@ class PDFDocument:
return Arcfour(key).process(data) return Arcfour(key).process(data)
def getobj(self, objid): def getobj(self, objid):
if not self.initialized: if not self.ready:
raise PDFException('PDFDocument not initialized') raise PDFException('PDFDocument not initialized')
#assert self.xrefs #assert self.xrefs
if 2 <= self.debug:
print >>stderr, 'getobj: objid=%r' % (objid)
if objid in self.objs: if objid in self.objs:
genno = 0 genno = 0
obj = self.objs[objid] obj = self.objs[objid]
@ -551,14 +583,15 @@ class PDFDocument:
obj = decipher_all(self.decipher, objid, genno, obj) obj = decipher_all(self.decipher, objid, genno, obj)
return obj return obj
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
def get_pages(self, debug=0): def get_pages(self, debug=0):
if not self.initialized: if not self.ready:
raise PDFException('PDFDocument not initialized') raise PDFException('PDFDocument not initialized')
#assert self.xrefs #assert self.xrefs
def search(obj, parent): def search(obj, parent):
tree = dict_value(obj).copy() tree = dict_value(obj).copy()
for (k,v) in parent.iteritems(): for (k,v) in parent.iteritems():
if k in INHERITABLE_ATTRS and k not in tree: if k in self.INHERITABLE_ATTRS and k not in tree:
tree[k] = v tree[k] = v
if tree['Type'] == LITERAL_PAGES: if tree['Type'] == LITERAL_PAGES:
if 1 <= debug: if 1 <= debug:
@ -664,9 +697,7 @@ class PDFParser(PSStackParser):
if line: if line:
prev = line prev = line
else: else:
if STRICT: raise PDFNoValidXRef
raise PDFSyntaxError('startxref not found!')
prev = 0
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'xref found: pos=%r' % prev print >>stderr, 'xref found: pos=%r' % prev
self.seek(long(prev)) self.seek(long(prev))
@ -674,28 +705,29 @@ class PDFParser(PSStackParser):
# read xref tables and trailers # read xref tables and trailers
def read_xref(self): def read_xref(self):
try:
self.find_xref() self.find_xref()
while 1: while 1:
# read xref table # read xref table
try: try:
(pos, token) = self.nexttoken() (pos, token) = self.nexttoken()
except PSEOF: except PSEOF:
if STRICT: raise PDFNoValidXRef('Unexpected EOF')
raise PDFSyntaxError('Unexpected EOF')
break
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'read_xref: %r' % token print >>stderr, 'read_xref: %r' % token
if isinstance(token, int): if isinstance(token, int):
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
self.seek(pos) self.seek(pos)
self.reset() self.reset()
xref = PDFXRefStream(self) xref = PDFXRefStream()
xref.load(self)
else: else:
if token != KEYWORD_XREF: if token != KEYWORD_XREF:
if STRICT: raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
raise PDFSyntaxError('xref not found: pos=%d, token=%r' %
(pos, token)) (pos, token))
xref = PDFXRef(self) self.nextline()
xref = PDFXRef()
xref.load(self)
yield xref yield xref
trailer = xref.trailer trailer = xref.trailer
if not trailer: continue if not trailer: continue
@ -711,6 +743,32 @@ class PDFParser(PSStackParser):
print >>stderr, 'prev trailer: pos=%d' % pos print >>stderr, 'prev trailer: pos=%d' % pos
else: else:
break break
except PDFNoValidXRef:
# fallback
if 1 <= self.debug:
print >>stderr, 'no xref, fallback'
self.seek(0)
pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
offsets = {}
xref = PDFXRef()
while 1:
try:
(pos, line) = self.nextline()
except PSEOF:
break
if line.startswith('trailer'): break
m = pat.match(line)
if not m: continue
(objid, genno) = m.groups()
offsets[int(objid)] = (0, pos, 'f')
xref.offsets = offsets
xref.objid0 = min(offsets.iterkeys())
xref.objid1 = max(offsets.iterkeys())
self.seek(pos)
xref.load_trailer(self)
if 1 <= self.debug:
print >>stderr, 'trailer: %r' % xref.trailer
yield xref
return return
## PDFObjStrmParser ## PDFObjStrmParser

View File

@ -36,7 +36,7 @@ endobj
stream stream
BT BT
/F1 24 Tf /F1 24 Tf
100 100 TD 1 0 0 1 100 700 TD
( Hello World ) Tj ( Hello World ) Tj
ET ET
endstream endstream
@ -53,21 +53,10 @@ endobj
/Encoding /MacRomanEncoding /Encoding /MacRomanEncoding
>> >>
endobj endobj
xref
0 8
0000000000 65535 f
0000000009 00000 n
0000000077 00000 n
0000000125 00000 n
0000000187 00000 n
0000000340 00000 n
0000000435 00000 n
0000000465 00000 n
trailer trailer
<< <<
/Size 8 /Size 8
/Root 1 0 R /Root 1 0 R
>> >>
startxref
578
%%EOF %%EOF

View File

@ -105,7 +105,7 @@ class PDFSGMLParser(sgmllib.SGMLParser):
font = attrs['font'] font = attrs['font']
direction = attrs['direction'] direction = attrs['direction']
bbox = getbbox(attrs['bbox']) bbox = getbbox(attrs['bbox'])
size = fixed(attrs['size']) size = fixed(attrs['fontsize'])
text = Text(font, direction, bbox, size) text = Text(font, direction, bbox, size)
self.curtext = text self.curtext = text
return return