added a fallback mechanism in case there's no xref.
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@30 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
parent
3dba71b7d2
commit
b86ed8be3c
|
@ -73,8 +73,8 @@ def dumpxml(out, obj, codec=None):
|
||||||
# dumptrailers
|
# dumptrailers
|
||||||
def dumptrailers(out, doc):
|
def dumptrailers(out, doc):
|
||||||
for xref in doc.xrefs:
|
for xref in doc.xrefs:
|
||||||
out.write('<trailer objid0="%d" objid1="%d">\n' %
|
out.write('<trailer objid="%d-%d">\n' %
|
||||||
(xref.objid0, xref.objid1))
|
(xref.objid0, xref.objid1-1))
|
||||||
dumpxml(out, xref.trailer)
|
dumpxml(out, xref.trailer)
|
||||||
out.write('\n</trailer>\n\n')
|
out.write('\n</trailer>\n\n')
|
||||||
return
|
return
|
||||||
|
|
22
pdf2txt.py
22
pdf2txt.py
|
@ -57,38 +57,38 @@ class FigureItem(PageItem):
|
||||||
##
|
##
|
||||||
class TextItem:
|
class TextItem:
|
||||||
|
|
||||||
def __init__(self, matrix, font, size, width, text):
|
def __init__(self, matrix, font, fontsize, width, text):
|
||||||
self.matrix = matrix
|
self.matrix = matrix
|
||||||
self.font = font
|
self.font = font
|
||||||
(a,b,c,d,tx,ty) = self.matrix
|
(a,b,c,d,tx,ty) = self.matrix
|
||||||
(self.width, self.size) = apply_matrix((a,b,c,d,0,0), (width,size))
|
(self.width, self.fontsize) = apply_matrix((a,b,c,d,0,0), (width,fontsize))
|
||||||
self.width = abs(self.width)
|
self.width = abs(self.width)
|
||||||
self.origin = (tx,ty)
|
self.origin = (tx,ty)
|
||||||
self.direction = 0
|
self.direction = 0
|
||||||
if not self.font.is_vertical():
|
if not self.font.is_vertical():
|
||||||
self.direction = 1
|
self.direction = 1
|
||||||
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001))
|
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*fontsize*0.001))
|
||||||
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
|
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*fontsize*0.001))
|
||||||
self.bbox = (tx, ty+descent, self.width, self.size)
|
self.bbox = (tx, ty+descent, self.width, self.fontsize)
|
||||||
else:
|
else:
|
||||||
self.direction = 2
|
self.direction = 2
|
||||||
mindisp = min( d for (d,_) in text )
|
mindisp = min( d for (d,_) in text )
|
||||||
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
|
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*fontsize*0.001,0))
|
||||||
self.bbox = (tx-mindisp, ty+self.width, self.size, self.width)
|
self.bbox = (tx-mindisp, ty+self.width, self.fontsize, self.width)
|
||||||
self.text = ''.join( c for (_,c) in text )
|
self.text = ''.join( c for (_,c) in text )
|
||||||
return
|
return
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return ('<text matrix=%r font=%r size=%r width=%r text=%r>' %
|
return ('<text matrix=%r font=%r fontsize=%r width=%r text=%r>' %
|
||||||
(self.matrix, self.font, self.size, self.width, self.text))
|
(self.matrix, self.font, self.fontsize, self.width, self.text))
|
||||||
|
|
||||||
def dump(self, outfp, codec):
|
def dump(self, outfp, codec):
|
||||||
def e(x):
|
def e(x):
|
||||||
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
|
x = x.replace('&','&amp;').replace('>','&gt;').replace('<','&lt;')
|
||||||
return x.encode(codec, 'xmlcharrefreplace')
|
return x.encode(codec, 'xmlcharrefreplace')
|
||||||
bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox
|
bbox = '%.3f,%.3f,%.3f,%.3f' % self.bbox
|
||||||
outfp.write('<text font="%s" direction="%s" bbox="%s" size="%.3f">' %
|
outfp.write('<text font="%s" direction="%s" bbox="%s" fontsize="%.3f">' %
|
||||||
(e(self.font.fontname), self.direction, bbox, self.size))
|
(e(self.font.fontname), self.direction, bbox, self.fontsize))
|
||||||
outfp.write(e(self.text))
|
outfp.write(e(self.text))
|
||||||
outfp.write('</text>\n')
|
outfp.write('</text>\n')
|
||||||
return
|
return
|
||||||
|
|
13
pdfinterp.py
13
pdfinterp.py
|
@ -593,12 +593,6 @@ class PDFPageInterpreter:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.font = None
|
self.font = None
|
||||||
self.fontsize = 0
|
self.fontsize = 0
|
||||||
self.charspace = 0
|
|
||||||
self.wordspace = 0
|
|
||||||
self.scaling = 100
|
|
||||||
self.leading = 0
|
|
||||||
self.render = 0
|
|
||||||
self.rise = 0
|
|
||||||
self.reset()
|
self.reset()
|
||||||
return
|
return
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
@ -609,6 +603,13 @@ class PDFPageInterpreter:
|
||||||
self.charspace, self.wordspace, self.scaling, self.leading,
|
self.charspace, self.wordspace, self.scaling, self.leading,
|
||||||
self.render, self.rise))
|
self.render, self.rise))
|
||||||
def reset(self):
|
def reset(self):
|
||||||
|
self.charspace = 0
|
||||||
|
self.wordspace = 0
|
||||||
|
self.scaling = 100
|
||||||
|
self.leading = 0
|
||||||
|
self.render = 0
|
||||||
|
self.rise = 0
|
||||||
|
#
|
||||||
self.matrix = MATRIX_IDENTITY
|
self.matrix = MATRIX_IDENTITY
|
||||||
self.linematrix = (0, 0)
|
self.linematrix = (0, 0)
|
||||||
return
|
return
|
||||||
|
|
228
pdfparser.py
228
pdfparser.py
|
@ -4,7 +4,7 @@
|
||||||
# ver 0.1, Dec 24 2004-
|
# ver 0.1, Dec 24 2004-
|
||||||
# ver 0.2, Dec 24 2007
|
# ver 0.2, Dec 24 2007
|
||||||
|
|
||||||
import sys
|
import sys, re
|
||||||
import md5, struct
|
import md5, struct
|
||||||
stderr = sys.stderr
|
stderr = sys.stderr
|
||||||
from utils import choplist, nunpack
|
from utils import choplist, nunpack
|
||||||
|
@ -20,6 +20,7 @@ from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
|
||||||
##
|
##
|
||||||
class PDFException(PSException): pass
|
class PDFException(PSException): pass
|
||||||
class PDFSyntaxError(PDFException): pass
|
class PDFSyntaxError(PDFException): pass
|
||||||
|
class PDFNoValidXRef(PDFSyntaxError): pass
|
||||||
class PDFEncryptionError(PDFException): pass
|
class PDFEncryptionError(PDFException): pass
|
||||||
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
class PDFPasswordIncorrect(PDFEncryptionError): pass
|
||||||
class PDFTypeError(PDFException): pass
|
class PDFTypeError(PDFException): pass
|
||||||
|
@ -276,64 +277,62 @@ class PDFPage:
|
||||||
|
|
||||||
## PDFXRef
|
## PDFXRef
|
||||||
##
|
##
|
||||||
class PDFXRef:
|
class PDFXRef(object):
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self):
|
||||||
|
self.offsets = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def load(self, parser):
|
||||||
while 1:
|
while 1:
|
||||||
try:
|
try:
|
||||||
(pos, line) = parser.nextline()
|
(pos, line) = parser.nextline()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
if STRICT:
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
raise PDFSyntaxError('Unexpected EOF')
|
|
||||||
break
|
|
||||||
if not line:
|
if not line:
|
||||||
if STRICT:
|
raise PDFNoValidXRef('premature eof: %r' % parser)
|
||||||
raise PDFSyntaxError('premature eof: %r' % parser)
|
|
||||||
break
|
|
||||||
if line.startswith('trailer'):
|
if line.startswith('trailer'):
|
||||||
parser.seek(pos)
|
parser.seek(pos)
|
||||||
break
|
break
|
||||||
f = line.strip().split(' ')
|
f = line.strip().split(' ')
|
||||||
if len(f) != 2:
|
if len(f) != 2:
|
||||||
if STRICT:
|
raise PDFNoValidXRef('trailer not found: %r: line=%r' % (parser, line))
|
||||||
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
|
|
||||||
continue
|
|
||||||
try:
|
try:
|
||||||
(start, nobjs) = map(long, f)
|
(start, nobjs) = map(long, f)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
if STRICT:
|
raise PDFNoValidXRef('invalid line: %r: line=%r' % (parser, line))
|
||||||
raise PDFSyntaxError('invalid line: %r: line=%r' % (parser, line))
|
self.offsets = {}
|
||||||
continue
|
|
||||||
self.objid0 = start
|
|
||||||
self.offsets = []
|
|
||||||
for objid in xrange(start, start+nobjs):
|
for objid in xrange(start, start+nobjs):
|
||||||
try:
|
try:
|
||||||
(_, line) = parser.nextline()
|
(_, line) = parser.nextline()
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
break
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
f = line.strip().split(' ')
|
f = line.strip().split(' ')
|
||||||
if len(f) != 3:
|
if len(f) != 3:
|
||||||
if STRICT:
|
raise PDFNoValidXRef('invalid xref format: %r, line=%r' % (parser, line))
|
||||||
raise PDFSyntaxError('invalid xref format: %r, line=%r' % (parser, line))
|
|
||||||
continue
|
|
||||||
(pos, genno, use) = f
|
(pos, genno, use) = f
|
||||||
self.offsets.append((int(genno), long(pos), use))
|
self.offsets[objid] = (int(genno), long(pos), use)
|
||||||
# read trailer
|
self.load_trailer(parser)
|
||||||
|
return
|
||||||
|
|
||||||
|
def load_trailer(self, parser):
|
||||||
try:
|
try:
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
assert kwd == KEYWORD_TRAILER
|
assert kwd == KEYWORD_TRAILER
|
||||||
(_,dic) = parser.nextobject()
|
(_,dic) = parser.nextobject()
|
||||||
self.trailer = dict_value(dic)
|
|
||||||
except PSEOF:
|
except PSEOF:
|
||||||
if STRICT:
|
x = parser.pop(1)
|
||||||
raise PDFSyntaxError('Unexpected EOF')
|
if not x:
|
||||||
self.trailer = None
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
|
(_,dic) = x[0]
|
||||||
|
self.trailer = dict_value(dic)
|
||||||
return
|
return
|
||||||
|
|
||||||
def getpos(self, objid):
|
def getpos(self, objid):
|
||||||
if objid < self.objid0 or (self.objid0+len(self.offsets)) <= objid:
|
try:
|
||||||
raise IndexError(objid)
|
(genno, pos, use) = self.offsets[objid]
|
||||||
(genno, pos, use) = self.offsets[objid-self.objid0]
|
except KeyError:
|
||||||
|
raise PDFValueError('object not found: %r' % objid)
|
||||||
if use != 'n':
|
if use != 'n':
|
||||||
if STRICT:
|
if STRICT:
|
||||||
raise PDFValueError('unused objid=%r' % objid)
|
raise PDFValueError('unused objid=%r' % objid)
|
||||||
|
@ -342,16 +341,23 @@ class PDFXRef:
|
||||||
|
|
||||||
## PDFXRefStream
|
## PDFXRefStream
|
||||||
##
|
##
|
||||||
class PDFXRefStream:
|
class PDFXRefStream(object):
|
||||||
|
|
||||||
def __init__(self, parser):
|
def __init__(self):
|
||||||
(_,objid) = parser.nexttoken()
|
self.objid0 = None
|
||||||
(_,genno) = parser.nexttoken()
|
self.objid1 = None
|
||||||
|
self.data = None
|
||||||
|
self.entlen = None
|
||||||
|
self.fl1 = self.fl2 = self.fl3 = None
|
||||||
|
return
|
||||||
|
|
||||||
|
def load(self, parser):
|
||||||
|
(_,objid) = parser.nexttoken() # ignored
|
||||||
|
(_,genno) = parser.nexttoken() # ignored
|
||||||
(_,kwd) = parser.nexttoken()
|
(_,kwd) = parser.nexttoken()
|
||||||
(_,stream) = parser.nextobject()
|
(_,stream) = parser.nextobject()
|
||||||
if STRICT:
|
if stream.dic['Type'] != LITERAL_XREF:
|
||||||
if stream.dic['Type'] != LITERAL_XREF:
|
raise PDFNoValidXRef('invalid stream spec.')
|
||||||
raise PDFSyntaxError('invalid stream spec.')
|
|
||||||
size = stream.dic['Size']
|
size = stream.dic['Size']
|
||||||
(start, nobjs) = stream.dic.get('Index', (0,size))
|
(start, nobjs) = stream.dic.get('Index', (0,size))
|
||||||
self.objid0 = start
|
self.objid0 = start
|
||||||
|
@ -380,7 +386,11 @@ class PDFXRefStream:
|
||||||
|
|
||||||
## PDFDocument
|
## PDFDocument
|
||||||
##
|
##
|
||||||
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
## A PDFDocument object represents a PDF document.
|
||||||
|
## Since a PDF file is usually pretty big, normally it is not loaded
|
||||||
|
## at once. Rather it is parsed dynamically as processing goes.
|
||||||
|
## A PDF parser is associated with the document.
|
||||||
|
##
|
||||||
class PDFDocument:
|
class PDFDocument:
|
||||||
|
|
||||||
def __init__(self, debug=0):
|
def __init__(self, debug=0):
|
||||||
|
@ -393,18 +403,28 @@ class PDFDocument:
|
||||||
self.parser = None
|
self.parser = None
|
||||||
self.encryption = None
|
self.encryption = None
|
||||||
self.decipher = None
|
self.decipher = None
|
||||||
self.initialized = False
|
self.ready = False
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# set_parser(parser)
|
||||||
|
# Associates the document with an (already initialized) parser object.
|
||||||
def set_parser(self, parser):
|
def set_parser(self, parser):
|
||||||
if self.parser: return
|
if self.parser: return
|
||||||
self.initialized = True
|
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
|
# The document is temporarily marked ready while collecting
|
||||||
|
# all the basic information about the document, e.g.
|
||||||
|
# the header, the encryption information, and the access rights
|
||||||
|
# for the document.
|
||||||
|
self.ready = True
|
||||||
|
# Retrieve the information of each header that was appended
|
||||||
|
# (maybe multiple times) at the end of the document.
|
||||||
self.xrefs = list(parser.read_xref())
|
self.xrefs = list(parser.read_xref())
|
||||||
for xref in self.xrefs:
|
for xref in self.xrefs:
|
||||||
trailer = xref.trailer
|
trailer = xref.trailer
|
||||||
if not trailer: continue
|
if not trailer: continue
|
||||||
|
# If there's an encryption info, remember it.
|
||||||
if 'Encrypt' in trailer:
|
if 'Encrypt' in trailer:
|
||||||
|
#assert not self.encryption
|
||||||
self.encryption = (list_value(trailer['ID']),
|
self.encryption = (list_value(trailer['ID']),
|
||||||
dict_value(trailer['Encrypt']))
|
dict_value(trailer['Encrypt']))
|
||||||
if 'Root' in trailer:
|
if 'Root' in trailer:
|
||||||
|
@ -412,9 +432,15 @@ class PDFDocument:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
raise PDFValueError('no /Root object!')
|
raise PDFValueError('no /Root object!')
|
||||||
self.initialized = False
|
# The document is set to be non-ready again, until all the
|
||||||
|
# proper initialization (asking the password key and
|
||||||
|
# verifying the access permission, and so on) is finished.
|
||||||
|
self.ready = False
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# set_root(root)
|
||||||
|
# Set the Root dictionary of the document.
|
||||||
|
# Each PDF file must have exactly one /Root dictionary.
|
||||||
def set_root(self, root):
|
def set_root(self, root):
|
||||||
self.root = root
|
self.root = root
|
||||||
self.catalog = dict_value(self.root)
|
self.catalog = dict_value(self.root)
|
||||||
|
@ -424,10 +450,14 @@ class PDFDocument:
|
||||||
self.outline = self.catalog.get('Outline')
|
self.outline = self.catalog.get('Outline')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# initialize(password='')
|
||||||
|
# Perform the initialization with a given password.
|
||||||
|
# This step is mandatory even if there's no password associated
|
||||||
|
# with the document.
|
||||||
def initialize(self, password=''):
|
def initialize(self, password=''):
|
||||||
if not self.encryption:
|
if not self.encryption:
|
||||||
self.is_printable = self.is_modifiable = self.is_extractable = True
|
self.is_printable = self.is_modifiable = self.is_extractable = True
|
||||||
self.initialized = True
|
self.ready = True
|
||||||
return
|
return
|
||||||
(docid, param) = self.encryption
|
(docid, param) = self.encryption
|
||||||
if literal_name(param['Filter']) != 'Standard':
|
if literal_name(param['Filter']) != 'Standard':
|
||||||
|
@ -449,7 +479,7 @@ class PDFDocument:
|
||||||
password = (password+PASSWORD_PADDING)[:32] # 1
|
password = (password+PASSWORD_PADDING)[:32] # 1
|
||||||
hash = md5.md5(password) # 2
|
hash = md5.md5(password) # 2
|
||||||
hash.update(O) # 3
|
hash.update(O) # 3
|
||||||
hash.update(struct.pack('<L', P)) # 4
|
hash.update(struct.pack('<l', P)) # 4
|
||||||
hash.update(docid[0]) # 5
|
hash.update(docid[0]) # 5
|
||||||
if 4 <= R:
|
if 4 <= R:
|
||||||
# 6
|
# 6
|
||||||
|
@ -479,7 +509,7 @@ class PDFDocument:
|
||||||
raise PDFPasswordIncorrect
|
raise PDFPasswordIncorrect
|
||||||
self.decrypt_key = key
|
self.decrypt_key = key
|
||||||
self.decipher = self.decrypt_rc4 # XXX may be AES
|
self.decipher = self.decrypt_rc4 # XXX may be AES
|
||||||
self.initialized = True
|
self.ready = True
|
||||||
return
|
return
|
||||||
|
|
||||||
def decrypt_rc4(self, objid, genno, data):
|
def decrypt_rc4(self, objid, genno, data):
|
||||||
|
@ -489,9 +519,11 @@ class PDFDocument:
|
||||||
return Arcfour(key).process(data)
|
return Arcfour(key).process(data)
|
||||||
|
|
||||||
def getobj(self, objid):
|
def getobj(self, objid):
|
||||||
if not self.initialized:
|
if not self.ready:
|
||||||
raise PDFException('PDFDocument not initialized')
|
raise PDFException('PDFDocument not initialized')
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
|
if 2 <= self.debug:
|
||||||
|
print >>stderr, 'getobj: objid=%r' % (objid)
|
||||||
if objid in self.objs:
|
if objid in self.objs:
|
||||||
genno = 0
|
genno = 0
|
||||||
obj = self.objs[objid]
|
obj = self.objs[objid]
|
||||||
|
@ -551,14 +583,15 @@ class PDFDocument:
|
||||||
obj = decipher_all(self.decipher, objid, genno, obj)
|
obj = decipher_all(self.decipher, objid, genno, obj)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
|
||||||
def get_pages(self, debug=0):
|
def get_pages(self, debug=0):
|
||||||
if not self.initialized:
|
if not self.ready:
|
||||||
raise PDFException('PDFDocument not initialized')
|
raise PDFException('PDFDocument not initialized')
|
||||||
#assert self.xrefs
|
#assert self.xrefs
|
||||||
def search(obj, parent):
|
def search(obj, parent):
|
||||||
tree = dict_value(obj).copy()
|
tree = dict_value(obj).copy()
|
||||||
for (k,v) in parent.iteritems():
|
for (k,v) in parent.iteritems():
|
||||||
if k in INHERITABLE_ATTRS and k not in tree:
|
if k in self.INHERITABLE_ATTRS and k not in tree:
|
||||||
tree[k] = v
|
tree[k] = v
|
||||||
if tree['Type'] == LITERAL_PAGES:
|
if tree['Type'] == LITERAL_PAGES:
|
||||||
if 1 <= debug:
|
if 1 <= debug:
|
||||||
|
@ -664,9 +697,7 @@ class PDFParser(PSStackParser):
|
||||||
if line:
|
if line:
|
||||||
prev = line
|
prev = line
|
||||||
else:
|
else:
|
||||||
if STRICT:
|
raise PDFNoValidXRef
|
||||||
raise PDFSyntaxError('startxref not found!')
|
|
||||||
prev = 0
|
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'xref found: pos=%r' % prev
|
print >>stderr, 'xref found: pos=%r' % prev
|
||||||
self.seek(long(prev))
|
self.seek(long(prev))
|
||||||
|
@ -674,43 +705,70 @@ class PDFParser(PSStackParser):
|
||||||
|
|
||||||
# read xref tables and trailers
|
# read xref tables and trailers
|
||||||
def read_xref(self):
|
def read_xref(self):
|
||||||
self.find_xref()
|
try:
|
||||||
while 1:
|
self.find_xref()
|
||||||
# read xref table
|
while 1:
|
||||||
try:
|
# read xref table
|
||||||
(pos, token) = self.nexttoken()
|
try:
|
||||||
except PSEOF:
|
(pos, token) = self.nexttoken()
|
||||||
if STRICT:
|
except PSEOF:
|
||||||
raise PDFSyntaxError('Unexpected EOF')
|
raise PDFNoValidXRef('Unexpected EOF')
|
||||||
break
|
if 2 <= self.debug:
|
||||||
if 2 <= self.debug:
|
print >>stderr, 'read_xref: %r' % token
|
||||||
print >>stderr, 'read_xref: %r' % token
|
if isinstance(token, int):
|
||||||
if isinstance(token, int):
|
# XRefStream: PDF-1.5
|
||||||
# XRefStream: PDF-1.5
|
self.seek(pos)
|
||||||
self.seek(pos)
|
self.reset()
|
||||||
self.reset()
|
xref = PDFXRefStream()
|
||||||
xref = PDFXRefStream(self)
|
xref.load(self)
|
||||||
else:
|
else:
|
||||||
if token != KEYWORD_XREF:
|
if token != KEYWORD_XREF:
|
||||||
if STRICT:
|
raise PDFNoValidXRef('xref not found: pos=%d, token=%r' %
|
||||||
raise PDFSyntaxError('xref not found: pos=%d, token=%r' %
|
|
||||||
(pos, token))
|
(pos, token))
|
||||||
xref = PDFXRef(self)
|
self.nextline()
|
||||||
yield xref
|
xref = PDFXRef()
|
||||||
trailer = xref.trailer
|
xref.load(self)
|
||||||
if not trailer: continue
|
yield xref
|
||||||
if 1 <= self.debug:
|
trailer = xref.trailer
|
||||||
print >>stderr, 'trailer: %r' % trailer
|
if not trailer: continue
|
||||||
if 'XRefStm' in trailer:
|
|
||||||
self.seek(int_value(trailer['XRefStm']))
|
|
||||||
if 'Prev' in trailer:
|
|
||||||
# find previous xref
|
|
||||||
pos = int_value(trailer['Prev'])
|
|
||||||
self.seek(pos)
|
|
||||||
if 1 <= self.debug:
|
if 1 <= self.debug:
|
||||||
print >>stderr, 'prev trailer: pos=%d' % pos
|
print >>stderr, 'trailer: %r' % trailer
|
||||||
else:
|
if 'XRefStm' in trailer:
|
||||||
break
|
self.seek(int_value(trailer['XRefStm']))
|
||||||
|
if 'Prev' in trailer:
|
||||||
|
# find previous xref
|
||||||
|
pos = int_value(trailer['Prev'])
|
||||||
|
self.seek(pos)
|
||||||
|
if 1 <= self.debug:
|
||||||
|
print >>stderr, 'prev trailer: pos=%d' % pos
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
except PDFNoValidXRef:
|
||||||
|
# fallback
|
||||||
|
if 1 <= self.debug:
|
||||||
|
print >>stderr, 'no xref, fallback'
|
||||||
|
self.seek(0)
|
||||||
|
pat = re.compile(r'^(\d+)\s+(\d+)\s+obj\b')
|
||||||
|
offsets = {}
|
||||||
|
xref = PDFXRef()
|
||||||
|
while 1:
|
||||||
|
try:
|
||||||
|
(pos, line) = self.nextline()
|
||||||
|
except PSEOF:
|
||||||
|
break
|
||||||
|
if line.startswith('trailer'): break
|
||||||
|
m = pat.match(line)
|
||||||
|
if not m: continue
|
||||||
|
(objid, genno) = m.groups()
|
||||||
|
offsets[int(objid)] = (0, pos, 'f')
|
||||||
|
xref.offsets = offsets
|
||||||
|
xref.objid0 = min(offsets.iterkeys())
|
||||||
|
xref.objid1 = max(offsets.iterkeys())
|
||||||
|
self.seek(pos)
|
||||||
|
xref.load_trailer(self)
|
||||||
|
if 1 <= self.debug:
|
||||||
|
print >>stderr, 'trailer: %r' % xref.trailer
|
||||||
|
yield xref
|
||||||
return
|
return
|
||||||
|
|
||||||
## PDFObjStrmParser
|
## PDFObjStrmParser
|
||||||
|
|
|
@ -36,7 +36,7 @@ endobj
|
||||||
stream
|
stream
|
||||||
BT
|
BT
|
||||||
/F1 24 Tf
|
/F1 24 Tf
|
||||||
100 100 TD
|
1 0 0 1 100 700 TD
|
||||||
( Hello World ) Tj
|
( Hello World ) Tj
|
||||||
ET
|
ET
|
||||||
endstream
|
endstream
|
||||||
|
@ -53,21 +53,10 @@ endobj
|
||||||
/Encoding /MacRomanEncoding
|
/Encoding /MacRomanEncoding
|
||||||
>>
|
>>
|
||||||
endobj
|
endobj
|
||||||
xref
|
|
||||||
0 8
|
|
||||||
0000000000 65535 f
|
|
||||||
0000000009 00000 n
|
|
||||||
0000000077 00000 n
|
|
||||||
0000000125 00000 n
|
|
||||||
0000000187 00000 n
|
|
||||||
0000000340 00000 n
|
|
||||||
0000000435 00000 n
|
|
||||||
0000000465 00000 n
|
|
||||||
trailer
|
trailer
|
||||||
<<
|
<<
|
||||||
/Size 8
|
/Size 8
|
||||||
/Root 1 0 R
|
/Root 1 0 R
|
||||||
>>
|
>>
|
||||||
startxref
|
|
||||||
578
|
|
||||||
%%EOF
|
%%EOF
|
||||||
|
|
2
sgml.py
2
sgml.py
|
@ -105,7 +105,7 @@ class PDFSGMLParser(sgmllib.SGMLParser):
|
||||||
font = attrs['font']
|
font = attrs['font']
|
||||||
direction = attrs['direction']
|
direction = attrs['direction']
|
||||||
bbox = getbbox(attrs['bbox'])
|
bbox = getbbox(attrs['bbox'])
|
||||||
size = fixed(attrs['size'])
|
size = fixed(attrs['fontsize'])
|
||||||
text = Text(font, direction, bbox, size)
|
text = Text(font, direction, bbox, size)
|
||||||
self.curtext = text
|
self.curtext = text
|
||||||
return
|
return
|
||||||
|
|
Loading…
Reference in New Issue