bug fixes

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@20 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-27 03:16:27 +00:00
parent 1300046181
commit b1163b69bb
4 changed files with 103 additions and 98 deletions

View File

@ -7,31 +7,26 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix
from cmap import CMapDB
from extent import Rect, ExtSet, ExtGrid
## PageItem
##
class PageItem:
GRID_SIZE = 20
def __init__(self, id, (x0,y0,x1,y1), rotate=0):
self.id = id
self.bbox = Rect(x0, y0, x1-x0, y1-y0)
self.bbox = (x0, y0, x1-x0, y1-y0)
self.rotate = rotate
self.grid = ExtGrid(self.GRID_SIZE)
self.objs = []
return
def __repr__(self):
bbox = self.bbox
return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' %
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate))
bbox = '%d,%d,%d,%d' % self.bbox
return ('<page id=%r bbox="%s" rotate="%d">' %
(self.id, bbox, self.rotate))
def add(self, obj):
self.objs.append(obj)
self.grid.add(obj.bbox, obj)
return
def dump(self, outfp, codec):
@ -41,23 +36,14 @@ class PageItem:
outfp.write('</page>\n')
return
def fuse(self):
for obj1 in self.objs:
f = (lambda obj: obj.bbox)
for rect in obj1.search_range():
neighbors = [ obj2 for obj2 in self.grid.get(rect, f) if obj2 is not obj1 ]
#print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ]
return
## FigureItem
##
class FigureItem(PageItem):
def __repr__(self):
bbox = self.bbox
return ('<figure id=%r bbox="%d,%d,%d,%d">' %
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1))
bbox = '%d,%d,%d,%d' % self.bbox
return ('<figure id=%r bbox="%s">' % (self.id, bbox))
def dump(self, outfp, codec):
outfp.write(repr(self)+'\n')
@ -66,9 +52,6 @@ class FigureItem(PageItem):
outfp.write('</figure>\n')
return
def search_range(self):
return []
## TextItem
##
@ -86,12 +69,12 @@ class TextItem:
self.direction = 1
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
self.bbox = Rect(tx, ty+descent, self.width, self.size)
self.bbox = (tx, ty+descent, self.width, self.size)
else:
self.direction = 2
mindisp = min( d for (d,_) in text )
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width)
self.bbox = (tx-mindisp, ty+self.width, self.size, self.width)
self.text = ''.join( c for (_,c) in text )
return
@ -107,12 +90,6 @@ class TextItem:
outfp.write('</text>\n')
return
def search_range(self):
if self.direction == 1:
return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ]
else:
return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ]
## TextConverter
##
@ -120,6 +97,10 @@ class TextConverter(PDFDevice):
def __init__(self, rsrc, debug=0):
PDFDevice.__init__(self, rsrc, debug=debug)
self.reset()
return
def reset(self):
self.pages = []
self.stack = []
return
@ -173,11 +154,8 @@ class TextConverter(PDFDevice):
return
def dump(self, outfp, codec):
outfp.write('<document>\n')
for page in self.pages:
#page.fuse()
page.dump(outfp, codec)
outfp.write('</document>\n')
return
@ -188,12 +166,15 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
fp = file(fname)
parser = PDFParser(doc, fp, debug=debug)
interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
outfp.write('<document>\n')
for (i,page) in enumerate(doc.get_pages(debug=debug)):
if pages and (i not in pages): continue
device.reset()
interpreter.process_page(page)
device.dump(outfp, codec)
fp.close()
device.dump(outfp, codec)
device.close()
outfp.write('</document>\n')
return

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python
import sys, re
import sys
stderr = sys.stderr
from struct import pack, unpack
try:
@ -292,8 +292,18 @@ class PDFCIDFont(PDFFont):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
self.cidsysteminfo.get('Ordering', 'unknown'))
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
descriptor = dict_value(spec['FontDescriptor'])
try:
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
except KeyError:
if STRICT:
raise PDFFontError('cmap is missing')
self.cmap = None
try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
if STRICT:
raise PDFFontError('FontDescriptor is missing')
descriptor = {}
ttf = None
if 'FontFile2' in descriptor:
self.fontfile = stream_value(descriptor.get('FontFile2'))
@ -486,9 +496,6 @@ class PDFContentParser(PSStackParser):
PSStackParser.__init__(self, None, debug=debug)
return
def __repr__(self):
return '<PDFParser: linepos=%d>' % self.linepos
def fillfp(self):
if not self.fp:
if self.istream < len(self.streams):
@ -611,9 +618,9 @@ class PDFPageInterpreter:
name = literal_name(spec[0])
else:
name = literal_name(spec)
if name == 'ICCBased':
if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN':
elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, len(list_value(spec[1])))
else:
return PREDEFINED_COLORSPACE[name]
@ -935,7 +942,7 @@ class PDFPageInterpreter:
if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return
if xobj.dic['Subtype'] == LITERAL_FORM:
if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj
interpreter = PDFPageInterpreter(self.rsrc, self.device)

View File

@ -30,6 +30,7 @@ class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass
class PDFNotImplementedError(PSException): pass
# some predefined literals and keywords.
@ -40,11 +41,13 @@ LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
@ -184,12 +187,13 @@ class PDFStream:
return
def __repr__(self):
return '<PDFStream: %r>' % (self.dic)
return '<PDFStream(%d): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
def decode(self):
assert self.data == None and self.rawdata != None
data = self.rawdata
if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic:
self.data = data
@ -203,31 +207,32 @@ class PDFStream:
import zlib
# will get errors if the document is encrypted.
data = zlib.decompress(data)
# apply predictors
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
if pred != 12:
raise PDFValueError('Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
if pred == '\x02':
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
buf += ent1
ent0 = ent1
data = buf
if f == LITERAL_CRYPT:
elif f == LITERAL_LZW_DECODE:
raise PDFNotImplementedError('LZWDecode is currently unsupported.')
elif f == LITERAL_CRYPT:
raise PDFEncryptionError
else:
if STRICT:
raise PDFValueError('Invalid filter spec: %r' % f)
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors
params = self.dic.get('DecodeParms', {})
if 'Predictor' in params:
pred = int_value(params['Predictor'])
if pred:
if pred != 12:
raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns'])
buf = ''
ent0 = '\x00' * columns
for i in xrange(0, len(data), columns+1):
pred = data[i]
ent1 = data[i+1:i+1+columns]
if pred == '\x02':
ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) )
buf += ent1
ent0 = ent1
data = buf
self.data = data
self.rawdata = None
return
@ -274,18 +279,19 @@ class PDFXRef:
def __init__(self, parser):
while 1:
(_, line) = parser.nextline()
(pos, line) = parser.nextline()
if not line:
if STRICT:
raise PDFSyntaxError('premature eof: %r' % parser)
break
line = line.strip()
f = line.split(' ')
if len(f) != 2:
if line != 'trailer':
if STRICT:
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
if line.startswith('trailer'):
parser.seek(pos)
break
f = line.strip().split(' ')
if len(f) != 2:
if STRICT:
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
continue
(start, nobjs) = map(long, f)
self.objid0 = start
self.objid1 = start+nobjs
@ -300,7 +306,9 @@ class PDFXRef:
(pos, genno, use) = f
self.offsets.append((int(genno), long(pos), use))
# read trailer
(_, dic) = parser.nextobject()
(_,kwd) = parser.nexttoken()
assert kwd == KEYWORD_TRAILER
(_,dic) = parser.nextobject()
self.trailer = dict_value(dic)
return
@ -319,9 +327,9 @@ class PDFXRef:
class PDFXRefStream:
def __init__(self, parser):
(_,objid) = parser.nextobject()
(_,genno) = parser.nextobject()
parser.nextobject()
(_,objid) = parser.nexttoken()
(_,genno) = parser.nexttoken()
(_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject()
if STRICT:
if stream.dic['Type'] != LITERAL_XREF:
@ -367,6 +375,7 @@ class PDFDocument:
self.parser = None
self.encryption = None
self.decipher = None
self.is_printable = self.is_modifiable = self.is_extractable = True
return
def set_parser(self, parser):
@ -401,9 +410,9 @@ class PDFDocument:
raise PDFEncryptionError('unknown revision: %r' % R)
U = str_value(param['U'])
P = int_value(param['P'])
is_printable = bool(P & 4)
is_modifiable = bool(P & 8)
is_extractable = bool(P & 16)
self.is_printable = bool(P & 4)
self.is_modifiable = bool(P & 8)
self.is_extractable = bool(P & 16)
# Algorithm 3.2
password = (password+PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
@ -411,7 +420,8 @@ class PDFDocument:
hash.update(struct.pack('<L', P)) # 4
hash.update(docid[0]) # 5
if 4 <= R:
raise NotImplementedError # 6
# 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
if 3 <= R:
# 8
for _ in xrange(50):
@ -429,8 +439,6 @@ class PDFDocument:
k = ''.join( chr(c ^ i) for c in key )
x = Arcfour(k).process(x)
u1 = x+x # 32bytes total
else:
raise PDFEncryptionError('unknown revision: %r' % R)
if R == 2:
is_authenticated = (u1 == U)
else:
@ -485,10 +493,10 @@ class PDFDocument:
obj.set_objid(objid, 0)
else:
self.parser.seek(index)
(_,objid1) = self.parser.nextobject() # objid
(_,genno) = self.parser.nextobject() # genno
assert objid1 == objid
(_,kwd) = self.parser.nextobject()
(_,objid1) = self.parser.nexttoken() # objid
(_,genno) = self.parser.nexttoken() # genno
assert objid1 == objid, (objid, objid1)
(_,kwd) = self.parser.nexttoken()
if kwd != KEYWORD_OBJ:
raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
(_,obj) = self.parser.nextobject()
@ -582,7 +590,11 @@ class PDFParser(PSStackParser):
self.seek(pos+objlen)
while 1:
(linepos, line) = self.nextline()
if line.startswith('endstream'): break
if 'endstream' in line:
i = line.index('endstream')
objlen += i
data += line[:i]
break
objlen += len(line)
data += line
if 1 <= self.debug:
@ -597,7 +609,7 @@ class PDFParser(PSStackParser):
return
def find_xref(self):
# find the first xref table
# search for the last xref table by scanning the file backwards.
prev = None
for line in self.revreadlines():
line = line.strip()
@ -620,19 +632,19 @@ class PDFParser(PSStackParser):
self.find_xref()
while 1:
# read xref table
(linepos, line) = self.nextline()
(pos, token) = self.nexttoken()
if 2 <= self.debug:
print >>stderr, 'read_xref: %r' % line
if line[0].isdigit():
print >>stderr, 'read_xref: %r' % token
if isinstance(token, int):
# XRefStream: PDF-1.5
self.seek(linepos)
self.seek(pos)
self.reset()
xref = PDFXRefStream(self)
else:
if line.strip() != 'xref':
if token != KEYWORD_XREF:
if STRICT:
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' %
(linepos, line))
raise PDFSyntaxError('xref not found: pos=%d, token=%r' %
(pos, token))
xref = PDFXRef(self)
yield xref
trailer = xref.trailer

View File

@ -424,6 +424,11 @@ class PSStackParser(PSBaseParser):
self.results = []
return
def seek(self, pos):
PSBaseParser.seek(self, pos)
self.reset()
return
def push(self, *objs):
self.curstack.extend(objs)
return