bug fixes

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@20 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2008-04-27 03:16:27 +00:00
parent 1300046181
commit b1163b69bb
4 changed files with 103 additions and 98 deletions

View File

@ -7,31 +7,26 @@ from pdfinterp import PDFDevice, PDFResourceManager, \
PDFPageInterpreter, PDFUnicodeNotDefined, \ PDFPageInterpreter, PDFUnicodeNotDefined, \
mult_matrix, apply_matrix mult_matrix, apply_matrix
from cmap import CMapDB from cmap import CMapDB
from extent import Rect, ExtSet, ExtGrid
## PageItem ## PageItem
## ##
class PageItem: class PageItem:
GRID_SIZE = 20
def __init__(self, id, (x0,y0,x1,y1), rotate=0): def __init__(self, id, (x0,y0,x1,y1), rotate=0):
self.id = id self.id = id
self.bbox = Rect(x0, y0, x1-x0, y1-y0) self.bbox = (x0, y0, x1-x0, y1-y0)
self.rotate = rotate self.rotate = rotate
self.grid = ExtGrid(self.GRID_SIZE)
self.objs = [] self.objs = []
return return
def __repr__(self): def __repr__(self):
bbox = self.bbox bbox = '%d,%d,%d,%d' % self.bbox
return ('<page id=%r bbox="%d,%d,%d,%d" rotate="%d">' % return ('<page id=%r bbox="%s" rotate="%d">' %
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate)) (self.id, bbox, self.rotate))
def add(self, obj): def add(self, obj):
self.objs.append(obj) self.objs.append(obj)
self.grid.add(obj.bbox, obj)
return return
def dump(self, outfp, codec): def dump(self, outfp, codec):
@ -41,23 +36,14 @@ class PageItem:
outfp.write('</page>\n') outfp.write('</page>\n')
return return
def fuse(self):
for obj1 in self.objs:
f = (lambda obj: obj.bbox)
for rect in obj1.search_range():
neighbors = [ obj2 for obj2 in self.grid.get(rect, f) if obj2 is not obj1 ]
#print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ]
return
## FigureItem ## FigureItem
## ##
class FigureItem(PageItem): class FigureItem(PageItem):
def __repr__(self): def __repr__(self):
bbox = self.bbox bbox = '%d,%d,%d,%d' % self.bbox
return ('<figure id=%r bbox="%d,%d,%d,%d">' % return ('<figure id=%r bbox="%s">' % (self.id, bbox))
(self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1))
def dump(self, outfp, codec): def dump(self, outfp, codec):
outfp.write(repr(self)+'\n') outfp.write(repr(self)+'\n')
@ -66,9 +52,6 @@ class FigureItem(PageItem):
outfp.write('</figure>\n') outfp.write('</figure>\n')
return return
def search_range(self):
return []
## TextItem ## TextItem
## ##
@ -86,12 +69,12 @@ class TextItem:
self.direction = 1 self.direction = 1
(_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001)) (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001))
(_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001)) (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001))
self.bbox = Rect(tx, ty+descent, self.width, self.size) self.bbox = (tx, ty+descent, self.width, self.size)
else: else:
self.direction = 2 self.direction = 2
mindisp = min( d for (d,_) in text ) mindisp = min( d for (d,_) in text )
(mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0)) (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0))
self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width) self.bbox = (tx-mindisp, ty+self.width, self.size, self.width)
self.text = ''.join( c for (_,c) in text ) self.text = ''.join( c for (_,c) in text )
return return
@ -107,12 +90,6 @@ class TextItem:
outfp.write('</text>\n') outfp.write('</text>\n')
return return
def search_range(self):
if self.direction == 1:
return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ]
else:
return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ]
## TextConverter ## TextConverter
## ##
@ -120,6 +97,10 @@ class TextConverter(PDFDevice):
def __init__(self, rsrc, debug=0): def __init__(self, rsrc, debug=0):
PDFDevice.__init__(self, rsrc, debug=debug) PDFDevice.__init__(self, rsrc, debug=debug)
self.reset()
return
def reset(self):
self.pages = [] self.pages = []
self.stack = [] self.stack = []
return return
@ -173,11 +154,8 @@ class TextConverter(PDFDevice):
return return
def dump(self, outfp, codec): def dump(self, outfp, codec):
outfp.write('<document>\n')
for page in self.pages: for page in self.pages:
#page.fuse()
page.dump(outfp, codec) page.dump(outfp, codec)
outfp.write('</document>\n')
return return
@ -188,12 +166,15 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0):
fp = file(fname) fp = file(fname)
parser = PDFParser(doc, fp, debug=debug) parser = PDFParser(doc, fp, debug=debug)
interpreter = PDFPageInterpreter(rsrc, device, debug=debug) interpreter = PDFPageInterpreter(rsrc, device, debug=debug)
outfp.write('<document>\n')
for (i,page) in enumerate(doc.get_pages(debug=debug)): for (i,page) in enumerate(doc.get_pages(debug=debug)):
if pages and (i not in pages): continue if pages and (i not in pages): continue
device.reset()
interpreter.process_page(page) interpreter.process_page(page)
fp.close()
device.dump(outfp, codec) device.dump(outfp, codec)
fp.close()
device.close() device.close()
outfp.write('</document>\n')
return return

View File

@ -1,5 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
import sys, re import sys
stderr = sys.stderr stderr = sys.stderr
from struct import pack, unpack from struct import pack, unpack
try: try:
@ -292,8 +292,18 @@ class PDFCIDFont(PDFFont):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'),
self.cidsysteminfo.get('Ordering', 'unknown')) self.cidsysteminfo.get('Ordering', 'unknown'))
try:
self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding'])) self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding']))
except KeyError:
if STRICT:
raise PDFFontError('cmap is missing')
self.cmap = None
try:
descriptor = dict_value(spec['FontDescriptor']) descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
if STRICT:
raise PDFFontError('FontDescriptor is missing')
descriptor = {}
ttf = None ttf = None
if 'FontFile2' in descriptor: if 'FontFile2' in descriptor:
self.fontfile = stream_value(descriptor.get('FontFile2')) self.fontfile = stream_value(descriptor.get('FontFile2'))
@ -486,9 +496,6 @@ class PDFContentParser(PSStackParser):
PSStackParser.__init__(self, None, debug=debug) PSStackParser.__init__(self, None, debug=debug)
return return
def __repr__(self):
return '<PDFParser: linepos=%d>' % self.linepos
def fillfp(self): def fillfp(self):
if not self.fp: if not self.fp:
if self.istream < len(self.streams): if self.istream < len(self.streams):
@ -611,9 +618,9 @@ class PDFPageInterpreter:
name = literal_name(spec[0]) name = literal_name(spec[0])
else: else:
name = literal_name(spec) name = literal_name(spec)
if name == 'ICCBased': if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, stream_value(spec[1]).dic['N']) return ColorSpace(name, stream_value(spec[1]).dic['N'])
elif name == 'DeviceN': elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
return ColorSpace(name, len(list_value(spec[1]))) return ColorSpace(name, len(list_value(spec[1])))
else: else:
return PREDEFINED_COLORSPACE[name] return PREDEFINED_COLORSPACE[name]
@ -935,7 +942,7 @@ class PDFPageInterpreter:
if STRICT: if STRICT:
raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) raise PDFInterpreterError('Undefined xobject id: %r' % xobjid)
return return
if xobj.dic['Subtype'] == LITERAL_FORM: if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic:
if 1 <= self.debug: if 1 <= self.debug:
print >>stderr, 'Processing xobj: %r' % xobj print >>stderr, 'Processing xobj: %r' % xobj
interpreter = PDFPageInterpreter(self.rsrc, self.device) interpreter = PDFPageInterpreter(self.rsrc, self.device)

View File

@ -30,6 +30,7 @@ class PDFEncryptionError(PDFException): pass
class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFPasswordIncorrect(PDFEncryptionError): pass
class PDFTypeError(PDFException): pass class PDFTypeError(PDFException): pass
class PDFValueError(PDFException): pass class PDFValueError(PDFException): pass
class PDFNotImplementedError(PSException): pass
# some predefined literals and keywords. # some predefined literals and keywords.
@ -40,11 +41,13 @@ LITERAL_PAGES = PSLiteralTable.intern('Pages')
LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERAL_CRYPT = PSLiteralTable.intern('Crypt')
LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode')
LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode')
KEYWORD_R = PSKeywordTable.intern('R') KEYWORD_R = PSKeywordTable.intern('R')
KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_OBJ = PSKeywordTable.intern('obj')
KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj')
KEYWORD_STREAM = PSKeywordTable.intern('stream') KEYWORD_STREAM = PSKeywordTable.intern('stream')
KEYWORD_XREF = PSKeywordTable.intern('xref') KEYWORD_XREF = PSKeywordTable.intern('xref')
KEYWORD_TRAILER = PSKeywordTable.intern('trailer')
KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') KEYWORD_STARTXREF = PSKeywordTable.intern('startxref')
PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz'
@ -184,12 +187,13 @@ class PDFStream:
return return
def __repr__(self): def __repr__(self):
return '<PDFStream: %r>' % (self.dic) return '<PDFStream(%d): raw=%d, %r>' % (self.objid, len(self.rawdata), self.dic)
def decode(self): def decode(self):
assert self.data == None and self.rawdata != None assert self.data == None and self.rawdata != None
data = self.rawdata data = self.rawdata
if self.decipher: if self.decipher:
# Handle encryption
data = self.decipher(self.objid, self.genno, data) data = self.decipher(self.objid, self.genno, data)
if 'Filter' not in self.dic: if 'Filter' not in self.dic:
self.data = data self.data = data
@ -203,13 +207,19 @@ class PDFStream:
import zlib import zlib
# will get errors if the document is encrypted. # will get errors if the document is encrypted.
data = zlib.decompress(data) data = zlib.decompress(data)
elif f == LITERAL_LZW_DECODE:
raise PDFNotImplementedError('LZWDecode is currently unsupported.')
elif f == LITERAL_CRYPT:
raise PDFEncryptionError
else:
raise PDFNotImplementedError('Unsupported filter: %r' % f)
# apply predictors # apply predictors
params = self.dic.get('DecodeParms', {}) params = self.dic.get('DecodeParms', {})
if 'Predictor' in params: if 'Predictor' in params:
pred = int_value(params['Predictor']) pred = int_value(params['Predictor'])
if pred: if pred:
if pred != 12: if pred != 12:
raise PDFValueError('Unsupported predictor: %r' % pred) raise PDFNotImplementedError('Unsupported predictor: %r' % pred)
if 'Columns' not in params: if 'Columns' not in params:
raise PDFValueError('Columns undefined for predictor=12') raise PDFValueError('Columns undefined for predictor=12')
columns = int_value(params['Columns']) columns = int_value(params['Columns'])
@ -223,11 +233,6 @@ class PDFStream:
buf += ent1 buf += ent1
ent0 = ent1 ent0 = ent1
data = buf data = buf
if f == LITERAL_CRYPT:
raise PDFEncryptionError
else:
if STRICT:
raise PDFValueError('Invalid filter spec: %r' % f)
self.data = data self.data = data
self.rawdata = None self.rawdata = None
return return
@ -274,18 +279,19 @@ class PDFXRef:
def __init__(self, parser): def __init__(self, parser):
while 1: while 1:
(_, line) = parser.nextline() (pos, line) = parser.nextline()
if not line: if not line:
if STRICT: if STRICT:
raise PDFSyntaxError('premature eof: %r' % parser) raise PDFSyntaxError('premature eof: %r' % parser)
break break
line = line.strip() if line.startswith('trailer'):
f = line.split(' ') parser.seek(pos)
break
f = line.strip().split(' ')
if len(f) != 2: if len(f) != 2:
if line != 'trailer':
if STRICT: if STRICT:
raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line)) raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line))
break continue
(start, nobjs) = map(long, f) (start, nobjs) = map(long, f)
self.objid0 = start self.objid0 = start
self.objid1 = start+nobjs self.objid1 = start+nobjs
@ -300,7 +306,9 @@ class PDFXRef:
(pos, genno, use) = f (pos, genno, use) = f
self.offsets.append((int(genno), long(pos), use)) self.offsets.append((int(genno), long(pos), use))
# read trailer # read trailer
(_, dic) = parser.nextobject() (_,kwd) = parser.nexttoken()
assert kwd == KEYWORD_TRAILER
(_,dic) = parser.nextobject()
self.trailer = dict_value(dic) self.trailer = dict_value(dic)
return return
@ -319,9 +327,9 @@ class PDFXRef:
class PDFXRefStream: class PDFXRefStream:
def __init__(self, parser): def __init__(self, parser):
(_,objid) = parser.nextobject() (_,objid) = parser.nexttoken()
(_,genno) = parser.nextobject() (_,genno) = parser.nexttoken()
parser.nextobject() (_,kwd) = parser.nexttoken()
(_,stream) = parser.nextobject() (_,stream) = parser.nextobject()
if STRICT: if STRICT:
if stream.dic['Type'] != LITERAL_XREF: if stream.dic['Type'] != LITERAL_XREF:
@ -367,6 +375,7 @@ class PDFDocument:
self.parser = None self.parser = None
self.encryption = None self.encryption = None
self.decipher = None self.decipher = None
self.is_printable = self.is_modifiable = self.is_extractable = True
return return
def set_parser(self, parser): def set_parser(self, parser):
@ -401,9 +410,9 @@ class PDFDocument:
raise PDFEncryptionError('unknown revision: %r' % R) raise PDFEncryptionError('unknown revision: %r' % R)
U = str_value(param['U']) U = str_value(param['U'])
P = int_value(param['P']) P = int_value(param['P'])
is_printable = bool(P & 4) self.is_printable = bool(P & 4)
is_modifiable = bool(P & 8) self.is_modifiable = bool(P & 8)
is_extractable = bool(P & 16) self.is_extractable = bool(P & 16)
# Algorithm 3.2 # Algorithm 3.2
password = (password+PASSWORD_PADDING)[:32] # 1 password = (password+PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2 hash = md5.md5(password) # 2
@ -411,7 +420,8 @@ class PDFDocument:
hash.update(struct.pack('<L', P)) # 4 hash.update(struct.pack('<L', P)) # 4
hash.update(docid[0]) # 5 hash.update(docid[0]) # 5
if 4 <= R: if 4 <= R:
raise NotImplementedError # 6 # 6
raise PDFNotImplementedError('Revision 4 encryption is currently unsupported')
if 3 <= R: if 3 <= R:
# 8 # 8
for _ in xrange(50): for _ in xrange(50):
@ -429,8 +439,6 @@ class PDFDocument:
k = ''.join( chr(c ^ i) for c in key ) k = ''.join( chr(c ^ i) for c in key )
x = Arcfour(k).process(x) x = Arcfour(k).process(x)
u1 = x+x # 32bytes total u1 = x+x # 32bytes total
else:
raise PDFEncryptionError('unknown revision: %r' % R)
if R == 2: if R == 2:
is_authenticated = (u1 == U) is_authenticated = (u1 == U)
else: else:
@ -485,10 +493,10 @@ class PDFDocument:
obj.set_objid(objid, 0) obj.set_objid(objid, 0)
else: else:
self.parser.seek(index) self.parser.seek(index)
(_,objid1) = self.parser.nextobject() # objid (_,objid1) = self.parser.nexttoken() # objid
(_,genno) = self.parser.nextobject() # genno (_,genno) = self.parser.nexttoken() # genno
assert objid1 == objid assert objid1 == objid, (objid, objid1)
(_,kwd) = self.parser.nextobject() (_,kwd) = self.parser.nexttoken()
if kwd != KEYWORD_OBJ: if kwd != KEYWORD_OBJ:
raise PDFSyntaxError('invalid obj spec: offset=%r' % index) raise PDFSyntaxError('invalid obj spec: offset=%r' % index)
(_,obj) = self.parser.nextobject() (_,obj) = self.parser.nextobject()
@ -582,7 +590,11 @@ class PDFParser(PSStackParser):
self.seek(pos+objlen) self.seek(pos+objlen)
while 1: while 1:
(linepos, line) = self.nextline() (linepos, line) = self.nextline()
if line.startswith('endstream'): break if 'endstream' in line:
i = line.index('endstream')
objlen += i
data += line[:i]
break
objlen += len(line) objlen += len(line)
data += line data += line
if 1 <= self.debug: if 1 <= self.debug:
@ -597,7 +609,7 @@ class PDFParser(PSStackParser):
return return
def find_xref(self): def find_xref(self):
# find the first xref table # search the last xref table by scanning the file backwards.
prev = None prev = None
for line in self.revreadlines(): for line in self.revreadlines():
line = line.strip() line = line.strip()
@ -620,19 +632,19 @@ class PDFParser(PSStackParser):
self.find_xref() self.find_xref()
while 1: while 1:
# read xref table # read xref table
(linepos, line) = self.nextline() (pos, token) = self.nexttoken()
if 2 <= self.debug: if 2 <= self.debug:
print >>stderr, 'read_xref: %r' % line print >>stderr, 'read_xref: %r' % token
if line[0].isdigit(): if isinstance(token, int):
# XRefStream: PDF-1.5 # XRefStream: PDF-1.5
self.seek(linepos) self.seek(pos)
self.reset() self.reset()
xref = PDFXRefStream(self) xref = PDFXRefStream(self)
else: else:
if line.strip() != 'xref': if token != KEYWORD_XREF:
if STRICT: if STRICT:
raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % raise PDFSyntaxError('xref not found: pos=%d, token=%r' %
(linepos, line)) (pos, token))
xref = PDFXRef(self) xref = PDFXRef(self)
yield xref yield xref
trailer = xref.trailer trailer = xref.trailer

View File

@ -424,6 +424,11 @@ class PSStackParser(PSBaseParser):
self.results = [] self.results = []
return return
def seek(self, pos):
PSBaseParser.seek(self, pos)
self.reset()
return
def push(self, *objs): def push(self, *objs):
self.curstack.extend(objs) self.curstack.extend(objs)
return return