diff --git a/pdf2txt.py b/pdf2txt.py index a6bd115..da15ea0 100755 --- a/pdf2txt.py +++ b/pdf2txt.py @@ -7,31 +7,26 @@ from pdfinterp import PDFDevice, PDFResourceManager, \ PDFPageInterpreter, PDFUnicodeNotDefined, \ mult_matrix, apply_matrix from cmap import CMapDB -from extent import Rect, ExtSet, ExtGrid ## PageItem ## class PageItem: - GRID_SIZE = 20 - def __init__(self, id, (x0,y0,x1,y1), rotate=0): self.id = id - self.bbox = Rect(x0, y0, x1-x0, y1-y0) + self.bbox = (x0, y0, x1-x0, y1-y0) self.rotate = rotate - self.grid = ExtGrid(self.GRID_SIZE) self.objs = [] return def __repr__(self): - bbox = self.bbox - return ('' % - (self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1, self.rotate)) + bbox = '%d,%d,%d,%d' % self.bbox + return ('' % + (self.id, bbox, self.rotate)) def add(self, obj): self.objs.append(obj) - self.grid.add(obj.bbox, obj) return def dump(self, outfp, codec): @@ -41,23 +36,14 @@ class PageItem: outfp.write('\n') return - def fuse(self): - for obj1 in self.objs: - f = (lambda obj: obj.bbox) - for rect in obj1.search_range(): - neighbors = [ obj2 for obj2 in self.grid.get(rect, f) if obj2 is not obj1 ] - #print obj1.bbox, obj1.text.encode('euc-jp','ignore'), rect, [ obj.bbox for obj in neighbors ] - return - ## FigureItem ## class FigureItem(PageItem): def __repr__(self): - bbox = self.bbox - return ('
' % - (self.id, bbox.x0,bbox.y0,bbox.x1,bbox.y1)) + bbox = '%d,%d,%d,%d' % self.bbox + return ('
' % (self.id, bbox)) def dump(self, outfp, codec): outfp.write(repr(self)+'\n') @@ -66,9 +52,6 @@ class FigureItem(PageItem): outfp.write('
\n') return - def search_range(self): - return [] - ## TextItem ## @@ -86,12 +69,12 @@ class TextItem: self.direction = 1 (_,ascent) = apply_matrix((a,b,c,d,0,0), (0,font.ascent*size*0.001)) (_,descent) = apply_matrix((a,b,c,d,0,0), (0,font.descent*size*0.001)) - self.bbox = Rect(tx, ty+descent, self.width, self.size) + self.bbox = (tx, ty+descent, self.width, self.size) else: self.direction = 2 mindisp = min( d for (d,_) in text ) (mindisp,_) = apply_matrix((a,b,c,d,0,0), (mindisp*size*0.001,0)) - self.bbox = Rect(tx-mindisp, ty+self.width, self.size, self.width) + self.bbox = (tx-mindisp, ty+self.width, self.size, self.width) self.text = ''.join( c for (_,c) in text ) return @@ -107,12 +90,6 @@ class TextItem: outfp.write('\n') return - def search_range(self): - if self.direction == 1: - return [ Rect(self.bbox.x1, self.bbox.y0, self.size, self.size) ] - else: - return [ Rect(self.bbox.x0, self.bbox.y0-self.size, self.size, self.size) ] - ## TextConverter ## @@ -120,6 +97,10 @@ class TextConverter(PDFDevice): def __init__(self, rsrc, debug=0): PDFDevice.__init__(self, rsrc, debug=debug) + self.reset() + return + + def reset(self): self.pages = [] self.stack = [] return @@ -173,11 +154,8 @@ class TextConverter(PDFDevice): return def dump(self, outfp, codec): - outfp.write('\n') for page in self.pages: - #page.fuse() page.dump(outfp, codec) - outfp.write('\n') return @@ -188,12 +166,15 @@ def pdf2txt(outfp, rsrc, fname, pages, codec, debug=0): fp = file(fname) parser = PDFParser(doc, fp, debug=debug) interpreter = PDFPageInterpreter(rsrc, device, debug=debug) + outfp.write('\n') for (i,page) in enumerate(doc.get_pages(debug=debug)): if pages and (i not in pages): continue + device.reset() interpreter.process_page(page) + device.dump(outfp, codec) fp.close() - device.dump(outfp, codec) device.close() + outfp.write('\n') return diff --git a/pdfinterp.py b/pdfinterp.py index ab0bf8f..efaddeb 100644 --- a/pdfinterp.py +++ b/pdfinterp.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -import sys, re +import sys stderr = sys.stderr from struct import pack, unpack try: @@ -292,8 +292,18 @@ class PDFCIDFont(PDFFont): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), self.cidsysteminfo.get('Ordering', 'unknown')) - self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding'])) - descriptor = dict_value(spec['FontDescriptor']) + try: + self.cmap = CMapDB.get_cmap(literal_name(spec['Encoding'])) + except KeyError: + if STRICT: + raise PDFFontError('cmap is missing') + self.cmap = None + try: + descriptor = dict_value(spec['FontDescriptor']) + except KeyError: + if STRICT: + raise PDFFontError('FontDescriptor is missing') + descriptor = {} ttf = None if 'FontFile2' in descriptor: self.fontfile = stream_value(descriptor.get('FontFile2')) @@ -486,9 +496,6 @@ class PDFContentParser(PSStackParser): PSStackParser.__init__(self, None, debug=debug) return - def __repr__(self): - return '' % self.linepos - def fillfp(self): if not self.fp: if self.istream < len(self.streams): @@ -611,9 +618,9 @@ class PDFPageInterpreter: name = literal_name(spec[0]) else: name = literal_name(spec) - if name == 'ICCBased': + if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec): return ColorSpace(name, stream_value(spec[1]).dic['N']) - elif name == 'DeviceN': + elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec): return ColorSpace(name, len(list_value(spec[1]))) else: return PREDEFINED_COLORSPACE[name] @@ -935,7 +942,7 @@ class PDFPageInterpreter: if STRICT: raise PDFInterpreterError('Undefined xobject id: %r' % xobjid) return - if xobj.dic['Subtype'] == LITERAL_FORM: + if xobj.dic.get('Subtype') == LITERAL_FORM and 'BBox' in xobj.dic: if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj interpreter = PDFPageInterpreter(self.rsrc, self.device) diff --git a/pdfparser.py b/pdfparser.py index 308db76..97093c2 100755 --- a/pdfparser.py +++ b/pdfparser.py @@ -30,6 +30,7 @@ class PDFEncryptionError(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass class PDFTypeError(PDFException): pass class PDFValueError(PDFException): pass +class PDFNotImplementedError(PSException): pass # some predefined literals and keywords. @@ -40,11 +41,13 @@ LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') LITERAL_CRYPT = PSLiteralTable.intern('Crypt') LITERAL_FLATE_DECODE = PSLiteralTable.intern('FlateDecode') +LITERAL_LZW_DECODE = PSLiteralTable.intern('LZWDecode') KEYWORD_R = PSKeywordTable.intern('R') KEYWORD_OBJ = PSKeywordTable.intern('obj') KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') KEYWORD_STREAM = PSKeywordTable.intern('stream') KEYWORD_XREF = PSKeywordTable.intern('xref') +KEYWORD_TRAILER = PSKeywordTable.intern('trailer') KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' @@ -184,12 +187,13 @@ class PDFStream: return def __repr__(self): - return '' % (self.dic) + return '' % (self.objid, len(self.rawdata), self.dic) def decode(self): assert self.data == None and self.rawdata != None data = self.rawdata if self.decipher: + # Handle encryption data = self.decipher(self.objid, self.genno, data) if 'Filter' not in self.dic: self.data = data @@ -203,31 +207,32 @@ class PDFStream: import zlib # will get errors if the document is encrypted. data = zlib.decompress(data) - # apply predictors - params = self.dic.get('DecodeParms', {}) - if 'Predictor' in params: - pred = int_value(params['Predictor']) - if pred: - if pred != 12: - raise PDFValueError('Unsupported predictor: %r' % pred) - if 'Columns' not in params: - raise PDFValueError('Columns undefined for predictor=12') - columns = int_value(params['Columns']) - buf = '' - ent0 = '\x00' * columns - for i in xrange(0, len(data), columns+1): - pred = data[i] - ent1 = data[i+1:i+1+columns] - if pred == '\x02': - ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) - buf += ent1 - ent0 = ent1 - data = buf - if f == LITERAL_CRYPT: + elif f == LITERAL_LZW_DECODE: + raise PDFNotImplementedError('LZWDecode is currently unsupported.') + elif f == LITERAL_CRYPT: raise PDFEncryptionError else: - if STRICT: - raise PDFValueError('Invalid filter spec: %r' % f) + raise PDFNotImplementedError('Unsupported filter: %r' % f) + # apply predictors + params = self.dic.get('DecodeParms', {}) + if 'Predictor' in params: + pred = int_value(params['Predictor']) + if pred: + if pred != 12: + raise PDFNotImplementedError('Unsupported predictor: %r' % pred) + if 'Columns' not in params: + raise PDFValueError('Columns undefined for predictor=12') + columns = int_value(params['Columns']) + buf = '' + ent0 = '\x00' * columns + for i in xrange(0, len(data), columns+1): + pred = data[i] + ent1 = data[i+1:i+1+columns] + if pred == '\x02': + ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) + buf += ent1 + ent0 = ent1 + data = buf self.data = data self.rawdata = None return @@ -274,18 +279,19 @@ class PDFXRef: def __init__(self, parser): while 1: - (_, line) = parser.nextline() + (pos, line) = parser.nextline() if not line: if STRICT: raise PDFSyntaxError('premature eof: %r' % parser) break - line = line.strip() - f = line.split(' ') - if len(f) != 2: - if line != 'trailer': - if STRICT: - raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line)) + if line.startswith('trailer'): + parser.seek(pos) break + f = line.strip().split(' ') + if len(f) != 2: + if STRICT: + raise PDFSyntaxError('trailer not found: %r: line=%r' % (parser, line)) + continue (start, nobjs) = map(long, f) self.objid0 = start self.objid1 = start+nobjs @@ -300,7 +306,9 @@ class PDFXRef: (pos, genno, use) = f self.offsets.append((int(genno), long(pos), use)) # read trailer - (_, dic) = parser.nextobject() + (_,kwd) = parser.nexttoken() + assert kwd == KEYWORD_TRAILER + (_,dic) = parser.nextobject() self.trailer = dict_value(dic) return @@ -319,9 +327,9 @@ class PDFXRef: class PDFXRefStream: def __init__(self, parser): - (_,objid) = parser.nextobject() - (_,genno) = parser.nextobject() - parser.nextobject() + (_,objid) = parser.nexttoken() + (_,genno) = parser.nexttoken() + (_,kwd) = parser.nexttoken() (_,stream) = parser.nextobject() if STRICT: if stream.dic['Type'] != LITERAL_XREF: @@ -367,6 +375,7 @@ class PDFDocument: self.parser = None self.encryption = None self.decipher = None + self.is_printable = self.is_modifiable = self.is_extractable = True return def set_parser(self, parser): @@ -401,9 +410,9 @@ class PDFDocument: raise PDFEncryptionError('unknown revision: %r' % R) U = str_value(param['U']) P = int_value(param['P']) - is_printable = bool(P & 4) - is_modifiable = bool(P & 8) - is_extractable = bool(P & 16) + self.is_printable = bool(P & 4) + self.is_modifiable = bool(P & 8) + self.is_extractable = bool(P & 16) # Algorithm 3.2 password = (password+PASSWORD_PADDING)[:32] # 1 hash = md5.md5(password) # 2 @@ -411,7 +420,8 @@ class PDFDocument: hash.update(struct.pack('>stderr, 'read_xref: %r' % line - if line[0].isdigit(): + print >>stderr, 'read_xref: %r' % token + if isinstance(token, int): # XRefStream: PDF-1.5 - self.seek(linepos) + self.seek(pos) self.reset() xref = PDFXRefStream(self) else: - if line.strip() != 'xref': + if token != KEYWORD_XREF: if STRICT: - raise PDFSyntaxError('xref not found: linepos=%d, line=%r' % - (linepos, line)) + raise PDFSyntaxError('xref not found: pos=%d, token=%r' % + (pos, token)) xref = PDFXRef(self) yield xref trailer = xref.trailer diff --git a/psparser.py b/psparser.py index 63175bb..d814575 100644 --- a/psparser.py +++ b/psparser.py @@ -424,6 +424,11 @@ class PSStackParser(PSBaseParser): self.results = [] return + def seek(self, pos): + PSBaseParser.seek(self, pos) + self.reset() + return + def push(self, *objs): self.curstack.extend(objs) return