From c41c2793215b21df5b8650926dccf196ac0cf6d7 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Sat, 10 Jan 2009 09:14:46 +0000 Subject: [PATCH] tmp git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@57 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/page.py | 45 ++++- pdflib/pdf2txt.py | 6 +- pdflib/pdfcolor.py | 35 ++++ pdflib/pdffont.py | 341 ++++++++++++++++++++++++++++++++++ pdflib/pdfinterp.py | 434 +++----------------------------------------- pdflib/pdfparser.py | 352 +++++++---------------------------- pdflib/pdftypes.py | 222 ++++++++++++++++++++++ pdflib/psparser.py | 3 +- pdflib/utils.py | 2 + 9 files changed, 740 insertions(+), 700 deletions(-) create mode 100644 pdflib/pdfcolor.py create mode 100644 pdflib/pdffont.py create mode 100644 pdflib/pdftypes.py diff --git a/pdflib/page.py b/pdflib/page.py index ad75bdd..b8f5514 100644 --- a/pdflib/page.py +++ b/pdflib/page.py @@ -2,10 +2,53 @@ import sys stdout = sys.stdout stderr = sys.stderr -from pdflib.pdfinterp import PDFDevice, PDFUnicodeNotDefined +from pdflib.pdffont import PDFUnicodeNotDefined from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix +## PDFDevice +## +class PDFDevice(object): + + debug = 0 + + def __init__(self, rsrc): + self.rsrc = rsrc + self.ctm = None + return + + def __repr__(self): + return '' + + def close(self): + return + + def set_ctm(self, ctm): + self.ctm = ctm + return + + def begin_tag(self, tag, props=None): + return + def end_tag(self): + return + def do_tag(self, tag, props=None): + return + + def begin_page(self, page): + return + def end_page(self, page): + return + def begin_figure(self, name, bbox): + return + def end_figure(self, name): + return + + def render_string(self, textstate, textmatrix, seq): + raise NotImplementedError + def render_image(self, stream, size, matrix): + raise NotImplementedError + + ## PageItem ## class PageItem(object): diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 85bb638..59071ef 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -3,10 +3,10 @@ import sys stdout = sys.stdout stderr = sys.stderr from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect -from pdflib.pdfinterp import PDFDevice, PDFResourceManager, \ - PDFPageInterpreter, PDFUnicodeNotDefined +from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdflib.pdffont import PDFUnicodeNotDefined from pdflib.cmap import CMapDB -from pdflib.page import PageItem, FigureItem, TextItem, PageAggregator +from pdflib.page import PDFDevice, PageItem, FigureItem, TextItem, PageAggregator def enc(x, codec): diff --git a/pdflib/pdfcolor.py b/pdflib/pdfcolor.py new file mode 100644 index 0000000..cbdd076 --- /dev/null +++ b/pdflib/pdfcolor.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +import sys +stderr = sys.stderr +from pdflib.psparser import PSLiteralTable + + +## ColorSpace +## +LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') +LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') +LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') + +class ColorSpace(object): + + def __init__(self, name, ncomponents): + self.name = name + self.ncomponents = ncomponents + return + + def __repr__(self): + return '' % (self.name, self.ncomponents) + + +PREDEFINED_COLORSPACE = dict( + (name, ColorSpace(name,n)) for (name,n) in { + 'CalRGB': 3, + 'CalGray': 1, + 'Lab': 3, + 'DeviceRGB': 3, + 'DeviceCMYK': 4, + 'DeviceGray': 1, + 'Separation': 1, + 'Indexed': 1, + 'Pattern': 1, + }.iteritems()) diff --git a/pdflib/pdffont.py b/pdflib/pdffont.py new file mode 100644 index 0000000..3dd0089 --- /dev/null +++ b/pdflib/pdffont.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python +import sys +stderr = sys.stderr +from struct import pack, unpack +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO +from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \ + literal_name, keyword_name, STRICT +from pdflib.pdftypes import PDFException, \ + resolve1, int_value, float_value, num_value, \ + str_value, list_value, dict_value, stream_value +from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB + + +## Fonts +## + +class PDFFontError(PDFException): pass +class PDFUnicodeNotDefined(PDFFontError): pass + +LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') + + +# PDFFont +class PDFFont(object): + + def __init__(self, descriptor, widths, default_width=None, font_matrix=None): + self.descriptor = descriptor + self.widths = widths + self.fontname = descriptor.get('FontName', 'unknown') + if isinstance(self.fontname, PSLiteral): + self.fontname = literal_name(self.fontname) + self.ascent = num_value(descriptor.get('Ascent', 0)) + self.descent = num_value(descriptor.get('Descent', 0)) + self.default_width = default_width or descriptor.get('MissingWidth', 0) + self.leading = num_value(descriptor.get('Leading', 0)) + self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) + self.font_matrix = font_matrix or (.001,0,0,.001,0,0) + return + + def __repr__(self): + return '' + + def is_vertical(self): + return False + + def is_multibyte(self): + return False + + def decode(self, bytes): + return map(ord, bytes) + + def char_width(self, cid): + return self.widths.get(cid, self.default_width) + + def char_disp(self, cid): + return 0 + + def string_width(self, s): + return sum( self.char_width(cid) for cid in self.decode(s) ) + +# PDFSimpleFont +class PDFSimpleFont(PDFFont): + + def __init__(self, descriptor, widths, spec, font_matrix=None): + # Font encoding is specified either by a name of + # built-in encoding or a dictionary that describes + # the differences. + if 'Encoding' in spec: + encoding = resolve1(spec['Encoding']) + else: + encoding = LITERAL_STANDARD_ENCODING + if isinstance(encoding, dict): + name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) + diff = list_value(encoding.get('Differences', None)) + self.encoding = EncodingDB.get_encoding(name, diff) + else: + self.encoding = EncodingDB.get_encoding(literal_name(encoding)) + self.ucs2_cmap = None + if 'ToUnicode' in spec: + strm = stream_value(spec['ToUnicode']) + self.ucs2_cmap = CMap() + CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() + PDFFont.__init__(self, descriptor, widths, font_matrix=font_matrix) + return + + def to_unicode(self, cid): + if not self.ucs2_cmap: + try: + return self.encoding[cid] + except KeyError: + raise PDFUnicodeNotDefined(None, cid) + code = self.ucs2_cmap.tocode(cid) + if not code: + raise PDFUnicodeNotDefined(None, cid) + chars = unpack('>%dH' % (len(code)/2), code) + return ''.join( unichr(c) for c in chars ) + + +# PDFType1Font +class PDFType1Font(PDFSimpleFont): + + def __init__(self, spec): + try: + self.basefont = literal_name(spec['BaseFont']) + except KeyError: + if STRICT: + raise PDFFontError('BaseFont is missing') + self.basefont = 'unknown' + try: + (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) + except KeyError: + descriptor = dict_value(spec.get('FontDescriptor', {})) + firstchar = int_value(spec.get('FirstChar', 0)) + lastchar = int_value(spec.get('LastChar', 255)) + widths = list_value(spec.get('Widths', [0]*256)) + widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) + PDFSimpleFont.__init__(self, descriptor, widths, spec) + return + + def __repr__(self): + return '' % self.basefont + +# PDFTrueTypeFont +class PDFTrueTypeFont(PDFType1Font): + + def __repr__(self): + return '' % self.basefont + +# PDFType3Font +class PDFType3Font(PDFSimpleFont): + + def __init__(self, spec): + firstchar = int_value(spec.get('FirstChar', 0)) + lastchar = int_value(spec.get('LastChar', 0)) + widths = list_value(spec.get('Widths', [0]*256)) + widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths)) + if 'FontDescriptor' in spec: + descriptor = dict_value(spec['FontDescriptor']) + else: + descriptor = {'FontName':spec.get('Name'), + 'Ascent':0, 'Descent':0, + 'FontBBox':spec['FontBBox']} + PDFSimpleFont.__init__(self, descriptor, widths, spec, + font_matrix=tuple(list_value(spec.get('FontMatrix')))) + return + + def __repr__(self): + return '' + + +# PDFCIDFont + +## TrueTypeFont +## +class TrueTypeFont(object): + + class CMapNotFound(Exception): pass + + def __init__(self, name, fp): + self.name = name + self.fp = fp + self.tables = {} + fonttype = fp.read(4) + (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) + for i in xrange(ntables): + (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) + self.tables[name] = (offset, length) + return + + def create_cmap(self): + if 'cmap' not in self.tables: + raise TrueTypeFont.CMapNotFound + (base_offset, length) = self.tables['cmap'] + fp = self.fp + fp.seek(base_offset) + (version, nsubtables) = unpack('>HH', fp.read(4)) + subtables = [] + for i in xrange(nsubtables): + subtables.append(unpack('>HHL', fp.read(8))) + char2gid = {} + # Only supports subtable type 0, 2 and 4. + for (_1, _2, st_offset) in subtables: + fp.seek(base_offset+st_offset) + (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6)) + if fmttype == 0: + char2gid.update(enumerate(unpack('>256B', fp.read(256)))) + elif fmttype == 2: + subheaderkeys = unpack('>256H', fp.read(512)) + firstbytes = [0]*8192 + for (i,k) in enumerate(subheaderkeys): + firstbytes[k/8] = i + nhdrs = max(subheaderkeys)/8 + 1 + hdrs = [] + for i in xrange(nhdrs): + (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8)) + hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) + for (i,firstcode,entcount,delta,pos) in hdrs: + if not entcount: continue + first = firstcode + (firstbytes[i] << 8) + fp.seek(pos) + for c in xrange(entcount): + gid = unpack('>H', fp.read(2)) + if gid: + gid += delta + char2gid[first+c] = gid + elif fmttype == 4: + (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8)) + segcount /= 2 + ecs = unpack('>%dH' % segcount, fp.read(2*segcount)) + fp.read(2) + scs = unpack('>%dH' % segcount, fp.read(2*segcount)) + idds = unpack('>%dh' % segcount, fp.read(2*segcount)) + pos = fp.tell() + idrs = unpack('>%dH' % segcount, fp.read(2*segcount)) + for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): + if idr: + fp.seek(pos+idr) + for c in xrange(sc, ec+1): + char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff + else: + for c in xrange(sc, ec+1): + char2gid[c] = (c + idd) & 0xffff + gid2char = dict( (gid, pack('>H', char)) + for (char,gid) in char2gid.iteritems() ) + cmapname = 'Adobe-Identity-UCS-%s' % self.name + return CMap(cmapname).update(char2gid, gid2char) + +class PDFCIDFont(PDFFont): + + def __init__(self, spec): + try: + self.basefont = literal_name(spec['BaseFont']) + except KeyError: + if STRICT: + raise PDFFontError('BaseFont is missing') + self.basefont = 'unknown' + self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) + self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), + self.cidsysteminfo.get('Ordering', 'unknown')) + try: + name = literal_name(spec['Encoding']) + except KeyError: + if STRICT: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' + try: + self.cmap = CMapDB.get_cmap(name, strict=STRICT) + except CMapDB.CMapNotFound, e: + raise PDFFontError(e) + try: + descriptor = dict_value(spec['FontDescriptor']) + except KeyError: + if STRICT: + raise PDFFontError('FontDescriptor is missing') + descriptor = {} + ttf = None + if 'FontFile2' in descriptor: + self.fontfile = stream_value(descriptor.get('FontFile2')) + ttf = TrueTypeFont(self.basefont, + StringIO(self.fontfile.get_data())) + self.ucs2_cmap = None + if 'ToUnicode' in spec: + strm = stream_value(spec['ToUnicode']) + self.ucs2_cmap = CMap() + CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() + elif self.cidcoding == 'Adobe-Identity': + if ttf: + try: + self.ucs2_cmap = ttf.create_cmap() + except TrueTypeFont.CMapNotFound: + pass + else: + try: + self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding, + strict=STRICT) + except CMapDB.CMapNotFound, e: + raise PDFFontError(e) + + def get_width(seq): + dic = {} + char1 = char2 = None + for v in seq: + if char1 == None: + char1 = v + elif char2 == None and isinstance(v, int): + char2 = v + else: + if char2 == None: + for (i,w) in enumerate(v): + dic[char1+i] = w + else: + for i in xrange(char1, char2+1): + dic[i] = v + char1 = char2 = None + return dic + self.vertical = self.cmap.is_vertical() + if self.vertical: + # writing mode: vertical + dic = get_width(list_value(spec.get('W2', []))) + widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() ) + self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() ) + (d,w) = spec.get('DW2', [880, -1000]) + default_width = w + self.default_disp = d + else: + # writing mode: horizontal + widths = get_width(list_value(spec.get('W', []))) + self.disps = {} + default_width = spec.get('DW', 1000) + self.default_disp = 0 + PDFFont.__init__(self, descriptor, widths, default_width=default_width) + return + + def __repr__(self): + return '' % (self.basefont, self.cidcoding) + + def is_vertical(self): + return self.vertical + + def is_multibyte(self): + return True + + def decode(self, bytes): + return self.cmap.decode(bytes) + + def char_disp(self, cid): + return self.disps.get(cid, self.default_disp) + + def to_unicode(self, cid): + if not self.ucs2_cmap: + raise PDFUnicodeNotDefined(self.cidcoding, cid) + code = self.ucs2_cmap.tocode(cid) + if not code: + raise PDFUnicodeNotDefined(self.cidcoding, cid) + chars = unpack('>%dH' % (len(code)/2), code) + return ''.join( unichr(c) for c in chars ) + + diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index 220743c..280c317 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -6,33 +6,22 @@ try: from cStringIO import StringIO except ImportError: from StringIO import StringIO -from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ - PSStackParser, PSLiteral, PSKeyword, STRICT, \ - PSLiteralTable, PSKeywordTable, literal_name, keyword_name -from pdflib.pdfparser import PDFException, PDFObject, PDFStream, PDFObjRef, resolve1, \ - int_value, float_value, num_value, \ +from pdflib.psparser import PSException, PSTypeError, PSEOF, \ + PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ + PSStackParser, PSKeyword, STRICT +from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \ + resolve1, int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value -from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB -from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix +from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, MATRIX_IDENTITY +from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont +from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ + LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK ## Exceptions ## class PDFResourceError(PDFException): pass class PDFInterpreterError(PDFException): pass -class PDFFontError(PDFException): pass -class PDFUnicodeNotDefined(PDFFontError): pass - - -## ColorSpace -## -class ColorSpace(object): - def __init__(self, name, ncomponents): - self.name = name - self.ncomponents = ncomponents - return - def __repr__(self): - return '' % (self.name, self.ncomponents) ## Constants @@ -42,344 +31,6 @@ LITERAL_TEXT = PSLiteralTable.intern('Text') LITERAL_FONT = PSLiteralTable.intern('Font') LITERAL_FORM = PSLiteralTable.intern('Form') LITERAL_IMAGE = PSLiteralTable.intern('Image') -LITERAL_STANDARD_ENCODING = PSLiteralTable.intern('StandardEncoding') -LITERAL_DEVICE_GRAY = PSLiteralTable.intern('DeviceGray') -LITERAL_DEVICE_RGB = PSLiteralTable.intern('DeviceRGB') -LITERAL_DEVICE_CMYK = PSLiteralTable.intern('DeviceCMYK') -KEYWORD_BI = PSKeywordTable.intern('BI') -KEYWORD_ID = PSKeywordTable.intern('ID') -KEYWORD_EI = PSKeywordTable.intern('EI') -MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) - -PREDEFINED_COLORSPACE = dict( - (name, ColorSpace(name,n)) for (name,n) in { - 'CalRGB': 3, - 'CalGray': 1, - 'Lab': 3, - 'DeviceRGB': 3, - 'DeviceCMYK': 4, - 'DeviceGray': 1, - 'Separation': 1, - 'Indexed': 1, - 'Pattern': 1, - }.iteritems()) - - -## Fonts -## - -# PDFFont -class PDFFont(object): - - def __init__(self, descriptor, widths, default_width=None): - self.descriptor = descriptor - self.widths = widths - self.fontname = descriptor.get('FontName', 'unknown') - if isinstance(self.fontname, PSLiteral): - self.fontname = literal_name(self.fontname) - self.ascent = num_value(descriptor.get('Ascent', 0)) - self.descent = num_value(descriptor.get('Descent', 0)) - self.default_width = default_width or descriptor.get('MissingWidth', 0) - self.leading = num_value(descriptor.get('Leading', 0)) - self.bbox = list_value(descriptor.get('FontBBox', (0,0,0,0))) - return - - def __repr__(self): - return '' - - def is_vertical(self): - return False - - def is_multibyte(self): - return False - - def decode(self, bytes): - return map(ord, bytes) - - def char_width(self, cid): - return self.widths.get(cid, self.default_width) - - def char_disp(self, cid): - return 0 - - def string_width(self, s): - return sum( self.char_width(cid) for cid in self.decode(s) ) - - -# PDFSimpleFont -class PDFSimpleFont(PDFFont): - - def __init__(self, descriptor, widths, spec): - # Font encoding is specified either by a name of - # built-in encoding or a dictionary that describes - # the differences. - if 'Encoding' in spec: - encoding = resolve1(spec['Encoding']) - else: - encoding = LITERAL_STANDARD_ENCODING - if isinstance(encoding, dict): - name = literal_name(encoding.get('BaseEncoding', LITERAL_STANDARD_ENCODING)) - diff = list_value(encoding.get('Differences', None)) - self.encoding = EncodingDB.get_encoding(name, diff) - else: - self.encoding = EncodingDB.get_encoding(literal_name(encoding)) - self.ucs2_cmap = None - if 'ToUnicode' in spec: - strm = stream_value(spec['ToUnicode']) - self.ucs2_cmap = CMap() - CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() - PDFFont.__init__(self, descriptor, widths) - return - - def to_unicode(self, cid): - if not self.ucs2_cmap: - try: - return self.encoding[cid] - except KeyError: - raise PDFUnicodeNotDefined(None, cid) - code = self.ucs2_cmap.tocode(cid) - if not code: - raise PDFUnicodeNotDefined(None, cid) - chars = unpack('>%dH' % (len(code)/2), code) - return ''.join( unichr(c) for c in chars ) - - -# PDFType1Font -class PDFType1Font(PDFSimpleFont): - - def __init__(self, spec): - try: - self.basefont = literal_name(spec['BaseFont']) - except KeyError: - if STRICT: - raise PDFFontError('BaseFont is missing') - self.basefont = 'unknown' - try: - (descriptor, widths) = FontMetricsDB.get_metrics(self.basefont) - except KeyError: - descriptor = dict_value(spec.get('FontDescriptor', {})) - firstchar = int_value(spec.get('FirstChar', 0)) - lastchar = int_value(spec.get('LastChar', 255)) - widths = list_value(spec.get('Widths', [0]*256)) - widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths) ) - PDFSimpleFont.__init__(self, descriptor, widths, spec) - return - - def __repr__(self): - return '' % self.basefont - -# PDFTrueTypeFont -class PDFTrueTypeFont(PDFType1Font): - - def __repr__(self): - return '' % self.basefont - -# PDFType3Font -class PDFType3Font(PDFSimpleFont): - def __init__(self, spec): - firstchar = int_value(spec.get('FirstChar', 0)) - lastchar = int_value(spec.get('LastChar', 0)) - widths = list_value(spec.get('Widths', [0]*256)) - widths = dict( (i+firstchar,w) for (i,w) in enumerate(widths)) - if 'FontDescriptor' in spec: - descriptor = dict_value(spec['FontDescriptor']) - else: - descriptor = {'FontName':spec.get('Name'), - 'Ascent':0, 'Descent':0, - 'FontBBox':spec['FontBBox']} - PDFSimpleFont.__init__(self, descriptor, widths, spec) - return - - def __repr__(self): - return '' - - -# PDFCIDFont - -## TrueTypeFont -## -class TrueTypeFont(object): - - class CMapNotFound(Exception): pass - - def __init__(self, name, fp): - self.name = name - self.fp = fp - self.tables = {} - fonttype = fp.read(4) - (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - for i in xrange(ntables): - (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) - self.tables[name] = (offset, length) - return - - def create_cmap(self): - if 'cmap' not in self.tables: - raise TrueTypeFont.CMapNotFound - (base_offset, length) = self.tables['cmap'] - fp = self.fp - fp.seek(base_offset) - (version, nsubtables) = unpack('>HH', fp.read(4)) - subtables = [] - for i in xrange(nsubtables): - subtables.append(unpack('>HHL', fp.read(8))) - char2gid = {} - # Only supports subtable type 0, 2 and 4. - for (_1, _2, st_offset) in subtables: - fp.seek(base_offset+st_offset) - (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6)) - if fmttype == 0: - char2gid.update(enumerate(unpack('>256B', fp.read(256)))) - elif fmttype == 2: - subheaderkeys = unpack('>256H', fp.read(512)) - firstbytes = [0]*8192 - for (i,k) in enumerate(subheaderkeys): - firstbytes[k/8] = i - nhdrs = max(subheaderkeys)/8 + 1 - hdrs = [] - for i in xrange(nhdrs): - (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8)) - hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) - for (i,firstcode,entcount,delta,pos) in hdrs: - if not entcount: continue - first = firstcode + (firstbytes[i] << 8) - fp.seek(pos) - for c in xrange(entcount): - gid = unpack('>H', fp.read(2)) - if gid: - gid += delta - char2gid[first+c] = gid - elif fmttype == 4: - (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - segcount /= 2 - ecs = unpack('>%dH' % segcount, fp.read(2*segcount)) - fp.read(2) - scs = unpack('>%dH' % segcount, fp.read(2*segcount)) - idds = unpack('>%dh' % segcount, fp.read(2*segcount)) - pos = fp.tell() - idrs = unpack('>%dH' % segcount, fp.read(2*segcount)) - for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): - if idr: - fp.seek(pos+idr) - for c in xrange(sc, ec+1): - char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff - else: - for c in xrange(sc, ec+1): - char2gid[c] = (c + idd) & 0xffff - gid2char = dict( (gid, pack('>H', char)) - for (char,gid) in char2gid.iteritems() ) - cmapname = 'Adobe-Identity-UCS-%s' % self.name - return CMap(cmapname).update(char2gid, gid2char) - -class PDFCIDFont(PDFFont): - - def __init__(self, spec): - try: - self.basefont = literal_name(spec['BaseFont']) - except KeyError: - if STRICT: - raise PDFFontError('BaseFont is missing') - self.basefont = 'unknown' - self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) - self.cidcoding = '%s-%s' % (self.cidsysteminfo.get('Registry', 'unknown'), - self.cidsysteminfo.get('Ordering', 'unknown')) - try: - name = literal_name(spec['Encoding']) - except KeyError: - if STRICT: - raise PDFFontError('Encoding is unspecified') - name = 'unknown' - try: - self.cmap = CMapDB.get_cmap(name, strict=STRICT) - except CMapDB.CMapNotFound, e: - raise PDFFontError(e) - try: - descriptor = dict_value(spec['FontDescriptor']) - except KeyError: - if STRICT: - raise PDFFontError('FontDescriptor is missing') - descriptor = {} - ttf = None - if 'FontFile2' in descriptor: - self.fontfile = stream_value(descriptor.get('FontFile2')) - ttf = TrueTypeFont(self.basefont, - StringIO(self.fontfile.get_data())) - self.ucs2_cmap = None - if 'ToUnicode' in spec: - strm = stream_value(spec['ToUnicode']) - self.ucs2_cmap = CMap() - CMapParser(self.ucs2_cmap, StringIO(strm.get_data())).run() - elif self.cidcoding == 'Adobe-Identity': - if ttf: - try: - self.ucs2_cmap = ttf.create_cmap() - except TrueTypeFont.CMapNotFound: - pass - else: - try: - self.ucs2_cmap = CMapDB.get_cmap('%s-UCS2' % self.cidcoding, - strict=STRICT) - except CMapDB.CMapNotFound, e: - raise PDFFontError(e) - - def get_width(seq): - dic = {} - char1 = char2 = None - for v in seq: - if char1 == None: - char1 = v - elif char2 == None and isinstance(v, int): - char2 = v - else: - if char2 == None: - for (i,w) in enumerate(v): - dic[char1+i] = w - else: - for i in xrange(char1, char2+1): - dic[i] = v - char1 = char2 = None - return dic - self.vertical = self.cmap.is_vertical() - if self.vertical: - # writing mode: vertical - dic = get_width(list_value(spec.get('W2', []))) - widths = dict( (cid,w) for (cid,(d,w)) in dic.iteritems() ) - self.disps = dict( (cid,d) for (cid,(d,w)) in dic.iteritems() ) - (d,w) = spec.get('DW2', [880, -1000]) - default_width = w - self.default_disp = d - else: - # writing mode: horizontal - widths = get_width(list_value(spec.get('W', []))) - self.disps = {} - default_width = spec.get('DW', 1000) - self.default_disp = 0 - PDFFont.__init__(self, descriptor, widths, default_width) - return - - def __repr__(self): - return '' % (self.basefont, self.cidcoding) - - def is_vertical(self): - return self.vertical - - def is_multibyte(self): - return True - - def decode(self, bytes): - return self.cmap.decode(bytes) - - def char_disp(self, cid): - return self.disps.get(cid, self.default_disp) - - def to_unicode(self, cid): - if not self.ucs2_cmap: - raise PDFUnicodeNotDefined(self.cidcoding, cid) - code = self.ucs2_cmap.tocode(cid) - if not code: - raise PDFUnicodeNotDefined(self.cidcoding, cid) - chars = unpack('>%dH' % (len(code)/2), code) - return ''.join( unichr(c) for c in chars ) ## Resource Manager @@ -388,7 +39,7 @@ class PDFResourceManager(object): ''' ResourceManager facilitates reuse of shared resources - such as fonts, images and cmaps so that large objects are not + such as fonts and images so that large objects are not allocated multiple times. ''' debug = 0 @@ -399,24 +50,21 @@ class PDFResourceManager(object): def get_procset(self, procs): for proc in procs: - if proc == LITERAL_PDF: + if proc is LITERAL_PDF: pass - elif proc == LITERAL_TEXT: + elif proc is LITERAL_TEXT: pass else: #raise PDFResourceError('ProcSet %r is not supported.' % proc) pass return - def get_cmap(self, name): - return CMapDB.get_cmap(name, strict=STRICT) - def get_font(self, objid, spec): if objid and objid in self.fonts: font = self.fonts[objid] else: if STRICT: - if spec['Type'] != LITERAL_FONT: + if spec['Type'] is not LITERAL_FONT: raise PDFFontError('Type is not /Font') # Create a Font object. if 'Subtype' in spec: @@ -455,49 +103,6 @@ class PDFResourceManager(object): return font -## PDFDevice -## -class PDFDevice(object): - - debug = 0 - - def __init__(self, rsrc): - self.rsrc = rsrc - self.ctm = None - return - - def __repr__(self): - return '' - - def close(self): - return - - def set_ctm(self, ctm): - self.ctm = ctm - return - - def begin_tag(self, tag, props=None): - return - def end_tag(self): - return - def do_tag(self, tag, props=None): - return - - def begin_page(self, page): - return - def end_page(self, page): - return - def begin_figure(self, name, bbox): - return - def end_figure(self, name): - return - - def render_string(self, textstate, textmatrix, seq): - raise NotImplementedError - def render_image(self, stream, size, matrix): - raise NotImplementedError - - ## PDFContentParser ## class PDFContentParser(PSStackParser): @@ -565,11 +170,14 @@ class PDFContentParser(PSStackParser): self.add_results(*self.popall()) return + KEYWORD_BI = PSKeywordTable.intern('BI') + KEYWORD_ID = PSKeywordTable.intern('ID') + KEYWORD_EI = PSKeywordTable.intern('EI') def do_keyword(self, pos, token): - if token == KEYWORD_BI: + if token is self.KEYWORD_BI: # inline image within a content stream self.start_type(pos, 'inline') - elif token == KEYWORD_ID: + elif token is self.KEYWORD_ID: try: (_, objs) = self.end_type('inline') if len(objs) % 2 != 0: @@ -578,7 +186,7 @@ class PDFContentParser(PSStackParser): (pos, data) = self.get_inline_data(pos+len('ID ')) obj = PDFStream(d, data) self.push((pos, obj)) - self.push((pos, KEYWORD_EI)) + self.push((pos, self.KEYWORD_EI)) except PSTypeError: if STRICT: raise else: @@ -975,7 +583,7 @@ class PDFPageInterpreter(object): if 1 <= self.debug: print >>stderr, 'Processing xobj: %r' % xobj subtype = xobj.dic.get('Subtype') - if subtype == LITERAL_FORM and 'BBox' in xobj.dic: + if subtype is LITERAL_FORM and 'BBox' in xobj.dic: interpreter = self.dup() (x0,y0,x1,y1) = list_value(xobj.dic['BBox']) ctm = mult_matrix(list_value(xobj.dic.get('Matrix', MATRIX_IDENTITY)), self.ctm) @@ -985,7 +593,7 @@ class PDFPageInterpreter(object): self.device.begin_figure(xobjid, bbox) interpreter.render_contents(dict_value(xobj.dic.get('Resources')), [xobj], ctm=ctm) self.device.end_figure(xobjid) - elif subtype == LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: + elif subtype is LITERAL_IMAGE and 'Width' in xobj.dic and 'Height' in xobj.dic: (x0,y0) = apply_matrix(self.ctm, (0,0)) (x1,y1) = apply_matrix(self.ctm, (1,1)) self.device.begin_figure(xobjid, (x0,y0,x1,y1)) diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 3319b21..85a0976 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -7,26 +7,22 @@ import sys, re import md5, struct stderr = sys.stderr -from utils import choplist, nunpack -from arcfour import Arcfour -from lzw import LZWDecoder -from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ - PSObject, PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ - literal_name, keyword_name, \ - PSStackParser, STRICT +from pdflib.utils import choplist, nunpack +from pdflib.arcfour import Arcfour +from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \ + PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ + STRICT +from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \ + PDFStream, PDFObjRef, resolve1, decipher_all, \ + int_value, float_value, num_value, str_value, list_value, dict_value, stream_value -## PDF Exceptions +## Exceptions ## -class PDFException(PSException): pass class PDFSyntaxError(PDFException): pass class PDFNoValidXRef(PDFSyntaxError): pass class PDFEncryptionError(PDFException): pass class PDFPasswordIncorrect(PDFEncryptionError): pass -class PDFTypeError(PDFException): pass -class PDFValueError(PDFException): pass -class PDFNotImplementedError(PSException): pass - # some predefined literals and keywords. LITERAL_OBJSTM = PSLiteralTable.intern('ObjStm') @@ -34,258 +30,10 @@ LITERAL_XREF = PSLiteralTable.intern('XRef') LITERAL_PAGE = PSLiteralTable.intern('Page') LITERAL_PAGES = PSLiteralTable.intern('Pages') LITERAL_CATALOG = PSLiteralTable.intern('Catalog') -LITERAL_CRYPT = PSLiteralTable.intern('Crypt') -LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), - PSLiteralTable.intern('Fl')) -LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), - PSLiteralTable.intern('LZW')) -LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), - PSLiteralTable.intern('A85')) -KEYWORD_R = PSKeywordTable.intern('R') -KEYWORD_OBJ = PSKeywordTable.intern('obj') -KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') -KEYWORD_STREAM = PSKeywordTable.intern('stream') -KEYWORD_XREF = PSKeywordTable.intern('xref') -KEYWORD_TRAILER = PSKeywordTable.intern('trailer') -KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') -PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' - -class PDFObject(PSObject): pass - - -## PDFObjRef -## -class PDFObjRef(PDFObject): - - def __init__(self, doc, objid, _): - if objid == 0: - if STRICT: - raise PDFValueError('PDF object id cannot be 0.') - self.doc = doc - self.objid = objid - #self.genno = genno # Never used. - return - - def __repr__(self): - return '' % (self.objid) - - def resolve(self): - return self.doc.getobj(self.objid) - - -# resolve -def resolve1(x): - ''' - Resolve an object. If this is an array or dictionary, - it may still contains some indirect objects inside. - ''' - while isinstance(x, PDFObjRef): - x = x.resolve() - return x - -def resolve_all(x): - ''' - Recursively resolve X and all the internals. - Make sure there is no indirect reference within the nested object. - This procedure might be slow. - ''' - while isinstance(x, PDFObjRef): - x = x.resolve() - if isinstance(x, list): - x = [ resolve_all(v) for v in x ] - elif isinstance(x, dict): - for (k,v) in x.iteritems(): - x[k] = resolve_all(v) - return x - -def decipher_all(decipher, objid, genno, x): - ''' - Recursively decipher X. - ''' - if isinstance(x, str): - return decipher(objid, genno, x) - if isinstance(x, list): - x = [ decipher_all(decipher, objid, genno, v) for v in x ] - elif isinstance(x, dict): - for (k,v) in x.iteritems(): - x[k] = decipher_all(decipher, objid, genno, v) - return x - -# Type cheking -def int_value(x): - x = resolve1(x) - if not isinstance(x, int): - if STRICT: - raise PDFTypeError('Integer required: %r' % x) - return 0 - return x - -def float_value(x): - x = resolve1(x) - if not isinstance(x, float): - if STRICT: - raise PDFTypeError('Float required: %r' % x) - return 0.0 - return x - -def num_value(x): - x = resolve1(x) - if not (isinstance(x, int) or isinstance(x, float)): - if STRICT: - raise PDFTypeError('Int or Float required: %r' % x) - return 0 - return x - -def str_value(x): - x = resolve1(x) - if not isinstance(x, str): - if STRICT: - raise PDFTypeError('String required: %r' % x) - return '' - return x - -def list_value(x): - x = resolve1(x) - if not (isinstance(x, list) or isinstance(x, tuple)): - if STRICT: - raise PDFTypeError('List required: %r' % x) - return [] - return x - -def dict_value(x): - x = resolve1(x) - if not isinstance(x, dict): - if STRICT: - raise PDFTypeError('Dict required: %r' % x) - return {} - return x - -def stream_value(x): - x = resolve1(x) - if not isinstance(x, PDFStream): - if STRICT: - raise PDFTypeError('PDFStream required: %r' % x) - return PDFStream({}, '') - return x - - -## PDFStream type -## -class PDFStream(PDFObject): - - def __init__(self, dic, rawdata, decipher=None): - self.dic = dic - self.rawdata = rawdata - self.decipher = decipher - self.data = None - self.objid = None - self.genno = None - return - - def set_objid(self, objid, genno): - self.objid = objid - self.genno = genno - return - - def __repr__(self): - return '' % (self.objid, len(self.rawdata), self.dic) - - def decode(self): - assert self.data == None and self.rawdata != None - data = self.rawdata - if self.decipher: - # Handle encryption - data = self.decipher(self.objid, self.genno, data) - if 'Filter' not in self.dic: - self.data = data - self.rawdata = None - return - filters = self.dic['Filter'] - if not isinstance(filters, list): - filters = [ filters ] - for f in filters: - if f in LITERALS_FLATE_DECODE: - import zlib - # will get errors if the document is encrypted. - data = zlib.decompress(data) - elif f in LITERALS_LZW_DECODE: - try: - from cStringIO import StringIO - except ImportError: - from StringIO import StringIO - data = ''.join(LZWDecoder(StringIO(data)).run()) - elif f in LITERALS_ASCII85_DECODE: - import ascii85 - data = ascii85.ascii85decode(data) - elif f == LITERAL_CRYPT: - raise PDFEncryptionError('/Crypt filter is unsupported') - else: - raise PDFNotImplementedError('Unsupported filter: %r' % f) - # apply predictors - params = self.dic.get('DecodeParms', {}) - if 'Predictor' in params: - pred = int_value(params['Predictor']) - if pred: - if pred != 12: - raise PDFNotImplementedError('Unsupported predictor: %r' % pred) - if 'Columns' not in params: - raise PDFValueError('Columns undefined for predictor=12') - columns = int_value(params['Columns']) - buf = '' - ent0 = '\x00' * columns - for i in xrange(0, len(data), columns+1): - pred = data[i] - ent1 = data[i+1:i+1+columns] - if pred == '\x02': - ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) - buf += ent1 - ent0 = ent1 - data = buf - self.data = data - self.rawdata = None - return - - def get_data(self): - if self.data == None: - self.decode() - return self.data - - def get_rawdata(self): - return self.rawdata - - -## PDFPage -## -class PDFPage(object): - - def __init__(self, doc, pageid, attrs): - self.doc = doc - self.pageid = pageid - self.attrs = dict_value(attrs) - self.lastmod = self.attrs.get('LastModified') - self.resources = resolve1(self.attrs['Resources']) - self.mediabox = resolve1(self.attrs['MediaBox']) - if 'CropBox' in self.attrs: - self.cropbox = resolve1(self.attrs['CropBox']) - else: - self.cropbox = self.mediabox - self.rotate = self.attrs.get('Rotate', 0) - self.annots = self.attrs.get('Annots') - self.beads = self.attrs.get('B') - if 'Contents' in self.attrs: - contents = resolve1(self.attrs['Contents']) - else: - contents = [] - if not isinstance(contents, list): - contents = [ contents ] - self.contents = contents - return - - def __repr__(self): - return '' % (self.resources, self.mediabox) ## XRefs +## ## PDFXRef ## @@ -296,7 +44,7 @@ class PDFXRef(object): return def objids(self): - return self.offsets.keys() + return self.offsets.iterkeys() def load(self, parser): while 1: @@ -330,10 +78,11 @@ class PDFXRef(object): self.load_trailer(parser) return + KEYWORD_TRAILER = PSKeywordTable.intern('trailer') def load_trailer(self, parser): try: (_,kwd) = parser.nexttoken() - assert kwd == KEYWORD_TRAILER + assert kwd is self.KEYWORD_TRAILER (_,dic) = parser.nextobject() except PSEOF: x = parser.pop(1) @@ -350,7 +99,7 @@ class PDFXRef(object): raise if use != 'n': if STRICT: - raise PDFValueError('Unused objid=%r' % objid) + raise PDFSyntaxError('Unused objid=%r' % objid) return (None, pos) @@ -367,14 +116,14 @@ class PDFXRefStream(object): return def objids(self): - return range(self.objid0, self.objid1+1) + return xrange(self.objid0, self.objid1) def load(self, parser): (_,objid) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored (_,kwd) = parser.nexttoken() (_,stream) = parser.nextobject() - if not isinstance(stream, PDFStream) or stream.dic['Type'] != LITERAL_XREF: + if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF: raise PDFNoValidXRef('Invalid PDF stream spec.') size = stream.dic['Size'] (start, nobjs) = stream.dic.get('Index', (0,size)) @@ -402,6 +151,37 @@ class PDFXRefStream(object): return (objid, index) +## PDFPage +## +class PDFPage(object): + + def __init__(self, doc, pageid, attrs): + self.doc = doc + self.pageid = pageid + self.attrs = dict_value(attrs) + self.lastmod = resolve1(self.attrs.get('LastModified')) + self.resources = resolve1(self.attrs['Resources']) + self.mediabox = resolve1(self.attrs['MediaBox']) + if 'CropBox' in self.attrs: + self.cropbox = resolve1(self.attrs['CropBox']) + else: + self.cropbox = self.mediabox + self.rotate = self.attrs.get('Rotate', 0) + self.annots = self.attrs.get('Annots') + self.beads = self.attrs.get('B') + if 'Contents' in self.attrs: + contents = resolve1(self.attrs['Contents']) + else: + contents = [] + if not isinstance(contents, list): + contents = [ contents ] + self.contents = contents + return + + def __repr__(self): + return '' % (self.resources, self.mediabox) + + ## PDFDocument ## ## A PDFDocument object represents a PDF document. @@ -463,15 +243,16 @@ class PDFDocument(object): def set_root(self, root): self.root = root self.catalog = dict_value(self.root) - if self.catalog.get('Type') != LITERAL_CATALOG: + if self.catalog.get('Type') is not LITERAL_CATALOG: if STRICT: - raise PDFValueError('Catalog not found!') + raise PDFSyntaxError('Catalog not found!') return # initialize(password='') # Perform the initialization with a given password. # This step is mandatory even if there's no password associated # with the document. + PASSWORD_PADDING = '(\xbfN^Nu\x8aAd\x00NV\xff\xfa\x01\x08..\x00\xb6\xd0h>\x80/\x0c\xa9\xfedSiz' def initialize(self, password=''): if not self.encryption: self.is_printable = self.is_modifiable = self.is_extractable = True @@ -494,7 +275,7 @@ class PDFDocument(object): self.is_modifiable = bool(P & 8) self.is_extractable = bool(P & 16) # Algorithm 3.2 - password = (password+PASSWORD_PADDING)[:32] # 1 + password = (password+self.PASSWORD_PADDING)[:32] # 1 hash = md5.md5(password) # 2 hash.update(O) # 3 hash.update(struct.pack('>stderr, 'Pages: Kids=%r' % tree['Kids'] for c in tree['Kids']: for x in search(c, tree): yield x - elif tree.get('Type') == LITERAL_PAGE: + elif tree.get('Type') is LITERAL_PAGE: if 1 <= self.debug: print >>stderr, 'Page: %r' % tree yield (obj.objid, tree) @@ -683,15 +465,20 @@ class PDFParser(PSStackParser): def __repr__(self): return '' + KEYWORD_R = PSKeywordTable.intern('R') + KEYWORD_ENDOBJ = PSKeywordTable.intern('endobj') + KEYWORD_STREAM = PSKeywordTable.intern('stream') + KEYWORD_XREF = PSKeywordTable.intern('xref') + KEYWORD_STARTXREF = PSKeywordTable.intern('startxref') def do_keyword(self, pos, token): - if token in (KEYWORD_XREF, KEYWORD_STARTXREF): + if token in (self.KEYWORD_XREF, self.KEYWORD_STARTXREF): self.add_results(*self.pop(1)) return - if token == KEYWORD_ENDOBJ: + if token is self.KEYWORD_ENDOBJ: self.add_results(*self.pop(4)) return - if token == KEYWORD_R: + if token is self.KEYWORD_R: # reference to indirect object try: ((_,objid), (_,genno)) = self.pop(2) @@ -702,7 +489,7 @@ class PDFParser(PSStackParser): pass return - if token == KEYWORD_STREAM: + if token is self.KEYWORD_STREAM: # stream object ((_,dic),) = self.pop(1) dic = dict_value(dic) @@ -710,7 +497,7 @@ class PDFParser(PSStackParser): objlen = int_value(dic['Length']) except KeyError: if STRICT: - raise PDFValueError('/Length is undefined: %r' % dic) + raise PDFSyntaxError('/Length is undefined: %r' % dic) objlen = 0 self.seek(pos) try: @@ -785,7 +572,7 @@ class PDFParser(PSStackParser): xref = PDFXRefStream() xref.load(self) else: - if token != KEYWORD_XREF: + if token is not self.KEYWORD_XREF: raise PDFNoValidXRef('xref not found: pos=%d, token=%r' % (pos, token)) self.nextline() @@ -835,6 +622,7 @@ class PDFParser(PSStackParser): yield xref return + ## PDFObjStrmParser ## class PDFObjStrmParser(PDFParser): diff --git a/pdflib/pdftypes.py b/pdflib/pdftypes.py new file mode 100644 index 0000000..c13a763 --- /dev/null +++ b/pdflib/pdftypes.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python +import sys, zlib +stderr = sys.stderr +from pdflib.lzw import LZWDecoder +from pdflib.psparser import PSException, PSObject, \ + PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ + literal_name, keyword_name, STRICT + +LITERAL_CRYPT = PSLiteralTable.intern('Crypt') +LITERALS_FLATE_DECODE = (PSLiteralTable.intern('FlateDecode'), PSLiteralTable.intern('Fl')) +LITERALS_LZW_DECODE = (PSLiteralTable.intern('LZWDecode'), PSLiteralTable.intern('LZW')) +LITERALS_ASCII85_DECODE = (PSLiteralTable.intern('ASCII85Decode'), PSLiteralTable.intern('A85')) + + +## PDF Objects +## +class PDFObject(PSObject): pass + +class PDFException(PSException): pass +class PDFTypeError(PDFException): pass +class PDFValueError(PDFException): pass +class PDFNotImplementedError(PSException): pass + + +## PDFObjRef +## +class PDFObjRef(PDFObject): + + def __init__(self, doc, objid, _): + if objid == 0: + if STRICT: + raise PDFValueError('PDF object id cannot be 0.') + self.doc = doc + self.objid = objid + #self.genno = genno # Never used. + return + + def __repr__(self): + return '' % (self.objid) + + def resolve(self): + return self.doc.getobj(self.objid) + + +# resolve +def resolve1(x): + ''' + Resolve an object. If this is an array or dictionary, + it may still contains some indirect objects inside. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + return x + +def resolve_all(x): + ''' + Recursively resolve X and all the internals. + Make sure there is no indirect reference within the nested object. + This procedure might be slow. + ''' + while isinstance(x, PDFObjRef): + x = x.resolve() + if isinstance(x, list): + x = [ resolve_all(v) for v in x ] + elif isinstance(x, dict): + for (k,v) in x.iteritems(): + x[k] = resolve_all(v) + return x + +def decipher_all(decipher, objid, genno, x): + ''' + Recursively decipher X. + ''' + if isinstance(x, str): + return decipher(objid, genno, x) + if isinstance(x, list): + x = [ decipher_all(decipher, objid, genno, v) for v in x ] + elif isinstance(x, dict): + for (k,v) in x.iteritems(): + x[k] = decipher_all(decipher, objid, genno, v) + return x + +# Type cheking +def int_value(x): + x = resolve1(x) + if not isinstance(x, int): + if STRICT: + raise PDFTypeError('Integer required: %r' % x) + return 0 + return x + +def float_value(x): + x = resolve1(x) + if not isinstance(x, float): + if STRICT: + raise PDFTypeError('Float required: %r' % x) + return 0.0 + return x + +def num_value(x): + x = resolve1(x) + if not (isinstance(x, int) or isinstance(x, float)): + if STRICT: + raise PDFTypeError('Int or Float required: %r' % x) + return 0 + return x + +def str_value(x): + x = resolve1(x) + if not isinstance(x, str): + if STRICT: + raise PDFTypeError('String required: %r' % x) + return '' + return x + +def list_value(x): + x = resolve1(x) + if not (isinstance(x, list) or isinstance(x, tuple)): + if STRICT: + raise PDFTypeError('List required: %r' % x) + return [] + return x + +def dict_value(x): + x = resolve1(x) + if not isinstance(x, dict): + if STRICT: + raise PDFTypeError('Dict required: %r' % x) + return {} + return x + +def stream_value(x): + x = resolve1(x) + if not isinstance(x, PDFStream): + if STRICT: + raise PDFTypeError('PDFStream required: %r' % x) + return PDFStream({}, '') + return x + + +## PDFStream type +## +class PDFStream(PDFObject): + + def __init__(self, dic, rawdata, decipher=None): + self.dic = dic + self.rawdata = rawdata + self.decipher = decipher + self.data = None + self.objid = None + self.genno = None + return + + def set_objid(self, objid, genno): + self.objid = objid + self.genno = genno + return + + def __repr__(self): + return '' % (self.objid, len(self.rawdata), self.dic) + + def decode(self): + assert self.data == None and self.rawdata != None + data = self.rawdata + if self.decipher: + # Handle encryption + data = self.decipher(self.objid, self.genno, data) + if 'Filter' not in self.dic: + self.data = data + self.rawdata = None + return + filters = self.dic['Filter'] + if not isinstance(filters, list): + filters = [ filters ] + for f in filters: + if f in LITERALS_FLATE_DECODE: + # will get errors if the document is encrypted. + data = zlib.decompress(data) + elif f in LITERALS_LZW_DECODE: + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO + data = ''.join(LZWDecoder(StringIO(data)).run()) + elif f in LITERALS_ASCII85_DECODE: + import ascii85 + data = ascii85.ascii85decode(data) + elif f == LITERAL_CRYPT: + raise PDFEncryptionError('/Crypt filter is unsupported') + else: + raise PDFNotImplementedError('Unsupported filter: %r' % f) + # apply predictors + params = self.dic.get('DecodeParms', {}) + if 'Predictor' in params: + pred = int_value(params['Predictor']) + if pred: + if pred != 12: + raise PDFNotImplementedError('Unsupported predictor: %r' % pred) + if 'Columns' not in params: + raise PDFValueError('Columns undefined for predictor=12') + columns = int_value(params['Columns']) + buf = '' + ent0 = '\x00' * columns + for i in xrange(0, len(data), columns+1): + pred = data[i] + ent1 = data[i+1:i+1+columns] + if pred == '\x02': + ent1 = ''.join( chr((ord(a)+ord(b)) & 255) for (a,b) in zip(ent0,ent1) ) + buf += ent1 + ent0 = ent1 + data = buf + self.data = data + self.rawdata = None + return + + def get_data(self): + if self.data == None: + self.decode() + return self.data + + def get_rawdata(self): + return self.rawdata diff --git a/pdflib/psparser.py b/pdflib/psparser.py index 2c5660d..05e8933 100644 --- a/pdflib/psparser.py +++ b/pdflib/psparser.py @@ -1,7 +1,8 @@ #!/usr/bin/env python import sys, re stderr = sys.stderr -from utils import choplist + +from pdflib.utils import choplist STRICT = 0 diff --git a/pdflib/utils.py b/pdflib/utils.py index 2bf0c7a..6d96a9d 100644 --- a/pdflib/utils.py +++ b/pdflib/utils.py @@ -4,6 +4,8 @@ from struct import unpack ## Matrix operations ## +MATRIX_IDENTITY = (1, 0, 0, 1, 0, 0) + def mult_matrix((a1,b1,c1,d1,e1,f1), (a0,b0,c0,d0,e0,f0)): '''Multiplies two matrices.''' return (a0*a1+c0*b1, b0*a1+d0*b1,