From 13a6603151e5a65919c0d58dd2817c4e20e6df14 Mon Sep 17 00:00:00 2001 From: "yusuke.shinyama.dummy" Date: Mon, 23 Feb 2009 14:00:38 +0000 Subject: [PATCH] tmp commit git-svn-id: https://pdfminer.googlecode.com/svn/trunk/pdfminer@70 1aa58f4a-7d42-0410-adbc-911cccaed67c --- pdflib/cmap.py | 4 +- pdflib/pdf2txt.py | 12 +- pdflib/pdfcolor.py | 2 +- pdflib/pdfdevice.py | 4 +- pdflib/pdffont.py | 273 +++++++++++++++++++++++++++++--------------- pdflib/pdfinterp.py | 24 ++-- pdflib/pdfparser.py | 80 +++++++++---- pdflib/pdftypes.py | 4 +- pdflib/psparser.py | 2 +- 9 files changed, 267 insertions(+), 138 deletions(-) diff --git a/pdflib/cmap.py b/pdflib/cmap.py index 41d0f7b..bcdd9e9 100644 --- a/pdflib/cmap.py +++ b/pdflib/cmap.py @@ -2,8 +2,8 @@ import sys stderr = sys.stderr from struct import pack, unpack -from pdflib.utils import choplist, nunpack -from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ +from utils import choplist, nunpack +from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \ PSLiteral, PSKeyword, literal_name, keyword_name, \ PSStackParser try: diff --git a/pdflib/pdf2txt.py b/pdflib/pdf2txt.py index 478423e..59f7916 100755 --- a/pdflib/pdf2txt.py +++ b/pdflib/pdf2txt.py @@ -2,11 +2,11 @@ import sys stdout = sys.stdout stderr = sys.stderr -from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect -from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdflib.pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator -from pdflib.pdffont import PDFUnicodeNotDefined -from pdflib.cmap import CMapDB +from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect +from pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator +from pdffont import PDFUnicodeNotDefined +from cmap import CMapDB def enc(x, codec): @@ -121,7 +121,7 @@ class TagExtractor(PDFDevice): def render_image(self, stream, size, 
matrix): return - def render_string(self, textstate, textmatrix, size, seq): + def render_string(self, textstate, textmatrix, seq): font = textstate.font text = '' for x in seq: diff --git a/pdflib/pdfcolor.py b/pdflib/pdfcolor.py index cbdd076..72b770a 100644 --- a/pdflib/pdfcolor.py +++ b/pdflib/pdfcolor.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import sys stderr = sys.stderr -from pdflib.psparser import PSLiteralTable +from psparser import PSLiteralTable ## ColorSpace diff --git a/pdflib/pdfdevice.py b/pdflib/pdfdevice.py index 3a8d154..0a8a9ea 100644 --- a/pdflib/pdfdevice.py +++ b/pdflib/pdfdevice.py @@ -2,8 +2,8 @@ import sys stdout = sys.stdout stderr = sys.stderr -from pdflib.pdffont import PDFUnicodeNotDefined -from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix +from pdffont import PDFUnicodeNotDefined +from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix ## PDFDevice diff --git a/pdflib/pdffont.py b/pdflib/pdffont.py index bb311f8..5735f05 100644 --- a/pdflib/pdffont.py +++ b/pdflib/pdffont.py @@ -6,13 +6,175 @@ try: from cStringIO import StringIO except ImportError: from StringIO import StringIO -from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \ +from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \ literal_name, keyword_name, STRICT -from pdflib.pdftypes import PDFException, \ +from pdftypes import PDFException, \ resolve1, int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value -from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB -from utils import apply_matrix_norm +from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB +from utils import apply_matrix_norm, nunpack + + +NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-') +def getnum(fp): + b0 = ord(fp.read(1)) + if b0 == 30: + s = '' + loop = True + while loop: + b = ord(fp.read(1)) + for n in (b >> 4, b & 15): + if 
n == 15: + loop = False + else: + s += NIBBLES[n] + return float(s) + if 32 <= b0 and b0 <= 246: + return b0-139 + b1 = ord(fp.read(1)) + if 247 <= b0 and b0 <= 250: + return ((b0-247)<<8)+b1+108 + if 251 <= b0 and b0 <= 254: + return -((b0-251)<<8)-b1-108 + b2 = ord(fp.read(1)) + if 128 <= b1: b1 -= 256 + if b0 == 28: + return b1<<8 | b2 + return b1<<24 | b2<<16 | unpack('>H',fp.read(2))[0] +#assert getnum(StringIO('\x8b')) == 0 +#assert getnum(StringIO('\xef')) == 100 +#assert getnum(StringIO('\x27')) == -100 +#assert getnum(StringIO('\xfa\x7c')) == 1000 +#assert getnum(StringIO('\xfe\x7c')) == -1000 +#assert getnum(StringIO('\x1c\x27\x10')) == 10000 +#assert getnum(StringIO('\x1c\xd8\xf0')) == -10000 +#assert getnum(StringIO('\x1d\x00\x01\x86\xa0')) == 100000 +#assert getnum(StringIO('\x1d\xff\xfe\x79\x60')) == -100000 +#assert getnum(StringIO('\x1e\xe2\xa2\x5f')) == -2.25 +#assert getnum(StringIO('\x1e\x0a\x14\x05\x41\xc3\xff')) == 0.140541e-3 + + +## CFFFont +## (Format specified in Adobe Technical Note: #5176 +## "The Compact Font Format Specification") +## +class CFFFont(object): + + class INDEX(object): + + def __init__(self, fp): + self.fp = fp + self.offsets = [] + (count, offsize) = unpack('>HB', self.fp.read(3)) + for i in xrange(count+1): + self.offsets.append(nunpack(self.fp.read(offsize))) + self.base = self.fp.tell()-1 + self.fp.seek(self.base+self.offsets[-1]) + return + + def __repr__(self): + return '' % len(self) + + def __len__(self): + return len(self.offsets)-1 + + def __getitem__(self, i): + self.fp.seek(self.base+self.offsets[i]) + return self.fp.read(self.offsets[i+1]-self.offsets[i]) + + def __init__(self, name, fp): + self.name = name + self.fp = fp + # Header + (_major,_minor,hdrsize,self.offsize) = unpack('BBBB', fp.read(4)) + self.fp.read(hdrsize-4) + # Name INDEX + self.name_index = self.INDEX(self.fp) + # Top DICT INDEX + self.dict_index = self.INDEX(self.fp) + # String INDEX + self.string_index = self.INDEX(self.fp) + # Global Subr INDEX + 
self.subr_index = self.INDEX(self.fp) + # Encodings + # Charsets + return + + + +## TrueTypeFont +## +class TrueTypeFont(object): + + class CMapNotFound(Exception): pass + + def __init__(self, name, fp): + self.name = name + self.fp = fp + self.tables = {} + fonttype = fp.read(4) + (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) + for i in xrange(ntables): + (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) + self.tables[name] = (offset, length) + return + + def create_cmap(self): + if 'cmap' not in self.tables: + raise TrueTypeFont.CMapNotFound + (base_offset, length) = self.tables['cmap'] + fp = self.fp + fp.seek(base_offset) + (version, nsubtables) = unpack('>HH', fp.read(4)) + subtables = [] + for i in xrange(nsubtables): + subtables.append(unpack('>HHL', fp.read(8))) + char2gid = {} + # Only supports subtable type 0, 2 and 4. + for (_1, _2, st_offset) in subtables: + fp.seek(base_offset+st_offset) + (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6)) + if fmttype == 0: + char2gid.update(enumerate(unpack('>256B', fp.read(256)))) + elif fmttype == 2: + subheaderkeys = unpack('>256H', fp.read(512)) + firstbytes = [0]*8192 + for (i,k) in enumerate(subheaderkeys): + firstbytes[k/8] = i + nhdrs = max(subheaderkeys)/8 + 1 + hdrs = [] + for i in xrange(nhdrs): + (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8)) + hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) + for (i,firstcode,entcount,delta,pos) in hdrs: + if not entcount: continue + first = firstcode + (firstbytes[i] << 8) + fp.seek(pos) + for c in xrange(entcount): + gid = unpack('>H', fp.read(2)) + if gid: + gid += delta + char2gid[first+c] = gid + elif fmttype == 4: + (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8)) + segcount /= 2 + ecs = unpack('>%dH' % segcount, fp.read(2*segcount)) + fp.read(2) + scs = unpack('>%dH' % segcount, fp.read(2*segcount)) + idds = unpack('>%dh' % segcount, fp.read(2*segcount)) + pos = fp.tell() + idrs = unpack('>%dH' % segcount, 
fp.read(2*segcount)) + for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): + if idr: + fp.seek(pos+idr) + for c in xrange(sc, ec+1): + char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff + else: + for c in xrange(sc, ec+1): + char2gid[c] = (c + idd) & 0xffff + gid2char = dict( (gid, pack('>H', char)) + for (char,gid) in char2gid.iteritems() ) + return CMap().update(char2gid, gid2char) ## Fonts @@ -96,17 +258,15 @@ class PDFSimpleFont(PDFFont): return def to_unicode(self, cid): - if not self.ucs2_cmap: - try: - return self.encoding[cid] - except KeyError: - raise PDFUnicodeNotDefined(None, cid) - code = self.ucs2_cmap.tocode(cid) - if not code: + if self.ucs2_cmap: + code = self.ucs2_cmap.tocode(cid) + if code: + chars = unpack('>%dH' % (len(code)/2), code) + return ''.join( unichr(c) for c in chars ) + try: + return self.encoding[cid] + except KeyError: raise PDFUnicodeNotDefined(None, cid) - chars = unpack('>%dH' % (len(code)/2), code) - return ''.join( unichr(c) for c in chars ) - # PDFType1Font class PDFType1Font(PDFSimpleFont): @@ -171,81 +331,6 @@ class PDFType3Font(PDFSimpleFont): # PDFCIDFont - -## TrueTypeFont -## -class TrueTypeFont(object): - - class CMapNotFound(Exception): pass - - def __init__(self, name, fp): - self.name = name - self.fp = fp - self.tables = {} - fonttype = fp.read(4) - (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - for i in xrange(ntables): - (name, tsum, offset, length) = unpack('>4sLLL', fp.read(16)) - self.tables[name] = (offset, length) - return - - def create_cmap(self): - if 'cmap' not in self.tables: - raise TrueTypeFont.CMapNotFound - (base_offset, length) = self.tables['cmap'] - fp = self.fp - fp.seek(base_offset) - (version, nsubtables) = unpack('>HH', fp.read(4)) - subtables = [] - for i in xrange(nsubtables): - subtables.append(unpack('>HHL', fp.read(8))) - char2gid = {} - # Only supports subtable type 0, 2 and 4. 
- for (_1, _2, st_offset) in subtables: - fp.seek(base_offset+st_offset) - (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6)) - if fmttype == 0: - char2gid.update(enumerate(unpack('>256B', fp.read(256)))) - elif fmttype == 2: - subheaderkeys = unpack('>256H', fp.read(512)) - firstbytes = [0]*8192 - for (i,k) in enumerate(subheaderkeys): - firstbytes[k/8] = i - nhdrs = max(subheaderkeys)/8 + 1 - hdrs = [] - for i in xrange(nhdrs): - (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8)) - hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset)) - for (i,firstcode,entcount,delta,pos) in hdrs: - if not entcount: continue - first = firstcode + (firstbytes[i] << 8) - fp.seek(pos) - for c in xrange(entcount): - gid = unpack('>H', fp.read(2)) - if gid: - gid += delta - char2gid[first+c] = gid - elif fmttype == 4: - (segcount, _1, _2, _3) = unpack('>HHHH', fp.read(8)) - segcount /= 2 - ecs = unpack('>%dH' % segcount, fp.read(2*segcount)) - fp.read(2) - scs = unpack('>%dH' % segcount, fp.read(2*segcount)) - idds = unpack('>%dh' % segcount, fp.read(2*segcount)) - pos = fp.tell() - idrs = unpack('>%dH' % segcount, fp.read(2*segcount)) - for (ec,sc,idd,idr) in zip(ecs, scs, idds, idrs): - if idr: - fp.seek(pos+idr) - for c in xrange(sc, ec+1): - char2gid[c] = (unpack('>H', fp.read(2))[0] + idd) & 0xffff - else: - for c in xrange(sc, ec+1): - char2gid[c] = (c + idd) & 0xffff - gid2char = dict( (gid, pack('>H', char)) - for (char,gid) in char2gid.iteritems() ) - return CMap().update(char2gid, gid2char) - class PDFCIDFont(PDFFont): def __init__(self, rsrc, spec): @@ -358,3 +443,13 @@ class PDFCIDFont(PDFFont): def space_width(self): return 0 + + +# main +def main(argv): + for fname in argv[1:]: + fp = file(fname, 'rb') + CFFFont(fname, fp) + fp.close() + return +if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdflib/pdfinterp.py b/pdflib/pdfinterp.py index e0ba9aa..9681c38 100644 --- a/pdflib/pdfinterp.py +++ b/pdflib/pdfinterp.py @@ -6,17 +6,17 
@@ try: from cStringIO import StringIO except ImportError: from StringIO import StringIO -from pdflib.psparser import PSException, PSTypeError, PSEOF, \ +from psparser import PSException, PSTypeError, PSEOF, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ PSStackParser, PSKeyword, STRICT -from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \ +from pdftypes import PDFException, PDFStream, PDFObjRef, \ resolve1, int_value, float_value, num_value, \ str_value, list_value, dict_value, stream_value -from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY -from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont -from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ +from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY +from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont +from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \ LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK -from pdflib.cmap import CMapDB +from cmap import CMapDB ## Exceptions @@ -391,27 +391,27 @@ class PDFPageInterpreter(object): return # setgray-stroking def do_G(self, gray): - self.do_CS(LITERAL_DEVICE_GRAY) + #self.do_CS(LITERAL_DEVICE_GRAY) return # setgray-non-stroking def do_g(self, gray): - self.do_cs(LITERAL_DEVICE_GRAY) + #self.do_cs(LITERAL_DEVICE_GRAY) return # setrgb-stroking def do_RG(self, r, g, b): - self.do_CS(LITERAL_DEVICE_RGB) + #self.do_CS(LITERAL_DEVICE_RGB) return # setrgb-non-stroking def do_rg(self, r, g, b): - self.do_cs(LITERAL_DEVICE_RGB) + #self.do_cs(LITERAL_DEVICE_RGB) return # setcmyk-stroking def do_K(self, c, m, y, k): - self.do_CS(LITERAL_DEVICE_CMYK) + #self.do_CS(LITERAL_DEVICE_CMYK) return # setcmyk-non-stroking def do_k(self, c, m, y, k): - self.do_cs(LITERAL_DEVICE_CMYK) + #self.do_cs(LITERAL_DEVICE_CMYK) return # setcolor 
diff --git a/pdflib/pdfparser.py b/pdflib/pdfparser.py index 97c79ae..a4156d2 100755 --- a/pdflib/pdfparser.py +++ b/pdflib/pdfparser.py @@ -7,12 +7,12 @@ import sys, re import md5, struct stderr = sys.stderr -from pdflib.utils import choplist, nunpack, decode_text -from pdflib.arcfour import Arcfour -from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \ +from utils import choplist, nunpack, decode_text +from arcfour import Arcfour +from psparser import PSStackParser, PSSyntaxError, PSEOF, \ PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \ STRICT -from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \ +from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \ PDFStream, PDFObjRef, resolve1, decipher_all, \ int_value, float_value, num_value, str_value, list_value, dict_value, stream_value @@ -34,23 +34,50 @@ LITERAL_CATALOG = PSLiteralTable.intern('Catalog') ## XRefs ## +class XRefObjRange(object): + def __init__(self, start, nobjs): + self.start = start + self.nobjs = nobjs + return + + def __repr__(self): + return '' % (self.get_start_id(), self.get_end_id()) + + def get_start_id(self): + return self.start + + def get_end_id(self): + return self.start + self.nobjs - 1 + + def get_nobjs(self): + return self.nobjs + +class PDFBaseXRef(object): + def __init__(self): + self.objid_ranges = None + self.objid_list = None + return + + def objids(self): + for objid_range in self.objid_ranges: + for objid in xrange(objid_range.get_start_id(), objid_range.get_end_id() + 1): + yield objid + return ## PDFXRef ## -class PDFXRef(object): - +class PDFXRef(PDFBaseXRef): def __init__(self): + PDFBaseXRef.__init__(self) self.offsets = None return def __repr__(self): return '' % len(self.offsets) - def objids(self): - return self.offsets.iterkeys() - def load(self, parser, debug=0): self.offsets = {} + self.objid_ranges = [] while 1: try: (pos, line) = parser.nextline() @@ -68,6 +95,8 @@ class PDFXRef(object): 
(start, nobjs) = map(long, f) except ValueError: raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line)) + self.newoffsets = {} + self.objid_ranges.append(XRefObjRange(start, nobjs)) for objid in xrange(start, start+nobjs): try: (_, line) = parser.nextline() @@ -108,11 +137,10 @@ class PDFXRef(object): ## PDFXRefStream ## -class PDFXRefStream(object): +class PDFXRefStream(PDFBaseXRef): def __init__(self): - self.objid_first = None - self.objid_last = None + PDFBaseXRef.__init__(self) self.data = None self.entlen = None self.fl1 = self.fl2 = self.fl3 = None @@ -121,9 +149,6 @@ class PDFXRefStream(object): def __repr__(self): return '' % (self.objid_first, self.objid_last) - def objids(self): - return xrange(self.objid_first, self.objid_last+1) - def load(self, parser, debug=0): (_,objid) = parser.nexttoken() # ignored (_,genno) = parser.nexttoken() # ignored @@ -132,22 +157,31 @@ class PDFXRefStream(object): if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF: raise PDFNoValidXRef('Invalid PDF stream spec.') size = stream.dic['Size'] - (start, nobjs) = stream.dic.get('Index', (0,size)) - self.objid_first = start - self.objid_last = start+nobjs-1 + index_array = stream.dic.get('Index', (0,size)) + if len(index_array) % 2 != 0: + raise PDFSyntaxError('Invalid index number') + self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ] (self.fl1, self.fl2, self.fl3) = stream.dic['W'] self.data = stream.get_data() self.entlen = self.fl1+self.fl2+self.fl3 self.trailer = stream.dic if debug: - print >>stderr, ('xref stream: objid=%d-%d, fields=%d,%d,%d' % - (self.objid_first, self.objid_last, self.fl1, self.fl2, self.fl3)) + print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' % + (', '.join(map(repr, self.objid_ranges), self.fl1, self.fl2, self.fl3))) return def getpos(self, objid): - if objid < self.objid_first or self.objid_last < objid: - raise KeyError(objid) - i = self.entlen * 
(objid-self.objid_first) + offset = 0 + found = False + for objid_range in self.objid_ranges: + if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id(): + offset += objid - objid_range.get_start_id() + found = True + break + else: + offset += objid_range.get_nobjs() + if not found: raise KeyError(objid) + i = self.entlen * offset ent = self.data[i:i+self.entlen] f1 = nunpack(ent[:self.fl1], 1) if f1 == 1: diff --git a/pdflib/pdftypes.py b/pdflib/pdftypes.py index f090c5c..571caf4 100644 --- a/pdflib/pdftypes.py +++ b/pdflib/pdftypes.py @@ -1,8 +1,8 @@ #!/usr/bin/env python import sys, zlib stderr = sys.stderr -from pdflib.lzw import LZWDecoder -from pdflib.psparser import PSException, PSObject, \ +from lzw import LZWDecoder +from psparser import PSException, PSObject, \ PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \ literal_name, keyword_name, STRICT diff --git a/pdflib/psparser.py b/pdflib/psparser.py index 05e8933..e8999bc 100644 --- a/pdflib/psparser.py +++ b/pdflib/psparser.py @@ -2,7 +2,7 @@ import sys, re stderr = sys.stderr -from pdflib.utils import choplist +from utils import choplist STRICT = 0