tmp commit

git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@70 1aa58f4a-7d42-0410-adbc-911cccaed67c
pull/1/head
yusuke.shinyama.dummy 2009-02-23 14:00:38 +00:00
parent 2694de9521
commit 13a6603151
9 changed files with 267 additions and 138 deletions

View File

@ -2,8 +2,8 @@
import sys
stderr = sys.stderr
from struct import pack, unpack
from pdflib.utils import choplist, nunpack
from pdflib.psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
from utils import choplist, nunpack
from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF, \
PSLiteral, PSKeyword, literal_name, keyword_name, \
PSStackParser
try:

View File

@ -2,11 +2,11 @@
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdflib.pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdflib.pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
from pdflib.pdffont import PDFUnicodeNotDefined
from pdflib.cmap import CMapDB
from pdfparser import PDFDocument, PDFParser, PDFPasswordIncorrect
from pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfdevice import PDFDevice, FigureItem, TextItem, PDFPageAggregator
from pdffont import PDFUnicodeNotDefined
from cmap import CMapDB
def enc(x, codec):
@ -121,7 +121,7 @@ class TagExtractor(PDFDevice):
def render_image(self, stream, size, matrix):
return
def render_string(self, textstate, textmatrix, size, seq):
def render_string(self, textstate, textmatrix, seq):
font = textstate.font
text = ''
for x in seq:

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python
import sys
stderr = sys.stderr
from pdflib.psparser import PSLiteralTable
from psparser import PSLiteralTable
## ColorSpace

View File

@ -2,8 +2,8 @@
import sys
stdout = sys.stdout
stderr = sys.stderr
from pdflib.pdffont import PDFUnicodeNotDefined
from pdflib.utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
from pdffont import PDFUnicodeNotDefined
from utils import mult_matrix, apply_matrix, apply_matrix_norm, translate_matrix
## PDFDevice

View File

@ -6,13 +6,175 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from pdflib.psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
from psparser import PSLiteralTable, PSKeywordTable, PSLiteral, \
literal_name, keyword_name, STRICT
from pdflib.pdftypes import PDFException, \
from pdftypes import PDFException, \
resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from pdflib.cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import apply_matrix_norm
from cmap import CMap, CMapDB, CMapParser, FontMetricsDB, EncodingDB
from utils import apply_matrix_norm, nunpack
# Text for each nibble of a packed real number; index 13 (0xd) is reserved
# and 15 (0xf, not in the table) terminates the number.
NIBBLES = ('0','1','2','3','4','5','6','7','8','9','.','e','e-',None,'-')

def getnum(fp):
    """Read one encoded number from the binary file object *fp* and return it.

    The first byte selects the encoding:
      30        real number: a run of packed nibbles ended by a 0xf nibble
      32..246   one-byte integer (b0 - 139)
      247..250  two-byte positive integer
      251..254  two-byte negative integer
      28        three-byte form holding a signed 16-bit integer
      other     five-byte form holding a signed 32-bit integer

    Returns an int, or a float for the real-number form.
    Raises ValueError when a real number contains the reserved nibble 0xd
    or has no digits at all (the latter comes from float()).
    """
    b0 = ord(fp.read(1))
    if b0 == 30:
        parts = []
        done = False
        while not done:
            b = ord(fp.read(1))
            for n in (b >> 4, b & 15):
                if n == 15:
                    # Terminator: anything after it is padding, not a digit.
                    done = True
                    break
                text = NIBBLES[n]
                if text is None:
                    raise ValueError('reserved nibble 0x%x in real number' % n)
                parts.append(text)
        return float(''.join(parts))
    if 32 <= b0 and b0 <= 246:
        return b0-139
    b1 = ord(fp.read(1))
    if 247 <= b0 and b0 <= 250:
        return ((b0-247)<<8)+b1+108
    if 251 <= b0 and b0 <= 254:
        return -((b0-251)<<8)-b1-108
    b2 = ord(fp.read(1))
    # The leading payload byte carries the sign of the 16/32-bit forms.
    if 128 <= b1: b1 -= 256
    if b0 == 28:
        return b1<<8 | b2
    return b1<<24 | b2<<16 | unpack('>H',fp.read(2))[0]
#assert getnum(StringIO('\x8b')) == 0
#assert getnum(StringIO('\xef')) == 100
#assert getnum(StringIO('\x27')) == -100
#assert getnum(StringIO('\xfa\x7c')) == 1000
#assert getnum(StringIO('\xfe\x7c')) == -1000
#assert getnum(StringIO('\x1c\x27\x10')) == 10000
#assert getnum(StringIO('\x1c\xd8\xf0')) == -10000
#assert getnum(StringIO('\x1d\x00\x01\x86\xa0')) == 100000
#assert getnum(StringIO('\x1d\xff\xfe\x79\x60')) == -100000
#assert getnum(StringIO('\x1e\xe2\xa2\x5f')) == -2.25
#assert getnum(StringIO('\x1e\x0a\x14\x05\x41\xc3\xff')) == 0.140541e-3
## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
class CFFFont(object):
    """Locate the leading structures of a CFF font read from a file object.

    The constructor consumes the header and then the Name, Top DICT, String
    and Global Subr INDEXes, in that order.  Encodings and charsets are not
    parsed.

    name: label stored on the instance as ``self.name``.
    fp:   seekable binary file object positioned at the start of the font.
    """

    class INDEX(object):
        """Random-access view of one INDEX structure starting at fp's position.

        After construction fp is positioned just past the INDEX.  Items are
        fetched lazily by seeking fp, so indexing moves the file position.
        """

        def __init__(self, fp):
            self.fp = fp
            self.offsets = []
            (count,) = unpack('>H', self.fp.read(2))
            if count == 0:
                # An empty INDEX consists of the 2-byte count only: there is
                # no offset-size byte and no offset array to read.
                self.base = self.fp.tell()-1
                return
            (offsize,) = unpack('>B', self.fp.read(1))
            for _ in range(count+1):
                self.offsets.append(nunpack(self.fp.read(offsize)))
            # Stored offsets are 1-based, hence the base one byte before the
            # first byte of object data.
            self.base = self.fp.tell()-1
            # Skip the object data so the next structure can be read.
            self.fp.seek(self.base+self.offsets[-1])
            return

        def __repr__(self):
            return '<INDEX: size=%d>' % len(self)

        def __len__(self):
            # count+1 offsets are stored for count items; none when empty.
            return max(len(self.offsets)-1, 0)

        def __getitem__(self, i):
            """Return the raw bytes of item *i*; raises IndexError if absent."""
            size = len(self)
            if i < 0:
                i += size
            if not (0 <= i and i < size):
                raise IndexError(i)
            self.fp.seek(self.base+self.offsets[i])
            return self.fp.read(self.offsets[i+1]-self.offsets[i])

    def __init__(self, name, fp):
        self.name = name
        self.fp = fp
        # Header
        (_major,_minor,hdrsize,self.offsize) = unpack('BBBB', fp.read(4))
        # Skip any header bytes beyond the four just read.  A bogus hdrsize
        # below 4 must not turn into read(<negative>), which would consume
        # the rest of the file.
        if 4 < hdrsize:
            self.fp.read(hdrsize-4)
        # Name INDEX
        self.name_index = self.INDEX(self.fp)
        # Top DICT INDEX
        self.dict_index = self.INDEX(self.fp)
        # String INDEX
        self.string_index = self.INDEX(self.fp)
        # Global Subr INDEX
        self.subr_index = self.INDEX(self.fp)
        # Encodings
        # Charsets
        return
## TrueTypeFont
##
class TrueTypeFont(object):
    """Read the table directory of a TrueType font and build its cmap.

    name: label stored on the instance as ``self.name``.
    fp:   seekable binary file object positioned at the start of the font.

    ``self.tables`` maps each 4-byte table tag to an (offset, length) pair.
    """

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        self.name = name
        self.fp = fp
        self.tables = {}
        # 4-byte font type tag; read only to advance past it.
        fp.read(4)
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for _ in range(ntables):
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        """Build character-code <-> glyph-id mappings from the 'cmap' table.

        Subtable formats 0, 2 and 4 are decoded; others are ignored.
        Returns the result of ``CMap().update(char2gid, gid2char)``, where
        char2gid maps character codes to glyph ids and gid2char maps glyph
        ids to the big-endian 2-byte packed character code.
        Raises TrueTypeFont.CMapNotFound when the font has no 'cmap' table.
        """
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for _ in range(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                # Each key is a sub-header index times 8; remember which
                # first byte selects each sub-header.
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # offset counts from the position of the offset field
                    # itself, i.e. two bytes before the current position.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        # unpack() returns a 1-tuple; take the value itself.
                        (gid,) = unpack('>H', fp.read(2))
                        if gid:
                            # Zero means "missing glyph" and stays zero;
                            # otherwise the delta is applied modulo 65536.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _sr, _es, _rs) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (i,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # A range offset is measured from the location of
                        # its own slot in the array, not from the array start.
                        fp.seek(pos+2*i+idr)
                        for c in range(sc, ec+1):
                            (gid,) = unpack('>H', fp.read(2))
                            if gid:
                                # Zero means "missing glyph"; no delta then.
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.items() )
        return CMap().update(char2gid, gid2char)
## Fonts
@ -96,17 +258,15 @@ class PDFSimpleFont(PDFFont):
return
def to_unicode(self, cid):
if not self.ucs2_cmap:
if self.ucs2_cmap:
code = self.ucs2_cmap.tocode(cid)
if code:
chars = unpack('>%dH' % (len(code)/2), code)
return ''.join( unichr(c) for c in chars )
try:
return self.encoding[cid]
except KeyError:
raise PDFUnicodeNotDefined(None, cid)
code = self.ucs2_cmap.tocode(cid)
if not code:
raise PDFUnicodeNotDefined(None, cid)
chars = unpack('>%dH' % (len(code)/2), code)
return ''.join( unichr(c) for c in chars )
# PDFType1Font
class PDFType1Font(PDFSimpleFont):
@ -171,81 +331,6 @@ class PDFType3Font(PDFSimpleFont):
# PDFCIDFont
## TrueTypeFont
##
class TrueTypeFont(object):
    """Read the table directory of a TrueType font and build its cmap.

    name: label stored on the instance as ``self.name``.
    fp:   seekable binary file object positioned at the start of the font.

    ``self.tables`` maps each 4-byte table tag to an (offset, length) pair.
    """

    class CMapNotFound(Exception): pass

    def __init__(self, name, fp):
        self.name = name
        self.fp = fp
        self.tables = {}
        # 4-byte font type tag; read only to advance past it.
        fp.read(4)
        (ntables, _1, _2, _3) = unpack('>HHHH', fp.read(8))
        for _ in range(ntables):
            (tag, tsum, offset, length) = unpack('>4sLLL', fp.read(16))
            self.tables[tag] = (offset, length)
        return

    def create_cmap(self):
        """Build character-code <-> glyph-id mappings from the 'cmap' table.

        Subtable formats 0, 2 and 4 are decoded; others are ignored.
        Returns the result of ``CMap().update(char2gid, gid2char)``, where
        char2gid maps character codes to glyph ids and gid2char maps glyph
        ids to the big-endian 2-byte packed character code.
        Raises TrueTypeFont.CMapNotFound when the font has no 'cmap' table.
        """
        if 'cmap' not in self.tables:
            raise TrueTypeFont.CMapNotFound
        (base_offset, length) = self.tables['cmap']
        fp = self.fp
        fp.seek(base_offset)
        (version, nsubtables) = unpack('>HH', fp.read(4))
        subtables = []
        for _ in range(nsubtables):
            subtables.append(unpack('>HHL', fp.read(8)))
        char2gid = {}
        # Only supports subtable type 0, 2 and 4.
        for (_1, _2, st_offset) in subtables:
            fp.seek(base_offset+st_offset)
            (fmttype, fmtlen, fmtlang) = unpack('>HHH', fp.read(6))
            if fmttype == 0:
                char2gid.update(enumerate(unpack('>256B', fp.read(256))))
            elif fmttype == 2:
                subheaderkeys = unpack('>256H', fp.read(512))
                # Each key is a sub-header index times 8; remember which
                # first byte selects each sub-header.
                firstbytes = [0]*8192
                for (i,k) in enumerate(subheaderkeys):
                    firstbytes[k//8] = i
                nhdrs = max(subheaderkeys)//8 + 1
                hdrs = []
                for i in range(nhdrs):
                    (firstcode,entcount,delta,offset) = unpack('>HHhH', fp.read(8))
                    # offset counts from the position of the offset field
                    # itself, i.e. two bytes before the current position.
                    hdrs.append((i,firstcode,entcount,delta,fp.tell()-2+offset))
                for (i,firstcode,entcount,delta,pos) in hdrs:
                    if not entcount: continue
                    first = firstcode + (firstbytes[i] << 8)
                    fp.seek(pos)
                    for c in range(entcount):
                        # unpack() returns a 1-tuple; take the value itself.
                        (gid,) = unpack('>H', fp.read(2))
                        if gid:
                            # Zero means "missing glyph" and stays zero;
                            # otherwise the delta is applied modulo 65536.
                            gid = (gid + delta) & 0xffff
                        char2gid[first+c] = gid
            elif fmttype == 4:
                (segcount, _sr, _es, _rs) = unpack('>HHHH', fp.read(8))
                segcount //= 2
                ecs = unpack('>%dH' % segcount, fp.read(2*segcount))
                fp.read(2)
                scs = unpack('>%dH' % segcount, fp.read(2*segcount))
                idds = unpack('>%dh' % segcount, fp.read(2*segcount))
                pos = fp.tell()
                idrs = unpack('>%dH' % segcount, fp.read(2*segcount))
                for (i,(ec,sc,idd,idr)) in enumerate(zip(ecs, scs, idds, idrs)):
                    if idr:
                        # A range offset is measured from the location of
                        # its own slot in the array, not from the array start.
                        fp.seek(pos+2*i+idr)
                        for c in range(sc, ec+1):
                            (gid,) = unpack('>H', fp.read(2))
                            if gid:
                                # Zero means "missing glyph"; no delta then.
                                gid = (gid + idd) & 0xffff
                            char2gid[c] = gid
                    else:
                        for c in range(sc, ec+1):
                            char2gid[c] = (c + idd) & 0xffff
        gid2char = dict( (gid, pack('>H', char))
                         for (char,gid) in char2gid.items() )
        return CMap().update(char2gid, gid2char)
class PDFCIDFont(PDFFont):
def __init__(self, rsrc, spec):
@ -358,3 +443,13 @@ class PDFCIDFont(PDFFont):
def space_width(self):
return 0
# main
def main(argv):
    """Open every file named in argv[1:] and parse it with CFFFont.

    argv: command-line style list; argv[0] is skipped.
    Returns None.  Errors from opening or parsing a file propagate.
    """
    for fname in argv[1:]:
        fp = open(fname, 'rb')
        try:
            CFFFont(fname, fp)
        finally:
            # Release the handle even when parsing fails.
            fp.close()
    return
if __name__ == '__main__': sys.exit(main(sys.argv))

View File

@ -6,17 +6,17 @@ try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
from pdflib.psparser import PSException, PSTypeError, PSEOF, \
from psparser import PSException, PSTypeError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
PSStackParser, PSKeyword, STRICT
from pdflib.pdftypes import PDFException, PDFStream, PDFObjRef, \
from pdftypes import PDFException, PDFStream, PDFObjRef, \
resolve1, int_value, float_value, num_value, \
str_value, list_value, dict_value, stream_value
from pdflib.utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
from pdflib.pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdflib.pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
from utils import choplist, mult_matrix, translate_matrix, apply_matrix, apply_matrix_norm, MATRIX_IDENTITY
from pdffont import PDFFontError, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
from pdfcolor import ColorSpace, PREDEFINED_COLORSPACE, \
LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB, LITERAL_DEVICE_CMYK
from pdflib.cmap import CMapDB
from cmap import CMapDB
## Exceptions
@ -391,27 +391,27 @@ class PDFPageInterpreter(object):
return
# setgray-stroking
def do_G(self, gray):
self.do_CS(LITERAL_DEVICE_GRAY)
#self.do_CS(LITERAL_DEVICE_GRAY)
return
# setgray-non-stroking
def do_g(self, gray):
self.do_cs(LITERAL_DEVICE_GRAY)
#self.do_cs(LITERAL_DEVICE_GRAY)
return
# setrgb-stroking
def do_RG(self, r, g, b):
self.do_CS(LITERAL_DEVICE_RGB)
#self.do_CS(LITERAL_DEVICE_RGB)
return
# setrgb-non-stroking
def do_rg(self, r, g, b):
self.do_cs(LITERAL_DEVICE_RGB)
#self.do_cs(LITERAL_DEVICE_RGB)
return
# setcmyk-stroking
def do_K(self, c, m, y, k):
self.do_CS(LITERAL_DEVICE_CMYK)
#self.do_CS(LITERAL_DEVICE_CMYK)
return
# setcmyk-non-stroking
def do_k(self, c, m, y, k):
self.do_cs(LITERAL_DEVICE_CMYK)
#self.do_cs(LITERAL_DEVICE_CMYK)
return
# setcolor

View File

@ -7,12 +7,12 @@
import sys, re
import md5, struct
stderr = sys.stderr
from pdflib.utils import choplist, nunpack, decode_text
from pdflib.arcfour import Arcfour
from pdflib.psparser import PSStackParser, PSSyntaxError, PSEOF, \
from utils import choplist, nunpack, decode_text
from arcfour import Arcfour
from psparser import PSStackParser, PSSyntaxError, PSEOF, \
PSLiteralTable, PSKeywordTable, literal_name, keyword_name, \
STRICT
from pdflib.pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
from pdftypes import PDFException, PDFTypeError, PDFNotImplementedError, \
PDFStream, PDFObjRef, resolve1, decipher_all, \
int_value, float_value, num_value, str_value, list_value, dict_value, stream_value
@ -34,23 +34,50 @@ LITERAL_CATALOG = PSLiteralTable.intern('Catalog')
## XRefs
##
class XRefObjRange(object):
    """A contiguous run of ``nobjs`` object ids beginning at ``start``."""

    def __init__(self, start, nobjs):
        (self.start, self.nobjs) = (start, nobjs)

    def __repr__(self):
        first = self.get_start_id()
        last = self.get_end_id()
        return '<XRefObjRange: %d-%d>' % (first, last)

    def get_start_id(self):
        """Return the first object id of the run."""
        return self.start

    def get_end_id(self):
        """Return the last object id of the run (inclusive)."""
        return self.start - 1 + self.nobjs

    def get_nobjs(self):
        """Return how many object ids the run contains."""
        return self.nobjs
class PDFBaseXRef(object):
    """Base for cross-reference holders that track their ids as ranges.

    ``objid_ranges`` starts as None and is expected to be replaced by a
    sequence of range objects offering get_start_id() and get_end_id().
    """

    def __init__(self):
        self.objid_list = None
        self.objid_ranges = None

    def objids(self):
        """Yield every object id of every range, in stored range order."""
        for span in self.objid_ranges:
            objid = span.get_start_id()
            last = span.get_end_id()
            # Both ends of a range are inclusive.
            while objid <= last:
                yield objid
                objid += 1
## PDFXRef
##
class PDFXRef(object):
class PDFXRef(PDFBaseXRef):
def __init__(self):
PDFBaseXRef.__init__(self)
self.offsets = None
return
def __repr__(self):
return '<PDFXRef: objs=%d>' % len(self.offsets)
def objids(self):
return self.offsets.iterkeys()
def load(self, parser, debug=0):
self.offsets = {}
self.objid_ranges = []
while 1:
try:
(pos, line) = parser.nextline()
@ -68,6 +95,8 @@ class PDFXRef(object):
(start, nobjs) = map(long, f)
except ValueError:
raise PDFNoValidXRef('Invalid line: %r: line=%r' % (parser, line))
self.newoffsets = {}
self.objid_ranges.append(XRefObjRange(start, nobjs))
for objid in xrange(start, start+nobjs):
try:
(_, line) = parser.nextline()
@ -108,11 +137,10 @@ class PDFXRef(object):
## PDFXRefStream
##
class PDFXRefStream(object):
class PDFXRefStream(PDFBaseXRef):
def __init__(self):
self.objid_first = None
self.objid_last = None
PDFBaseXRef.__init__(self)
self.data = None
self.entlen = None
self.fl1 = self.fl2 = self.fl3 = None
@ -121,9 +149,6 @@ class PDFXRefStream(object):
def __repr__(self):
return '<PDFXRef: objid=%d-%d>' % (self.objid_first, self.objid_last)
def objids(self):
return xrange(self.objid_first, self.objid_last+1)
def load(self, parser, debug=0):
(_,objid) = parser.nexttoken() # ignored
(_,genno) = parser.nexttoken() # ignored
@ -132,22 +157,31 @@ class PDFXRefStream(object):
if not isinstance(stream, PDFStream) or stream.dic['Type'] is not LITERAL_XREF:
raise PDFNoValidXRef('Invalid PDF stream spec.')
size = stream.dic['Size']
(start, nobjs) = stream.dic.get('Index', (0,size))
self.objid_first = start
self.objid_last = start+nobjs-1
index_array = stream.dic.get('Index', (0,size))
if len(index_array) % 2 != 0:
raise PDFSyntaxError('Invalid index number')
self.objid_ranges = [ XRefObjRange(start,nobjs) for (start,nobjs) in choplist(2, index_array) ]
(self.fl1, self.fl2, self.fl3) = stream.dic['W']
self.data = stream.get_data()
self.entlen = self.fl1+self.fl2+self.fl3
self.trailer = stream.dic
if debug:
print >>stderr, ('xref stream: objid=%d-%d, fields=%d,%d,%d' %
(self.objid_first, self.objid_last, self.fl1, self.fl2, self.fl3))
# join() takes exactly one iterable; the three field widths are separate
# arguments of the format string, not of join().
print >>stderr, ('xref stream: objid=%s, fields=%d,%d,%d' %
                 (', '.join(map(repr, self.objid_ranges)), self.fl1, self.fl2, self.fl3))
return
def getpos(self, objid):
if objid < self.objid_first or self.objid_last < objid:
raise KeyError(objid)
i = self.entlen * (objid-self.objid_first)
offset = 0
found = False
for objid_range in self.objid_ranges:
if objid >= objid_range.get_start_id() and objid <= objid_range.get_end_id():
offset += objid - objid_range.get_start_id()
found = True
break
else:
offset += objid_range.get_nobjs()
if not found: raise KeyError(objid)
i = self.entlen * offset
ent = self.data[i:i+self.entlen]
f1 = nunpack(ent[:self.fl1], 1)
if f1 == 1:

View File

@ -1,8 +1,8 @@
#!/usr/bin/env python
import sys, zlib
stderr = sys.stderr
from pdflib.lzw import LZWDecoder
from pdflib.psparser import PSException, PSObject, \
from lzw import LZWDecoder
from psparser import PSException, PSObject, \
PSLiteral, PSKeyword, PSLiteralTable, PSKeywordTable, \
literal_name, keyword_name, STRICT

View File

@ -2,7 +2,7 @@
import sys, re
stderr = sys.stderr
from pdflib.utils import choplist
from utils import choplist
STRICT = 0