Merge pull request #1 from Cybjit/master

Python 3 text conversion issues
pull/4/head
Philippe Guglielmetti 2014-09-09 20:42:37 +02:00
commit 7b620b3146
6 changed files with 31 additions and 26 deletions

View File

@ -37,3 +37,4 @@ class Arcfour(object):
encrypt = decrypt = process encrypt = decrypt = process
new = Arcfour

View File

@ -180,11 +180,11 @@ class FileUnicodeMap(UnicodeMap):
if isinstance(code, PSLiteral): if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name. # Interpret as an Adobe glyph name.
self.cid2unichr[cid] = name2unicode(code.name) self.cid2unichr[cid] = name2unicode(code.name)
elif isinstance(code, str): elif isinstance(code, bytes):
# Interpret as UTF-16BE. # Interpret as UTF-16BE.
self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore') self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore')
elif isinstance(code, int): elif isinstance(code, int):
self.cid2unichr[cid] = unichr(code) self.cid2unichr[cid] = six.unichr(code)
else: else:
raise TypeError(code) raise TypeError(code)
return return
@ -358,7 +358,7 @@ class CMapParser(PSStackParser):
e1 = nunpack(evar) e1 = nunpack(evar)
vlen = len(svar) vlen = len(svar)
#assert s1 <= e1 #assert s1 <= e1
for i in xrange(e1-s1+1): for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:] x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i) self.cmap.add_code2cid(x, cid+i)
return return
@ -379,21 +379,21 @@ class CMapParser(PSStackParser):
if token is self.KEYWORD_ENDBFRANGE: if token is self.KEYWORD_ENDBFRANGE:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (s, e, code) in choplist(3, objs): for (s, e, code) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or if (not isinstance(s, bytes) or not isinstance(e, bytes) or
len(s) != len(e)): len(s) != len(e)):
continue continue
s1 = nunpack(s) s1 = nunpack(s)
e1 = nunpack(e) e1 = nunpack(e)
#assert s1 <= e1 #assert s1 <= e1
if isinstance(code, list): if isinstance(code, list):
for i in xrange(e1-s1+1): for i in range(e1-s1+1):
self.cmap.add_cid2unichr(s1+i, code[i]) self.cmap.add_cid2unichr(s1+i, code[i])
else: else:
var = code[-4:] var = code[-4:]
base = nunpack(var) base = nunpack(var)
prefix = code[:-4] prefix = code[:-4]
vlen = len(var) vlen = len(var)
for i in xrange(e1-s1+1): for i in range(e1-s1+1):
x = prefix+struct.pack('>L', base+i)[-vlen:] x = prefix+struct.pack('>L', base+i)[-vlen:]
self.cmap.add_cid2unichr(s1+i, x) self.cmap.add_cid2unichr(s1+i, x)
return return
@ -404,7 +404,7 @@ class CMapParser(PSStackParser):
if token is self.KEYWORD_ENDBFCHAR: if token is self.KEYWORD_ENDBFCHAR:
objs = [obj for (__, obj) in self.popall()] objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs): for (cid, code) in choplist(2, objs):
if isinstance(cid, str) and isinstance(code, str): if isinstance(cid, bytes) and isinstance(code, bytes):
self.cmap.add_cid2unichr(nunpack(cid), code) self.cmap.add_cid2unichr(nunpack(cid), code)
return return

View File

@ -99,7 +99,7 @@ class PDFLayoutAnalyzer(PDFTextDevice):
# other shapes # other shapes
pts = [] pts = []
for p in path: for p in path:
for i in xrange(1, len(p), 2): for i in range(1, len(p), 2):
pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
self.cur_item.add(LTCurve(gstate.linewidth, pts)) self.cur_item.add(LTCurve(gstate.linewidth, pts))
return return
@ -164,7 +164,9 @@ class TextConverter(PDFConverter):
return return
def write_text(self, text): def write_text(self, text):
self.outfp.write(text.encode(self.codec, 'ignore')) if self.codec:
text = text.encode(self.codec, 'ignore')
self.outfp.write(text)
return return
def receive_layout(self, ltpage): def receive_layout(self, ltpage):
@ -252,7 +254,7 @@ class HTMLConverter(PDFConverter):
def write_footer(self): def write_footer(self):
self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' % self.write('<div style="position:absolute; top:0px;">Page: %s</div>\n' %
', '.join('<a href="#%s">%s</a>' % (i, i) for i in xrange(1, self.pageno))) ', '.join('<a href="#%s">%s</a>' % (i, i) for i in range(1, self.pageno)))
self.write('</body></html>\n') self.write('</body></html>\n')
return return

View File

@ -342,7 +342,7 @@ class PDFStandardSecurityHandler(object):
hash.update(self.docid[0]) # 3 hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4 result = ARC4.new(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5 for i in range(1, 20): # 5
k = b''.join(chr(ord(c) ^ i) for c in key) k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key))
result = ARC4.new(k).encrypt(result) result = ARC4.new(k).encrypt(result)
result += result # 6 result += result # 6
return result return result

View File

@ -45,7 +45,7 @@ def get_widths(seq):
r.append(v) r.append(v)
if len(r) == 3: if len(r) == 3:
(char1, char2, w) = r (char1, char2, w) = r
for i in xrange(char1, char2+1): for i in range(char1, char2+1):
widths[i] = w widths[i] = w
r = [] r = []
return widths return widths
@ -68,7 +68,7 @@ def get_widths2(seq):
r.append(v) r.append(v)
if len(r) == 5: if len(r) == 5:
(char1, char2, w, vx, vy) = r (char1, char2, w, vx, vy) = r
for i in xrange(char1, char2+1): for i in range(char1, char2+1):
widths[i] = (w, (vx, vy)) widths[i] = (w, (vx, vy))
r = [] r = []
return widths return widths
@ -266,7 +266,7 @@ class CFFFont(object):
self.fp = fp self.fp = fp
self.offsets = [] self.offsets = []
(count, offsize) = struct.unpack('>HB', self.fp.read(3)) (count, offsize) = struct.unpack('>HB', self.fp.read(3))
for i in xrange(count+1): for i in range(count+1):
self.offsets.append(nunpack(self.fp.read(offsize))) self.offsets.append(nunpack(self.fp.read(offsize)))
self.base = self.fp.tell()-1 self.base = self.fp.tell()-1
self.fp.seek(self.base+self.offsets[-1]) self.fp.seek(self.base+self.offsets[-1])
@ -283,7 +283,7 @@ class CFFFont(object):
return self.fp.read(self.offsets[i+1]-self.offsets[i]) return self.fp.read(self.offsets[i+1]-self.offsets[i])
def __iter__(self): def __iter__(self):
return iter(self[i] for i in xrange(len(self))) return iter(self[i] for i in range(len(self)))
def __init__(self, name, fp): def __init__(self, name, fp):
self.name = name self.name = name
@ -323,9 +323,9 @@ class CFFFont(object):
# Format 1 # Format 1
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
code = 0 code = 0
for i in xrange(n): for i in range(n):
(first, nleft) = struct.unpack('BB', self.fp.read(2)) (first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first, first+nleft+1): for gid in range(first, first+nleft+1):
self.code2gid[code] = gid self.code2gid[code] = gid
self.gid2code[gid] = code self.gid2code[gid] = code
code += 1 code += 1
@ -348,9 +348,9 @@ class CFFFont(object):
# Format 1 # Format 1
(n,) = struct.unpack('B', self.fp.read(1)) (n,) = struct.unpack('B', self.fp.read(1))
sid = 0 sid = 0
for i in xrange(n): for i in range(n):
(first, nleft) = struct.unpack('BB', self.fp.read(2)) (first, nleft) = struct.unpack('BB', self.fp.read(2))
for gid in xrange(first, first+nleft+1): for gid in range(first, first+nleft+1):
name = self.getstr(sid) name = self.getstr(sid)
self.name2gid[name] = gid self.name2gid[name] = gid
self.gid2name[gid] = name self.gid2name[gid] = name
@ -384,7 +384,7 @@ class TrueTypeFont(object):
self.tables = {} self.tables = {}
self.fonttype = fp.read(4) self.fonttype = fp.read(4)
(ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8)) (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8))
for _ in xrange(ntables): for _ in range(ntables):
(name, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16)) (name, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16))
self.tables[name] = (offset, length) self.tables[name] = (offset, length)
return return
@ -397,7 +397,7 @@ class TrueTypeFont(object):
fp.seek(base_offset) fp.seek(base_offset)
(version, nsubtables) = struct.unpack('>HH', fp.read(4)) (version, nsubtables) = struct.unpack('>HH', fp.read(4))
subtables = [] subtables = []
for i in xrange(nsubtables): for i in range(nsubtables):
subtables.append(struct.unpack('>HHL', fp.read(8))) subtables.append(struct.unpack('>HHL', fp.read(8)))
char2gid = {} char2gid = {}
# Only supports subtable type 0, 2 and 4. # Only supports subtable type 0, 2 and 4.
@ -413,7 +413,7 @@ class TrueTypeFont(object):
firstbytes[k//8] = i firstbytes[k//8] = i
nhdrs = max(subheaderkeys)//8 + 1 nhdrs = max(subheaderkeys)//8 + 1
hdrs = [] hdrs = []
for i in xrange(nhdrs): for i in range(nhdrs):
(firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8)) (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8))
hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset)) hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset))
for (i, firstcode, entcount, delta, pos) in hdrs: for (i, firstcode, entcount, delta, pos) in hdrs:
@ -421,7 +421,7 @@ class TrueTypeFont(object):
continue continue
first = firstcode + (firstbytes[i] << 8) first = firstcode + (firstbytes[i] << 8)
fp.seek(pos) fp.seek(pos)
for c in xrange(entcount): for c in range(entcount):
gid = struct.unpack('>H', fp.read(2)) gid = struct.unpack('>H', fp.read(2))
if gid: if gid:
gid += delta gid += delta
@ -438,10 +438,10 @@ class TrueTypeFont(object):
for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs): for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs):
if idr: if idr:
fp.seek(pos+idr) fp.seek(pos+idr)
for c in xrange(sc, ec+1): for c in range(sc, ec+1):
char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff
else: else:
for c in xrange(sc, ec+1): for c in range(sc, ec+1):
char2gid[c] = (c + idd) & 0xffff char2gid[c] = (c + idd) & 0xffff
else: else:
assert 0 assert 0

View File

@ -85,6 +85,8 @@ def main(argv):
outfp = open(outfile, 'wb') outfp = open(outfile, 'wb')
else: else:
outfp = sys.stdout outfp = sys.stdout
if outfp.encoding is not None:
codec = None
if outtype == 'text': if outtype == 'text':
device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
imagewriter=imagewriter) imagewriter=imagewriter)