From 0a2d90c0510dfb9de6ed9dbb5de01d34aaef97ee Mon Sep 17 00:00:00 2001 From: cybjit Date: Sun, 7 Sep 2014 18:34:11 +0200 Subject: [PATCH 1/4] pdf2txt: do not double encode stdout --- pdfminer/converter.py | 4 +++- tools/pdf2txt.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 285d826..b01616e 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -164,7 +164,9 @@ class TextConverter(PDFConverter): return def write_text(self, text): - self.outfp.write(text.encode(self.codec, 'ignore')) + if self.codec: + text = text.encode(self.codec, 'ignore') + self.outfp.write(text) return def receive_layout(self, ltpage): diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 61d878f..2cf1572 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -85,6 +85,8 @@ def main(argv): outfp = open(outfile, 'wb') else: outfp = sys.stdout + if outfp.encoding is not None: + codec = None if outtype == 'text': device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter) From f9a67db89b508aaf3b97be42035729f11ba8e481 Mon Sep 17 00:00:00 2001 From: cybjit Date: Sun, 7 Sep 2014 18:36:12 +0200 Subject: [PATCH 2/4] change xrange to range --- pdfminer/cmapdb.py | 6 +++--- pdfminer/converter.py | 4 ++-- pdfminer/pdffont.py | 28 ++++++++++++++-------------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 1e1c94e..6dee4a7 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -358,7 +358,7 @@ class CMapParser(PSStackParser): e1 = nunpack(evar) vlen = len(svar) #assert s1 <= e1 - for i in xrange(e1-s1+1): + for i in range(e1-s1+1): x = sprefix+struct.pack('>L', s1+i)[-vlen:] self.cmap.add_code2cid(x, cid+i) return @@ -386,14 +386,14 @@ class CMapParser(PSStackParser): e1 = nunpack(e) #assert s1 <= e1 if isinstance(code, list): - for i in xrange(e1-s1+1): + for i in range(e1-s1+1): self.cmap.add_cid2unichr(s1+i, code[i]) else: var = code[-4:] base = nunpack(var) prefix = code[:-4] vlen = len(var) - for i in xrange(e1-s1+1): + for i in range(e1-s1+1): x = prefix+struct.pack('>L', base+i)[-vlen:] self.cmap.add_cid2unichr(s1+i, x) return diff --git a/pdfminer/converter.py b/pdfminer/converter.py index b01616e..ccf1d2a 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -99,7 +99,7 @@ class PDFLayoutAnalyzer(PDFTextDevice): # other shapes pts = [] for p in path: - for i in xrange(1, len(p), 2): + for i in range(1, len(p), 2): pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1]))) self.cur_item.add(LTCurve(gstate.linewidth, pts)) return @@ -254,7 +254,7 @@ class HTMLConverter(PDFConverter): def write_footer(self): self.write('
Page: %s
\n' % - ', '.join('%s' % (i, i) for i in xrange(1, self.pageno))) + ', '.join('%s' % (i, i) for i in range(1, self.pageno))) self.write('\n') return diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 770bc1d..a4bca61 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -45,7 +45,7 @@ def get_widths(seq): r.append(v) if len(r) == 3: (char1, char2, w) = r - for i in xrange(char1, char2+1): + for i in range(char1, char2+1): widths[i] = w r = [] return widths @@ -68,7 +68,7 @@ def get_widths2(seq): r.append(v) if len(r) == 5: (char1, char2, w, vx, vy) = r - for i in xrange(char1, char2+1): + for i in range(char1, char2+1): widths[i] = (w, (vx, vy)) r = [] return widths @@ -266,7 +266,7 @@ class CFFFont(object): self.fp = fp self.offsets = [] (count, offsize) = struct.unpack('>HB', self.fp.read(3)) - for i in xrange(count+1): + for i in range(count+1): self.offsets.append(nunpack(self.fp.read(offsize))) self.base = self.fp.tell()-1 self.fp.seek(self.base+self.offsets[-1]) @@ -283,7 +283,7 @@ class CFFFont(object): return self.fp.read(self.offsets[i+1]-self.offsets[i]) def __iter__(self): - return iter(self[i] for i in xrange(len(self))) + return iter(self[i] for i in range(len(self))) def __init__(self, name, fp): self.name = name @@ -323,9 +323,9 @@ class CFFFont(object): # Format 1 (n,) = struct.unpack('B', self.fp.read(1)) code = 0 - for i in xrange(n): + for i in range(n): (first, nleft) = struct.unpack('BB', self.fp.read(2)) - for gid in xrange(first, first+nleft+1): + for gid in range(first, first+nleft+1): self.code2gid[code] = gid self.gid2code[gid] = code code += 1 @@ -348,9 +348,9 @@ class CFFFont(object): # Format 1 (n,) = struct.unpack('B', self.fp.read(1)) sid = 0 - for i in xrange(n): + for i in range(n): (first, nleft) = struct.unpack('BB', self.fp.read(2)) - for gid in xrange(first, first+nleft+1): + for gid in range(first, first+nleft+1): name = self.getstr(sid) self.name2gid[name] = gid self.gid2name[gid] = name @@ -384,7 +384,7 @@ class TrueTypeFont(object): self.tables = {} self.fonttype = fp.read(4) (ntables, _1, _2, _3) = struct.unpack('>HHHH', fp.read(8)) - for _ in xrange(ntables): + for _ in range(ntables): (name, tsum, offset, length) = struct.unpack('>4sLLL', fp.read(16)) self.tables[name] = (offset, length) return @@ -397,7 +397,7 @@ class TrueTypeFont(object): fp.seek(base_offset) (version, nsubtables) = struct.unpack('>HH', fp.read(4)) subtables = [] - for i in xrange(nsubtables): + for i in range(nsubtables): subtables.append(struct.unpack('>HHL', fp.read(8))) char2gid = {} # Only supports subtable type 0, 2 and 4. @@ -413,7 +413,7 @@ class TrueTypeFont(object): firstbytes[k//8] = i nhdrs = max(subheaderkeys)//8 + 1 hdrs = [] - for i in xrange(nhdrs): + for i in range(nhdrs): (firstcode, entcount, delta, offset) = struct.unpack('>HHhH', fp.read(8)) hdrs.append((i, firstcode, entcount, delta, fp.tell()-2+offset)) for (i, firstcode, entcount, delta, pos) in hdrs: @@ -421,7 +421,7 @@ class TrueTypeFont(object): continue first = firstcode + (firstbytes[i] << 8) fp.seek(pos) - for c in xrange(entcount): + for c in range(entcount): gid = struct.unpack('>H', fp.read(2)) if gid: gid += delta @@ -438,10 +438,10 @@ class TrueTypeFont(object): for (ec, sc, idd, idr) in zip(ecs, scs, idds, idrs): if idr: fp.seek(pos+idr) - for c in xrange(sc, ec+1): + for c in range(sc, ec+1): char2gid[c] = (struct.unpack('>H', fp.read(2))[0] + idd) & 0xffff else: - for c in xrange(sc, ec+1): + for c in range(sc, ec+1): char2gid[c] = (c + idd) & 0xffff else: assert 0 From cc733c821730caa1246e1fc15b2d9ffd541413ee Mon Sep 17 00:00:00 2001 From: cybjit Date: Sun, 7 Sep 2014 18:38:22 +0200 Subject: [PATCH 3/4] fixes for ARC4 --- pdfminer/arcfour.py | 1 + pdfminer/pdfdocument.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pdfminer/arcfour.py b/pdfminer/arcfour.py index a5c8038..b2ba8a1 100644 --- a/pdfminer/arcfour.py +++ b/pdfminer/arcfour.py @@ -37,3 +37,4 @@ class Arcfour(object): encrypt = decrypt = process +new = Arcfour diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py index c892414..a49ae0b 100644 --- a/pdfminer/pdfdocument.py +++ b/pdfminer/pdfdocument.py @@ -342,7 +342,7 @@ class PDFStandardSecurityHandler(object): hash.update(self.docid[0]) # 3 result = ARC4.new(key).encrypt(hash.digest()) # 4 for i in range(1, 20): # 5 - k = b''.join(chr(ord(c) ^ i) for c in key) + k = b''.join(six.int2byte(c ^ i) for c in six.iterbytes(key)) result = ARC4.new(k).encrypt(result) result += result # 6 return result From a6f31a713dc662234fe60d1e52d282a795984da7 Mon Sep 17 00:00:00 2001 From: cybjit Date: Sun, 7 Sep 2014 18:41:04 +0200 Subject: [PATCH 4/4] cmap bytes and decode --- pdfminer/cmapdb.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 6dee4a7..e964b8f 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -180,11 +180,11 @@ class FileUnicodeMap(UnicodeMap): if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. self.cid2unichr[cid] = name2unicode(code.name) - elif isinstance(code, str): + elif isinstance(code, bytes): # Interpret as UTF-16BE. - self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore') + self.cid2unichr[cid] = code.decode('UTF-16BE', 'ignore') elif isinstance(code, int): - self.cid2unichr[cid] = unichr(code) + self.cid2unichr[cid] = six.unichr(code) else: raise TypeError(code) return @@ -379,7 +379,7 @@ class CMapParser(PSStackParser): if token is self.KEYWORD_ENDBFRANGE: objs = [obj for (__, obj) in self.popall()] for (s, e, code) in choplist(3, objs): - if (not isinstance(s, str) or not isinstance(e, str) or + if (not isinstance(s, bytes) or not isinstance(e, bytes) or len(s) != len(e)): continue s1 = nunpack(s) @@ -404,7 +404,7 @@ class CMapParser(PSStackParser): if token is self.KEYWORD_ENDBFCHAR: objs = [obj for (__, obj) in self.popall()] for (cid, code) in choplist(2, objs): - if isinstance(cid, str) and isinstance(code, str): + if isinstance(cid, bytes) and isinstance(code, bytes): self.cmap.add_cid2unichr(nunpack(cid), code) return