2020-04-28 08:58:42 +00:00
|
|
|
#!/usr/bin/env python3
|
2016-11-08 19:01:11 +00:00
|
|
|
|
2009-12-19 14:17:00 +00:00
|
|
|
import sys
|
2020-01-04 15:47:07 +00:00
|
|
|
import pickle as pickle
|
2014-09-11 21:34:09 +00:00
|
|
|
import codecs
|
2009-12-19 14:17:00 +00:00
|
|
|
|
|
|
|
|
2020-01-04 15:47:07 +00:00
|
|
|
class CMapConverter:
    """Build code-to-CID and CID-to-Unicode maps from tab-separated tables.

    All state lives in plain dictionaries that :meth:`load` fills in:

    * ``enc2codec``    -- {encoding name: Python codec name}; supplied by
      the caller and used to decode raw codes into characters.
    * ``code2cid``     -- {cmap name: nested dict}.  Each nesting level is
      keyed by one byte value of the code; the last level maps to the CID.
    * ``is_vertical``  -- {cmap name: True} for every vertical cmap seen.
    * ``cid2unichr_h`` -- {cid: character} for horizontal writing.
    * ``cid2unichr_v`` -- {cid: character} for vertical writing.
    """

    def __init__(self, enc2codec=None):
        """Create an empty converter.

        :param enc2codec: optional {encoding name: codec name} mapping.
            When omitted a fresh dict is created, so instances never share
            state through a mutable default argument.
        """
        self.enc2codec = {} if enc2codec is None else enc2codec
        self.code2cid = {}  # {'cmapname': ...}
        self.is_vertical = {}
        self.cid2unichr_h = {}  # {cid: unichr}
        self.cid2unichr_v = {}  # {cid: unichr}

    def get_encs(self):
        """Return a view of the cmap names registered so far."""
        return self.code2cid.keys()

    def get_maps(self, enc):
        """Return ``(hmap, vmap)`` for the table column named *enc*.

        The maps are created and registered in ``code2cid`` on first use.
        A column whose name already ends in ``-H`` has no vertical
        counterpart, so ``vmap`` is ``None`` in that case.
        """
        if enc.endswith('-H'):
            hmapenc, vmapenc = enc, None
        elif enc == 'H':
            hmapenc, vmapenc = 'H', 'V'
        else:
            hmapenc, vmapenc = enc + '-H', enc + '-V'
        hmap = self.code2cid.setdefault(hmapenc, {})
        vmap = None
        if vmapenc:
            self.is_vertical[vmapenc] = True
            vmap = self.code2cid.setdefault(vmapenc, {})
        return (hmap, vmap)

    @staticmethod
    def _decode_hex(code):
        """Decode a hexadecimal string into ``bytes``.

        An odd number of digits is padded with a leading zero so that the
        result is always ``bytes``; a ``str`` here would break both the
        later ``.decode()`` call and the integer keys of the code maps.
        Non-hexadecimal input raises ``binascii.Error`` (a ``ValueError``).
        """
        if len(code) % 2:
            code = '0' + code
        return codecs.decode(code, 'hex_codec')

    @staticmethod
    def _put(dmap, code, cid, force=False):
        """Store *cid* under the byte sequence *code* in nested dict *dmap*.

        An existing entry is only overwritten when *force* is true.
        """
        for b in code[:-1]:
            dmap = dmap.setdefault(b, {})
        b = code[-1]
        if force or b not in dmap or dmap[b] == cid:
            dmap[b] = cid

    def _add(self, unimap, enc, code):
        """Count the character that *code* decodes to under *enc*.

        Columns without a configured codec, undecodable codes and codes
        that decode to more than one character are silently ignored.
        """
        try:
            codec = self.enc2codec[enc]
            c = code.decode(codec, 'strict')
        except (KeyError, UnicodeError):
            return
        if len(c) == 1:
            unimap[c] = unimap.get(c, 0) + 1

    @staticmethod
    def _pick(unimap):
        """Return the most frequent character of the non-empty *unimap*.

        Ties are resolved in favour of the lowest code point.
        """
        (c, _) = max(unimap.items(), key=lambda x: (x[1], -ord(x[0])))
        return c

    def load(self, fp):
        """Read a tab-separated table from the line iterable *fp*.

        Text after ``#`` and blank lines are skipped.  The first remaining
        line is the header and must start with ``CID``; every later line
        holds a CID followed by one cell per header column.  A cell is
        either ``*`` (no mapping) or a comma-separated list of hexadecimal
        codes, where a trailing ``v`` marks a vertical-only code.
        """
        encs = None
        for line in fp:
            (line, _, _) = line.strip().partition('#')
            if not line:
                continue
            values = line.split('\t')
            if encs is None:
                assert values[0] == 'CID', str(values)
                encs = values
                continue

            cid = int(values[0])
            unimap_h = {}
            unimap_v = {}
            for (enc, value) in zip(encs, values):
                if enc == 'CID' or value == '*':
                    continue

                # hcodes, vcodes: encoded bytes for each writing mode.
                hcodes = []
                vcodes = []
                for code in value.split(','):
                    vertical = code.endswith('v')
                    if vertical:
                        code = code[:-1]
                    code = self._decode_hex(code)
                    if vertical:
                        vcodes.append(code)
                        self._add(unimap_v, enc, code)
                    else:
                        hcodes.append(code)
                        self._add(unimap_h, enc, code)

                # add cid to each map.
                (hmap, vmap) = self.get_maps(enc)
                if vcodes:
                    assert vmap is not None
                    for code in vcodes:
                        self._put(vmap, code, cid, True)
                    for code in hcodes:
                        self._put(hmap, code, cid, True)
                else:
                    for code in hcodes:
                        self._put(hmap, code, cid)
                        # Columns ending in '-H' have no vertical map.
                        if vmap is not None:
                            self._put(vmap, code, cid)

            # Determine the "most popular" candidate.
            if unimap_h:
                self.cid2unichr_h[cid] = self._pick(unimap_h)
            if unimap_v or unimap_h:
                self.cid2unichr_v[cid] = self._pick(unimap_v or unimap_h)

    def dump_cmap(self, fp, enc):
        """Write the pickled (protocol 2) cmap named *enc* to binary *fp*."""
        data = dict(
            IS_VERTICAL=self.is_vertical.get(enc, False),
            CODE2CID=self.code2cid.get(enc),
        )
        fp.write(pickle.dumps(data, 2))

    def dump_unicodemap(self, fp):
        """Write both pickled (protocol 2) CID-to-character maps to *fp*."""
        data = dict(
            CID2UNICHR_H=self.cid2unichr_h,
            CID2UNICHR_V=self.cid2unichr_v,
        )
        fp.write(pickle.dumps(data, 2))
|
2009-12-19 14:17:00 +00:00
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
2009-12-19 14:17:00 +00:00
|
|
|
def main(argv):
    """Command-line entry point.

    Reads every table named on the command line, then writes one
    ``<cmap>.pickle.gz`` file per cmap plus ``to-unicode-<regname>.pickle.gz``
    into the output directory.

    :param argv: full argument vector, program name first.
    :return: 100 after printing the usage text when the arguments are
        invalid, otherwise ``None``.
    """
    import getopt
    import gzip
    import os.path

    def usage():
        print('usage: %s [-c enc=codec] output_dir regname [cid2code.txt ...]'
              % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    enc2codec = {}
    for (k, v) in opts:
        if k == '-c':
            (enc, _, codec) = v.partition('=')
            enc2codec[enc] = codec
    if not args:
        return usage()
    outdir = args.pop(0)
    if not args:
        return usage()
    regname = args.pop(0)

    converter = CMapConverter(enc2codec)
    for path in args:
        print('reading: %r...' % path)
        # Context managers close the files even if loading/dumping fails.
        with open(path) as fp:
            converter.load(fp)

    for enc in converter.get_encs():
        fname = '%s.pickle.gz' % enc
        path = os.path.join(outdir, fname)
        print('writing: %r...' % path)
        with gzip.open(path, 'wb') as fp:
            converter.dump_cmap(fp, enc)

    fname = 'to-unicode-%s.pickle.gz' % regname
    path = os.path.join(outdir, fname)
    print('writing: %r...' % path)
    with gzip.open(path, 'wb') as fp:
        converter.dump_unicodemap(fp)
    return
|
2009-12-19 14:17:00 +00:00
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the converter and hand its result to the shell as exit status.
    exit_status = main(sys.argv)  # type: ignore[no-untyped-call]
    sys.exit(exit_status)
|