From 8ab2e287be67185cc120597e6785c1b81df6b842 Mon Sep 17 00:00:00 2001 From: John Kesegich Date: Mon, 25 Feb 2019 11:33:18 -0600 Subject: [PATCH 01/13] Handle PDFStream as character map name in PDFCIDFont --- pdfminer/pdffont.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index feb8557..1487bab 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -16,6 +16,7 @@ from . import settings from .psparser import PSLiteral from .psparser import literal_name from .pdftypes import PDFException +from .pdftypes import PDFStream from .pdftypes import resolve1 from .pdftypes import int_value from .pdftypes import num_value @@ -654,6 +655,17 @@ class PDFCIDFont(PDFFont): if strict: raise PDFFontError('Encoding is unspecified') name = 'unknown' + if type(name) is PDFStream: + if 'CMapName' in name: + name = name.get('CMapName').name + if name == 'DLIdent-H': + name = 'Identity-H' + elif name == 'DLIdent-V': + name = 'Identity-V' + else: + if strict: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' try: self.cmap = CMapDB.get_cmap(name) except CMapDB.CMapNotFound as e: From c022358c8df6f2b6b9affc8c4bfc716aa222dcee Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 13 Jul 2019 04:52:24 +0530 Subject: [PATCH 02/13] Encapsulates character map name --- pdfminer/pdffont.py | 60 ++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 1487bab..fdebe33 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -649,29 +649,8 @@ class PDFCIDFont(PDFFont): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) - try: - name = literal_name(spec['Encoding']) - except KeyError: - if strict: - raise PDFFontError('Encoding is unspecified') - name = 'unknown' - if type(name) is PDFStream: - if 'CMapName' in name: - name = name.get('CMapName').name - if name == 'DLIdent-H': - name = 'Identity-H' - elif name == 'DLIdent-V': - name = 'Identity-V' - else: - if strict: - raise PDFFontError('Encoding is unspecified') - name = 'unknown' - try: - self.cmap = CMapDB.get_cmap(name) - except CMapDB.CMapNotFound as e: - if strict: - raise PDFFontError(e) - self.cmap = CMap() + self.cmap = (spec, strict) + try: descriptor = dict_value(spec['FontDescriptor']) except KeyError: @@ -718,6 +697,41 @@ class PDFCIDFont(PDFFont): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return + @property + def cmap(self): + return self._cmap + + @cmap.setter + def cmap(self,values): + spec, strict = values + try: + spec_encoding = spec['Encoding'] + if hasattr(spec_encoding, 'name'): + name = literal_name(spec['Encoding']) + else: + name = literal_name(spec_encoding['CMapName']) + except KeyError: + if strict: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' + if type(name) is PDFStream: + if 'CMapName' in name: + name = name.get('CMapName').name + if name in('DLIdent-H','OneByteIdentityH','Identity-H') : + name = 'Identity-H' + elif name in ('DLIdent-V','OneByteIdentityV','Identity-V'): + name = 'Identity-V' + else: + if strict: + raise PDFFontError('Encoding is unspecified') + name = 'unknown' + try: + self._cmap = CMapDB.get_cmap(name) + except CMapDB.CMapNotFound as e: + if strict: + raise PDFFontError(e) + self._cmap = CMap() + def __repr__(self): return '' % (self.basefont, self.cidcoding) From 8e4a82ad8b249ddd1bc5cd42f6df992e15c0a01d Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 13 Jul 2019 05:00:25 +0530 Subject: [PATCH 03/13] Corrects Indentation --- pdfminer/pdffont.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index fdebe33..85c23b5 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -697,7 +697,7 @@ class PDFCIDFont(PDFFont): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - @property + @property def cmap(self): return self._cmap From cc40af3d2b84a8532988dfe70c3d6c3e15e29ffa Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Mon, 15 Jul 2019 14:21:21 +0530 Subject: [PATCH 04/13] Removes @property, Adds docstring --- pdfminer/pdffont.py | 53 ++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 85c23b5..f5b8942 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -128,7 +128,14 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') - +CMAP_ENCODER = { + 'DLIdent-H': 'Identity-H', + 'OneByteIdentityH': 'Identity-H', + 'Identity-H': 'Identity-H', + 'DLIdent-V': 'Identity-V', + 'OneByteIdentityV': 'Identity-V', + 'Identity-V': 'Identity-V' +} ## CFFFont ## (Format specified in Adobe Technical Note: #5176 @@ -649,7 +656,7 @@ class PDFCIDFont(PDFFont): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) - self.cmap = (spec, strict) + self.cmap_setter(spec, strict) try: descriptor = dict_value(spec['FontDescriptor']) @@ -697,40 +704,42 @@ class PDFCIDFont(PDFFont): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - @property - def cmap(self): - return self._cmap - - @cmap.setter - def cmap(self,values): - spec, strict = values + def cmap_setter(self, spec, strict): + """ + For certain PDFs, Encoding Type isn't mentioned as an attribute of + Encoding but as an attribute of CMapName, where CMapName is an + attribure of spec['Encoding']. + The horizaontal/vertical modes are mentioned with diffrent name + such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V' + """ try: spec_encoding = spec['Encoding'] if hasattr(spec_encoding, 'name'): - name = literal_name(spec['Encoding']) + cmap_name = literal_name(spec['Encoding']) else: - name = literal_name(spec_encoding['CMapName']) + cmap_name = literal_name(spec_encoding['CMapName']) except KeyError: if strict: raise PDFFontError('Encoding is unspecified') - name = 'unknown' - if type(name) is PDFStream: - if 'CMapName' in name: - name = name.get('CMapName').name - if name in('DLIdent-H','OneByteIdentityH','Identity-H') : - name = 'Identity-H' - elif name in ('DLIdent-V','OneByteIdentityV','Identity-V'): - name = 'Identity-V' + cmap_name = 'unknown' + if type(cmap_name) is PDFStream: + if 'CMapName' in cmap_name: + cmap_key = cmap_name.get('CMapName').cmap_name + try: + cmap_name = CMAP_ENCODER[cmap_key] + except: + cmap_name = cmap_key + raise PDFFontError('Unidentified encoding mentioned. %s is not supported' % cmap_name) else: if strict: raise PDFFontError('Encoding is unspecified') - name = 'unknown' + cmap_name = 'unknown' try: - self._cmap = CMapDB.get_cmap(name) + self.cmap = CMapDB.get_cmap(cmap_name) except CMapDB.CMapNotFound as e: if strict: raise PDFFontError(e) - self._cmap = CMap() + self.cmap = CMap() def __repr__(self): return '' % (self.basefont, self.cidcoding) From fa400431f571de9197ab72ffc5dd9f7d76826474 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Wed, 17 Jul 2019 11:38:00 +0530 Subject: [PATCH 05/13] Adds Test, Removes Unnecessary Assumptions --- pdfminer/pdffont.py | 24 ++++------------- tests/test_pdfencoding.py | 56 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 19 deletions(-) create mode 100644 tests/test_pdfencoding.py diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index f5b8942..a09c5c4 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -128,14 +128,7 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') -CMAP_ENCODER = { - 'DLIdent-H': 'Identity-H', - 'OneByteIdentityH': 'Identity-H', - 'Identity-H': 'Identity-H', - 'DLIdent-V': 'Identity-V', - 'OneByteIdentityV': 'Identity-V', - 'Identity-V': 'Identity-V' -} +IDENTITY_ENCODER = ('Identity-H', 'Identity-V') ## CFFFont ## (Format specified in Adobe Technical Note: #5176 @@ -724,21 +717,14 @@ class PDFCIDFont(PDFFont): cmap_name = 'unknown' if type(cmap_name) is PDFStream: if 'CMapName' in cmap_name: - cmap_key = cmap_name.get('CMapName').cmap_name - try: - cmap_name = CMAP_ENCODER[cmap_key] - except: - cmap_name = cmap_key - raise PDFFontError('Unidentified encoding mentioned. %s is not supported' % cmap_name) + cmap_name = cmap_name.get('CMapName').name else: if strict: - raise PDFFontError('Encoding is unspecified') + raise PDFFontError('CMapName unspecified for encoding') cmap_name = 'unknown' - try: + if cmap_name in IDENTITY_ENCODER: self.cmap = CMapDB.get_cmap(cmap_name) - except CMapDB.CMapNotFound as e: - if strict: - raise PDFFontError(e) + else: self.cmap = CMap() def __repr__(self): diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py new file mode 100644 index 0000000..4725615 --- /dev/null +++ b/tests/test_pdfencoding.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +# -*- coding: utf-8 -*- + +import nose, logging, os +from pdfminer.cmapdb import IdentityCMap, CMap +from pdfminer.pdffont import PDFCIDFont +from pdfminer.pdftypes import PDFStream +from pdfminer.psparser import PSLiteral + +# 'DLIdent-H': 'Identity-H', +# 'OneByteIdentityH': 'Identity-H', +# 'Identity-H': 'Identity-H', +# 'DLIdent-V': 'Identity-V', +# 'OneByteIdentityV': 'Identity-V', +# 'Identity-V': 'Identity-V' + +class TestPDFEncoding(): + + def test_cmapname_onebyteidentityV(self): + stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_onebyteidentityH(self): + stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_V(self): + stream = PDFStream({'CMapName': PSLiteral('V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_cmapname_H(self): + stream = PDFStream({'CMapName': PSLiteral('H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, CMap) + + def test_encoding_identityH(self): + spec = {'Encoding': PSLiteral('Identity-H')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV(self): + spec = {'Encoding': PSLiteral('Identity-V')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + +if __name__ == '__main__': + nose.runmodule() From b4c261b647f42844981e083f9920240ccea2a0dc Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Wed, 17 Jul 2019 11:43:45 +0530 Subject: [PATCH 06/13] Removes Code Comments --- tests/test_pdfencoding.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 4725615..9ed4e9e 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -8,13 +8,6 @@ from pdfminer.pdffont import PDFCIDFont from pdfminer.pdftypes import PDFStream from pdfminer.psparser import PSLiteral -# 'DLIdent-H': 'Identity-H', -# 'OneByteIdentityH': 'Identity-H', -# 'Identity-H': 'Identity-H', -# 'DLIdent-V': 'Identity-V', -# 'OneByteIdentityV': 'Identity-V', -# 'Identity-V': 'Identity-V' - class TestPDFEncoding(): def test_cmapname_onebyteidentityV(self): From f1a4dcea88e2ada84f737c23b12bf0d3f9a57c49 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Wed, 24 Jul 2019 11:56:06 +0530 Subject: [PATCH 07/13] Adds Test Cases, Neater Code For CMap Assignment --- pdfminer/pdffont.py | 8 ++++---- tests/test_pdfencoding.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index a09c5c4..9f24afb 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -649,7 +649,7 @@ class PDFCIDFont(PDFFont): self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) - self.cmap_setter(spec, strict) + self.cmap = self.get_cmap_from_spec(spec, strict) try: descriptor = dict_value(spec['FontDescriptor']) @@ -697,7 +697,7 @@ class PDFCIDFont(PDFFont): PDFFont.__init__(self, descriptor, widths, default_width=default_width) return - def cmap_setter(self, spec, strict): + def get_cmap_from_spec(self, spec, strict): """ For certain PDFs, Encoding Type isn't mentioned as an attribute of Encoding but as an attribute of CMapName, where CMapName is an @@ -723,9 +723,9 @@ class PDFCIDFont(PDFFont): raise PDFFontError('CMapName unspecified for encoding') cmap_name = 'unknown' if cmap_name in IDENTITY_ENCODER: - self.cmap = CMapDB.get_cmap(cmap_name) + return CMapDB.get_cmap(cmap_name) else: - self.cmap = CMap() + return CMap() def __repr__(self): return '' % (self.basefont, self.cidcoding) diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 9ed4e9e..396d12d 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -44,6 +44,34 @@ class TestPDFEncoding(): font = PDFCIDFont(None, spec) assert isinstance(font.cmap, IdentityCMap) + def test_encoding_identityH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('Identity-H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('Identity-V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityH_as_stream(self): + stream = PDFStream({'CMapName':'Identity-H'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_identityV_as_stream(self): + stream = PDFStream({'CMapName':'Identity-V'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_font_without_spec(self): + font = PDFCIDFont(None, {}) + assert isinstance(font.cmap, CMap) + if __name__ == '__main__': nose.runmodule() From 5a0d8db052115465bfe27c08bb2d20e087d5f305 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 10 Aug 2019 10:07:23 +0530 Subject: [PATCH 08/13] Adds decoder for OnebyteIdentityH/V instead of using default CMap --- pdfminer/cmapdb.py | 13 +++++++++++++ pdfminer/pdffont.py | 10 ++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index c3403d1..f7f4a0b 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -131,6 +131,15 @@ class IdentityCMap(CMapBase): return () +class IdentityCMapByte(IdentityCMap): + + def decode(self, code): + n = len(code) + if n: + return struct.unpack('>%dB' % n, code) + else: + return () + ## UnicodeMap ## class UnicodeMap(CMapBase): @@ -252,6 +261,10 @@ class CMapDB(object): return IdentityCMap(WMode=0) elif name == 'Identity-V': return IdentityCMap(WMode=1) + elif name == 'OneByteIdentityH': + return IdentityCMapByte(WMode=0) + elif name == 'OneByteIdentityV': + return IdentityCMapByte(WMode=1) try: return klass._cmap_cache[name] except KeyError: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 9f24afb..17b80cd 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -128,7 +128,13 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') -IDENTITY_ENCODER = ('Identity-H', 'Identity-V') +IDENTITY_ENCODER = {'Identity-H':'Identity-H', + 'Identity-V':'Identity-V', + 'DLIdent-H':'Identity-H', + 'DLIdent-V':'Identity-V', + 'OneByteIdentityH':'OneByteIdentityH', + 'OneByteIdentityV':'OneByteIdentityV', + } ## CFFFont ## (Format specified in Adobe Technical Note: #5176 @@ -723,7 +729,7 @@ class PDFCIDFont(PDFFont): raise PDFFontError('CMapName unspecified for encoding') cmap_name = 'unknown' if cmap_name in IDENTITY_ENCODER: - return CMapDB.get_cmap(cmap_name) + return CMapDB.get_cmap(IDENTITY_ENCODER[cmap_name]) else: return CMap() From 5b210981c91cabdef9300e1951a087dc476c72fe Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 10 Aug 2019 10:19:20 +0530 Subject: [PATCH 09/13] Adds Test Case --- samples/sampleOneByteIdentityEncode.pdf | Bin 0 -> 13941 bytes tests/test_tools_pdf2txt.py | 1 + 2 files changed, 1 insertion(+) create mode 100644 samples/sampleOneByteIdentityEncode.pdf diff --git a/samples/sampleOneByteIdentityEncode.pdf b/samples/sampleOneByteIdentityEncode.pdf new file mode 100644 index 0000000000000000000000000000000000000000..35abc4f80d19587d1abb9ffba8bed18d606cdb0b GIT binary patch literal 13941 zcmb7rWmsIzmTrLH9^4xU9-zDF#@*fBg1fuBJ0Z9P2nhrU8XyFBcMSx0cL{bo-lI3b?!qo< zXY6J!Zf@#mX3j2e?qKO=1%N^Y1zp`-%#H2QymDO8)+VvRU=1LbGK$OwSRMF6R1DD+ z0g)p~noX-tvXEX`m=;x@-nN%H%**+E#ETb@XmkxUa|g5kkN_L~QONnXVl{UsCp&X{ za|bs7kX_Z>(i+C;%_st6GiPE~axpV^v39Uzlre)bSi5;M2@3v^_D2TvZy91T;%eTm zZsztf4i=7pKf+c2FAjk@IoZV&jGYvX?akSh9Lz<%-OT?}zBIeKx0Cr_kN?SQ1C;@r zE+9_JJHkhmX}4(n}?b5Z7F(oIEHi9~gKm zF@qa7a^9}pEYn0)KlR^882IL#Sgzm(Q^AFF?HiBGof7w*^!w+=b;>aIii2sEHD4J> zU|NukHdOJD*LR6IHZBy*#1nd{j@>#@`gqGQ^6i+^PY>c7c?HooKSV=4oIc%QzamVo z{P#q1{hfsWy@EJET>l)*cOZu@AclnJ2T`snyVx}JmxhycVzGrsh-T4f{NkMCmb*vy z1FtD~mB(v^gqT#W&MtkJFC6w0?W5l|XY;4mowDJ4t?nyhqAzT(Q5Q zr`;B)$%*6Cdg0sX^tgxgIJxAy{_eIZqF;~E@+;PCLS#-~jNI>=GWVM}CZQtMqF1vf zh*<4JJt9DO9S}Sp$`8dVJZGtPk>r4#ui8yp5zQGcWPA{Q|I!bhY|_$QJKnLh`m4{I0&PIxo#jcR|_LRl)2B{o^5 zreQh1U9+xtwQM6Ms%^>=(uVk}*_s^2n~xFSw1aUh2uxFD_$&1zXr}zJrn(pFofoc9 zccaYfT-VyPYS_xmNW`zul~nWj|2_U7;QvbkKtLW2$Umjv$j{qPedckO$1~I4oFXjMGm?Z3l;e1BLrY9hdZi`Mxy`@XCZvTD?%iR!z32^Ccsac($(W zL}!_1nMN7BD$&bNHWfnsOU)-dt_&}$7JZ+aZ|*kpcvf7Gb&em8cYH@N0dY8!1(;D1 zUv-Ak8fPDBuKlgNu}D)a*fU(mVtX=Ml!d~RJf?E)SS)FM==?&Jd*4O;utaJTHXjdM zu^rbQe}Sp_){jrLI@RgJPp7eW1j%PNc2ZHv+bdkQ7MlU`2%@1#D;%ypA>1mS#J%jn zl5-*VgzS5{A!%KYiau?-$B5X0$!;%?v8juaZ)5u-a8?q?Rpy`9m5V;zOf6T;<@4)| z?mqHSByN68P9EM{mU~D{P9~YV=g|v|10G*IDebp(-zAf{=eJmq{ftT*x-S$UrE^VQ zwD{y*tvVXojN_$~_;hzgSoHjaRv=KK@#O?=Sh?-@MoGcRlgf0VQGIk8UxA1HWn1g0 zun%c&AG`7P=hk~_>Px*#vemwq5DJC0m5r6qdYd{57mjhiqu&VOVS@yKAq^|~&3M@r z$w6B2`{RNwEUQI0zW`E!m=$5XiC6e9OMkPaBDaKWcg{zj=197|5iKZbz6O8)1TXO=p4PalxJaXOSbP#!52m`g4pX0UOvgM+XqGx z%NToDP3KP-0!W};P^?~BpNZ0&TuE%f9U2b&(>CH5d~)M_7&?_ z#f_;bKNeS2QIOthaqi#5$)*q3iWo@A`Ff6%jwCb|mf11B`enu;@iwitvJGiFY<;No zmxE=LOz4W&Zpcbl=2s)>@-UAEj!Cd!7;~A-@xFl8%lMLh0|X}Y@e{Af*P*;Ji4#`!ks$)_mb}8`OQ5K^5G_s zroT*y#pXfUX$jp-x}@b<;tKq<8A2hFMjtZs^%T#SlIkk3nDZvuxpu4d5BBzt9QIbi zjdbfQqNi!3!_>{G-Cv?=nM!Aoh%V~q>iimEHJ~Z2ydVw9TOH;r6+4tC9*>@{JC7(; z7M0j?vkK7?`D*Kg>zw#G2xUxTdDSMu7hI2(ZwbNu?S9k|@#7#WYn6A*YIiD4!_nE3 zPt1aGNQ~aGAEhg=GxJqjPv3lH2`N6Y5Jkrx*JpTa;d8{N;WQOt{gN zN_OgGeaK&&itITI3NWu+X`8re=}MoEEkI_svzU``ckS}LD#qqj4s0Uc$MT{Uclufv z^*@-tu)c?kYiZprO}tKt&VSt1%n)d4YVJ@AI0_f$R-dmU+iCh$z|EdHzof6IoI~>R zkd{%b5Bdc`Cpk$+T}lN8kd0c1h4A4yh-;>`Azq;virRBO|uTNEIqCqN^Tm(m0XKE#+&n- zMNdS`$f>u+TFk7ceTNpfM-x$7J!dOkt`3mZ8aSPURis+^>`5ZJ*F=K>WTQaKp;u?fWdHg!9bG9gz zOHqyZ<NYxH%-vhJJ9;rqGBNt+iThx^h>o^^dbEPD$$ zM|;_ZJ8(KY%%300a*r#=M(7h42u@X=)I7;B+_m81R|vu(5;2Xe<}>9fDmHN`bkJO) zB`xRv<4<^~O6_u-v6OSRc}csNXXZ1%M%PKxt>AAWjEA||t4U7_E4>Xnx;+cEL1D3C z^O8StZ&mjvb$oH%u9*_cn!hVfe0?%wLKm)DySEMn6(aY$O7><#I;M zznh{U0Ts~Wi#<3qZqpzJRo0gOL|jHLcSoye95lcbpQqOPWwnoFlk=GU!)I(dp56gl znO-a>Mi(Kj>e|QPehlh|V0GJ@mW4Eaob(@|_ zy$#6vwPGN_R5}Ox60>+nsUAM32x`1^5AAh(Gg9JV`q=6kR7OBGO)cH2Of>%)DSe!m z6Pgiq?<8F|Xt+w?yQZ*!q`b@C;-XdvM_X11EgxpWaqXwX7lBzbJ>oyRt;i7U^@|h= z?UPSCHI})3P(nzgFXXi)rEfnMZ|#pEX4dI@+B2}GMNY6M6DWw2^DBu9x^amvNgUBP zc16Ww9OlZ8!aKGm0Ej5Pr4QQi^bH!hbxBAoHYHeEt?2l42VyiaCeVl~swU9VzE0}0 zhl9ff@B?7)G{$E6wRC*L!c+M0z=Bf)v}gG>GW@*Hr}*KOs<&w23{3nYYtCqIKUZaU z5;4UY&C09=py@F#Ng8v6>oL74tV-&nN8?gy%@ja^ zAhTIOOSYhe_!q|xQSQsXs=ST;mP{5)Yq)P@`@0bO`7m7a?e%L;h;KmWXKzfruJb=} z-3`kv39bIJWOQ>*$0k*P`x5)ZNtlX^dvbBa-+Mgs6(foHW^3?MWMd`VHsh&W%tbNs zl?h-+!jEOi9)P1oVJD}+c1OS#P6(GMsw8INGe-VfFM^0N{!=LJAdUIHk!l+G@VSJn zFFcc!biJG7>V}eb<)w_Khw$%i)Dew=0k|stfotDH20%+nGD!w!>Y1WIpO*o33&`mu zN4~UTFWK>gcjfU;%Uh8#$ABTPPo784!8fZ&^i=%mDRNXMyy+SENy{~h2p`(Iz24eH z8ZNmRrDMg!C!2^BX|tm$3u1W_Vc0lstbr`^o@r(Y=O+f=HJRP^nxO03Z8TR1b6Ag$ zxpX61jCCf~fs*0utwW84!kWVsS_bluBIm%%c{gs$v|g{-1Ye+p$WTZtVg}ujYom{K zFCb@YpV%7lwRKN^fb$Niy7rRQ74{*)p*M5uHt^K#wCMAFdN3F!S1Xb^_rh+^?OiU=7Ki@fXRW5p_Iv)4ti1x0S=_`CP`^FN2Z zv$Nfn?SuMyujYSvQ0y4Y;Zu963n*)?gtcG`FT;CtY$_<^+|+)4wl*QnKFy~;d{f7Z zRzB-j=3Sz3s)y;VY5!Ft)zsWy^SwPqY68P5nv+DT7tL-TJU<-7f|wF3OEnCE5s7G3 z6hHK`5d||+AMWlZiN}g5C=DKh0FD*s$JqXYRn;=^aY;M!3ivkk8}M!T3^0&#wtedA zMWoo*pa=NPG3vMB>EV5cSReM7Wd1>vibk@EVbR}1GfE-hs;5RV$I7i_BVKdD0peOM`2o+8z>*M~ zO_5!ZRguYqfPsK7mZE3Y#I;(voK=&cwx^_V@@=@-Hk!JM25HR;IhN0<5f95TY_m}4 z`<)6DYZgm&Pm_l(QY%alcDt?`!8&Eo)+e-jjwu*@jeD|X!TTk5v?zglHdXnV51yFF zt5ZAaa@4Kl!Ru~P6ore=VL5rKC-HM8Xsa>HH)Xzj$i?ZA;33P1n!y9fsZlxuZH?vm zaQCfv4mcl4Ph*A6RxF#mi9yNs z>B^@QHm|;QI+cCt`1%O)f>*1TH*v?)ufA$2vcl{(V#)uKgBx?xJ|fe|6hszUT!$y3 zJvo4r4-Yhzykcabj1FleK3DGdPV!HxrM48|6y65ZCRMx1RdF#Q`-Z$lekN=te*k+i zc~Re{um0!|LS8}1!^%TzCH1580(;ST@!z^s@!gu7Dpzqd3eCS-BDBMDz~n^aLYzaM zL!3i82uK`J>F4D7b1IpUP!6b`4S(b-9tkZ zC=$*ND4MgvD3u{q!Cr-zLz~?ZIJoL$8EthZOntw(y{fBBj17*S#}4}Ple#Si-D8kp z^J8P43Wu&=<9K$H1A?m#X)sjz+7Hd!Gn-WBjD(~dD z%^6YwO@q}5MEPZ#kaPqrfA)sUmWKt3MrpxnympmljAx2xY(v@;lxNxpq0L{RnP2pSoY5f5qi^6vESnxA};&`e0 z2jxu4tCbUtD;&!B0(Yf9JG~xqAu&COaXD`Ek16&$+25BsW+ibOied6}%z8iE{Ik~n zl*0akflF4A?NeRAdlo@GQha8zgUS^{U)zgP#ID|vuU|;6Pv`k`XDfbuvbsK$Y-h>TC5Pr#i7xl7}|vy@}q3$CM;h*_f;%jc21t z=YMN&j~Yb>m}I`jCCLl|_y)x=BcYJKcf8n%H#?VTU|2Z;W4af=2!1b^+(2J_n0%I< z$j04La}^$w+Hk*#I7GDWb63{K>FDqLJ0T*-&*33!FW{knxR@{@hg|o)phJDwM!oR} zSM7(aEH3|F;&+v5+zQ55F8QAkO1u#dr$QyAkHfaTe&njY`wVJeU^vM6L0-y_>0y0( z)vs2u-*s~0)p zZSTvNUmQCp27)d>zX~}n^E|fBxF6kM?O5ctT}*g)T^A+VGg3?dC$f2}RY-Y!my8~M z^o~bVPX#`Wd?j)^<{JW2VlQ2P#HQqCWc9a@=!q$%UTA<|eWw0ug0-EC#A(LUdGNI)mn1o;_o+YXjh}K&B~T2G zv&eD?iAK#EF1irLTJPeQXrKKzkjndRV^e-K^Q}Rxd`>Kr?X-<=>dN zGrPaC)tJ{SaRRvs~Dd}aRSqt;0-$Nd6y3mMG3 z9nYM}H;@f-cG&ChmC<>sQ)$@RUS|7LEYq;FwPN@#qyd=_xtKQsc95{%%n=dJtO$9} zDn;;qMj}E!_FHdFrakdB30QcDCdtIxNhjpewD4sZF|~PUNR*YMa`#;xw?Q4WSr__3 zv~9N3K+HgK^;)SC3%-mAaVb{HI(O>2XIxtt)_GY+YqP}?yMS9`Q7qs!Ll3n48(bUnJU>}E==$mE6LR7@UF z;KQ^@oGwM*0Ve53n|C7M#a{wUPWmk1*our9n5AKYslhMCeY$>%M~z~MlCfeY ztDcK!nb0UjoY~IlRj0m;*Tk&Tt#D8yUsEBsj-gd>J^-xWC-5T-Iu3C3w-nLAWn`B$ zq<8Cn5TQ>XB|gpM!m$S-e)Xm~4KsWxi%QJ~<@h#(gt%j@pgy-WaZVg?xcfsvPWNVD zrNw3I`!>UtT$^Cq1;e>R>*HyHM&u;VRC2Dtj<@9KujcbBC*Q_m^HdC8gywZAX5NeZ zP2FCTTkBwU(p7XR#&r}4@`5Nm+r$0G2xSxXMpJdh(DRVZAZp2vGIH8q==djJY1+)G zoYUGR5peb+qY@QPtDe(p^z;}3Me^q)hS7IvJ&-QH0ape&Hj9`2(EB!w*Sh?C8}$WW zuXJ%}QFScL4`PHsk#>!-;RJjjQiHSgvdkzKdM!N6l5;U5ek|pcq>7`Ir>Gq~Pd4B!iNZbR^@# zE)^RDSL*}?r>$hu0|!@KS-Pr|kzbHKWGL|#Snj@8UWaYm92yHQcRc``QMA9u>y)lW zhXygkc6V7(^jbuOTeb&;Dc)?X<_V0G73`3dMDL<98QWuwMxk~Cfr5D0U2(g>paYx% zP8Rxoy{MGrk{=U2Q5ic>o7#wY+?Q|Ig8Gt2xjW&^X3xULqmpq&@7!%9z&18Q<6r52 zmK>*`JF|a-!@;CZZ^B4vfPlvK!F@( z-^G6R>qvqDkvjr9l#53Kg=%CQEIM3Fgv@FR<338s^~K75Tnr5W=cc$6-9f2W^K{4* zDQ~0`s zdqUL9F%m0Q<@IJ%ksj=FATNwW-B;v=fR~T$Yv~BUmOcx~F1u1|g2z3>nHbNdlkS7m z;gIuvR;|&A&H)-YR=@qWYYEEfqFMYr=us9w#eV^Je2&iS$p3UjwUAEq=w`7!gRqp5 zqto#C8QE)cLz`TiGfsf&%t{~0NfOz4+hU+rKl}nQP8Sm?4RoNVqp7f~Zu@)MszO>^ zN}484mc}gkv5m2p6j%W1L=j~&>|+cGJtt%4j`&Enb3pF>fi~7o)VK`6O=Tq>znHbD zBf;sS&-gBd{Vm3kuOSvlX3f>_K2xoDlXBbdc3iE38;+^Y4S^Na3g;*7)*FJ;8e(#z zDuFKq_4LsdK7$xEJTSp9W+6o?1VZ5>9FGWnXzQP7IFXUpsi%#W=icJWfgf zMJuIon{~6gbL?khy_aMy*H^r~bNi|*cSmkpVbsPK=Y+9|Zj3VEo#V}nfVx(7ed|(Z zG>L?%|6$#!=bq_~bnISc=isu^0ybc9Oseno!Mefc-tSA;0a?$*_sdQnhZKBR#;AKE3A3~|l)4IH<;uZWx(?;E9@RlkMb$-FV> zuFVu#uru?-{q?DXzB9l+u^PPfwfDOi<;vEFRv)__?0qyxTORHE?cu97mgbxKkhsAf zf|M*WNqOHYTMyZ)0+0FVBJu|Bvj^jbc#704!Mo9|)T;_li~G0b_1L+*%~dsSX1YEz zhtO14s#aPtM+GDY!Nq%~8-{UCTHQC8GE-Xz84!imh>3bIA;M5C7rP3MDrvp~_S!nZ z02YOhhyv1=0RWVw$Pk%2{f%`r!e$i0rGx>wGdiwVfgo0SSk6e$$5{NO51mvIZNEa( zS9yFhCf2Y%=d={>$audANHdUrO_i9`lyED=FY8hf@e84!pOBqny$7jTJywfjeh(hC z4-ZE!>}z<`kZ1!=x0qO2uwUA^h(s}Ua4F}jMj96s6gUpJ*EBC!q*8FAQ*#YkDk&gF zs(CzCMO(%15YxEDuWgd>Ibrhe}c;`GLd2B+|4YVHi}`;r^FP9)5<_9)A%tcRz5 z>^po*Id*e}=BLtleeU(ZFI2}DZL(j!rtCaw2K040mulY~t@L}6cYG(z+u1maqS_!= z>2mhSL9(5+8&&Zb3Ik~gZ7)_*GFSG0=ul|>t#33rzSt<6lt%WTos<_@FVhQUPGat5 z>m(#7U<#csXT%gK4;B)AOG}?Own>7J$bK%s-?gDR8aG`*swsi+Vm&3AqO?$vm<(gk z;0;I!fk$1VtU6|kJM5C2NBM=~t=|OnN;N<1cWF5KS|lnNkRI7`v;Q6XwOh4VW$S1H z8VC70k%23oiSF3Bj*TZK{dbX{_P-LHfEgPPkK2 z29#dT+TB+!Mp_k0trI?3K)mP|ttufS?63JcD1U@EDv*jMN0hS)V$KG!p9O5U)z=Fb zsmYBMHNTK;zwqe@+Vx-(s<9`SniR(t;C0QMw$X5VJLs^#y4_NLK;*@I-R4|$G_kVY zn-KU$7hGvEqo(93;3}ke?G0*kP|AG>&1G7D&qLsOMECa2GgPZ&on5pjJxbR|V8)ox zL(o1%;mB?(hP=u7pl+bx%{#1`_rz9r$@tg?SW-1Hpp-QW@REE)cD6Q+Iv%7315%Tb z*TNvBSh!c{^Q%;rF@h?ZYLb6eT7PGZLr-(AJc3}aW$}073@a3nJ;Cmt05^*|*xS=-vf4OHrYYGNGxncPBH zQz|1qmPpwu2bNp>-}W@ zy8K>yGPzOFvi7%K!t{m^BnJZv&Wd zo3sXJ(9J5RdtdMJ+e>B}RMFS#h>wz5{$M(_ntW9;!>lVO24pW#<%@DnE?@&sOGGqr zMvDi~sOTk^RP2(uDcIkCSLdVSRB~n)kJx;TGxIsx!Ke8I%;fVXE1Gg=rKE4|g8lK7 zNT{={@zaOr{o1mVCu)?R4^uOV0yk60S%Dy^pN+m0#d7s3H>ZD`H^18rc2}U6+f3_C zV8hL&GtJtss>KJa@;UK`16ABEY`t%q2*^I=VqEOxa~_E*wU3Ijg%-9Tt9~m?k_zgN zPz7E3P8OZDRCz`BZ-R0{$i5!~S)<9@+mGd6zh?enxe9l0&_~45F8|&9WPNYg>?WIO#2nj)X9GWKVk9^$j znvTyamOKfpw*z`-Tsb^?FFvs7&`McgGBKUGxt;1edt681F6hu!UM2+|d&hr3x@>&? z6}-CTgF#mKjkg)gR1EUD>viYH6*IMd#h{6^`9VGR5VXBZANhLe*BQdE?p$mEY&5|} zfm2NKcw-WZ2tL?q>{McRq&f}#aBnMI-l(XW6j}}^e^kWh$x_Cm2h5wa-D|VaqZ2OQ z)aOL74^}_)c0hQbSMCi8curevH>%`@AZarUZ(Vs$9Ic28RDZH#nD z4Sl=AO_SPqBvv#C3TF9V2IT8ezng6(QnR#O^p04jC`x5~Nddf%;3CmZ*PDBfkg)TW z&v*oXFF8c4fC+ngqee8ZYv(5Y{p|+(!iCZ6Gj-1yAKGH{y%_i#Ch+6Fc`cs3riMG@ ztF>R_M(rJI`>n5`qaTg)1P^~|=Ix>Bubapp8N4}O5`~QP;V`y zukciKqo)cZ-=WpX$R?y*4!?FKt0DQRz8jV3(&MZ4DhInrfR5c25%JKI@?@Cl3xHZ_7y?HdX{LTdJ9Nsa-KR zC1^@{LNHcykdoci?PpL>_A-clieiR&2|C_NoND1dXejJ|U9S8@TqDMi?Fb*4UG+7qky5=B7g)3{!uM#p?_NG_AonqS8YEtKCj)Te1>G!IWt-{c zB@iv1q9GW<`8tQS&~%K+<6EJ8*^Ak264I7lH;J6z_-P_RG}`gX0<_54ZP9?O`13W2 z)hWG87V|>C=;%(nWYU<3c(roK`Bl4kqE0^vx_}Y&+pA87{xvj%w$zUVMi*@Y+SZNF zeSwNB-=UtDIW9_jE!diN4*IuF*I^Ic-G#ExyLWgTgM^1AeqdIJt*<0t$kdrIO$(dK z^R0x>^)c$~rQh6Y$ng95+s^YnS_Za^Bw4Suqn38NO@Ex)k@SZ*@g63oBUSO^d+k@> zzKF<6o}>1oz)#w~q@fp?u@qFuaY5;0iX2uB5W!k&44~GfJhZePhj(vEeDTg0Zw)*# zU!D|@nD2{Xu~GG7vr?=8Yqw}Nnt(I%4+V09_(B){yxphd3 z{irNK#D`?GiYP>)K2jqpYBW-~mCcP=bZvlTSn|g>%B@9bY-6G#&_i(XJFrhYgz)LU#^58x&*YJ&_qkw^e@S7t|3C zGembE7(|5=@g7xbDxB80s=YGdDrb{FM$)jdL5vtc{sQsOdpnG|E@Md=F@8fabhji{ z5AsTZG74f(NSaEC@7dn+se;7?4@HPYLeLl;co=tOFO|x6^$TpAon!bA&ee}bnqmtSaiKo*&r@p(abtL9afIfc{3%L5u<(?Bk zz$SS9IbjaiX-mE@GJKzT_a(TBY`=yfDYdUKFR!jsYHbIjM3KjbAHa( z(CV!Aqpq@}1j;!96E#2H=6<2#FSl~quA;fCA6^TNi(5Bd8j@HZrYXT96XaHf zmpndE;23AiGL#7HA(b{g^}prb2(Jsi%TO@75&nIsyH2^bafd^t%A+ol)udjmUOe43 z!?00aL7CFl?bU|Se z@HoKV<=6l2#U6@G_2`L$SSn<1rD(z5`ms$~3VBkPR2sn=fLRm_NkaPnn>;?}+{)MS zWo7DzJ(=gl>H-!ENvOm^wS|6SZNb0$E-U_&d+6PNBWzf)iPe82@->n_cXRiathNb3 zpwALOc7wtz34VV0KEgJ0B{BhYEWOP;4!Y*%{d{dOv}8lq2CpN^BC|dB(NadUQv7UU z*k0;5Fl!`egSswTAZWj_E6KkkVIk_=+8_OAJUc>5(gI6etDZaM5&e-q%5P)Jfnk}9 zKrS02EL!$UdxlJ*X!40+!RV!D>x6Zx5&X92Z28k>5DWKHLuQ4z?cQ_!$V&1A$B;1+uHbaN?Q(5C``^px%;> z4sL&zyI4E9Il=(&uqh15Yz_we1+9hQ$nA_RT>+549{hzE*OXFHkq}`~Gq-d!*O2*( zE-LEir3d^=JAela0&sJJ4A^DdjP0yVMI0>c%wfoOk$)HnGvIFoI|%f*2md>I8pI7- z@?QY_qcH_JQ4##eGh}_zqDxWc3J3;1o+xgCL1yY@p%reO=Y*8i6xR)~Y|#L*kT@zDh(ZxqK;r|)$t%Q&P{G$HT^2V^z z=jX@12I+UNW&Z1{8h!M_Qy)X;-xJ6Ex7*@c04N6!`~L*sS93RU`wNRNBmO5Te{lI= zc2Q$j^FK58{~-&1XA$tHG*ti2FT0i=P>&PBX#n5=>w&o-1^@_9j{^t>z<#hXL=Oz& zf{}m#*c~?K)`Qvf$FBeU8Nder1C9I7Is;gV>G43Je<}?C<>G+Nx%Ie!AOI8y)5itY ztU!3U4FKFgJt!v-b_eT0dAMOJK(H}XkCPiFlbcHq2H7?M z@W6PuU_3lrdQc$D9&T;`gcA(A!^U6^00)TUA7f5V02l)E1q9^MgYa7#0a24{QM8 zgqaRw218-C!d4ml$(XvMl#Dp6j{+v*Pq_cCYslaE{m&%+$sX{3&E8)<7k{Ny!U5KL z0^7WRKiwU$3_5B!SpR880sLt+P-XweJ~uOl?NL|}xa$3>BvpgIHFNx}SrrTg{4wO8 z?E0teCI(v$J1|@Ue_CMvqf(TOEzMyaD6Wp~E~e(Le|lnIOn^TkVKUfN9o=B9F|eJi zU~Xn@{ErO}1VDM9Y+!BxHw47S&11l>Z0z#q7y|tz^>4HOn*T?m&0oI$QRT`m=IH*1 znO)A>%vJBN82*zhu+08fbIxCD#EfAFilZe=2W;)XtN5p&E4w(Fxtp51Fj}jdd%3Au z8M`>W13=jzY!JW~Fc1i21p{HyLFydZ>HtwqQ4xSXqctoXE)M2y0Ao{_@2&tTdlP9s z0POrScW^Z~19+IbxLP|p=rb{~i@`d9++bsISTq>Ld10bBft(yb5CjUlumHgfKp+DX zyMm+H|CQnI*m${^Tc7~}ASfElvj6x1IH6DuD8K^n4;qLA2!*-#=L2x~4;pL?g4M{s z&|rM9%KmROFb`~>|0@j$1Oxv&A50tvta0dHH8Nx zAdrU(RSuUwJjqK(BtPZ9Sdibs z!~y~anLxRrU=vOg9&R%pV^bb(6EM`22Vx9{y@USLCLP$=)C|mO3X{gg3N?eRVTEvm fOw3sSsxCBFH)9vKzY+@KfIxZB=;$PrB+>p4`g0MS literal 0 HcmV?d00001 diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index 18be203..6126d92 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -24,6 +24,7 @@ class TestDumpPDF(): run('../samples/','simple1') run('../samples/','simple2') run('../samples/','simple3') + run('../samples/','sampleOneByteIdentityEncode') def test_2(self): run('../samples/nonfree/','dmca') From 3125d3634adb4f3f395322d254ffa3e4bd4a73de Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Sat, 10 Aug 2019 11:03:28 +0530 Subject: [PATCH 10/13] Correct old test cases --- tests/test_pdfencoding.py | 40 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py index 396d12d..5878eba 100644 --- a/tests/test_pdfencoding.py +++ b/tests/test_pdfencoding.py @@ -3,7 +3,7 @@ # -*- coding: utf-8 -*- import nose, logging, os -from pdfminer.cmapdb import IdentityCMap, CMap +from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte from pdfminer.pdffont import PDFCIDFont from pdfminer.pdftypes import PDFStream from pdfminer.psparser import PSLiteral @@ -14,13 +14,13 @@ class TestPDFEncoding(): stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '') spec = {'Encoding': stream} font = PDFCIDFont(None, spec) - assert isinstance(font.cmap, CMap) + assert isinstance(font.cmap, IdentityCMapByte) def test_cmapname_onebyteidentityH(self): stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '') spec = {'Encoding': stream} font = PDFCIDFont(None, spec) - assert isinstance(font.cmap, CMap) + assert isinstance(font.cmap, IdentityCMapByte) def test_cmapname_V(self): stream = PDFStream({'CMapName': PSLiteral('V')}, '') @@ -68,6 +68,40 @@ class TestPDFEncoding(): font = PDFCIDFont(None, spec) assert isinstance(font.cmap, IdentityCMap) + def test_encoding_DLIdentH(self): + spec = {'Encoding': PSLiteral('DLIdent-H')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentV(self): + spec = {'Encoding': PSLiteral('DLIdent-V')} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('DLIdent-H')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH_as_PSLiteral_stream(self): + stream = PDFStream({'CMapName':PSLiteral('DLIdent-V')}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentH_as_stream(self): + stream = PDFStream({'CMapName':'DLIdent-H'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + + def test_encoding_DLIdentV_as_stream(self): + stream = PDFStream({'CMapName':'DLIdent-V'}, '') + spec = {'Encoding': stream} + font = PDFCIDFont(None, spec) + assert isinstance(font.cmap, IdentityCMap) + def test_font_without_spec(self): font = PDFCIDFont(None, {}) assert isinstance(font.cmap, CMap) From 3d549ea48c11a50d427f4636fb060eac654c044d Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Tue, 20 Aug 2019 16:48:40 +0530 Subject: [PATCH 11/13] Removes code comments --- pdfminer/cmapdb.py | 28 ++-------------------------- pdfminer/pdffont.py | 44 +++++++++----------------------------------- 2 files changed, 11 insertions(+), 61 deletions(-) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 83110e7..1681a8d 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -1,5 +1,3 @@ - - """ Adobe character mapping (CMap) support. CMaps provide the mapping between character codes and Unicode @@ -40,8 +38,6 @@ class CMapError(Exception): pass -## CMapBase -## class CMapBase(object): debug = 0 @@ -67,8 +63,6 @@ class CMapBase(object): return -## CMap -## class CMap(CMapBase): def __init__(self, **kwargs): @@ -119,8 +113,6 @@ class CMap(CMapBase): return -## IdentityCMap -## class IdentityCMap(CMapBase): def decode(self, code): @@ -140,8 +132,7 @@ class IdentityCMapByte(IdentityCMap): else: return () -## UnicodeMap -## + class UnicodeMap(CMapBase): def __init__(self, **kwargs): @@ -162,8 +153,6 @@ class UnicodeMap(CMapBase): return -## FileCMap -## class FileCMap(CMap): def add_code2cid(self, code, cid): @@ -182,8 +171,6 @@ class FileCMap(CMap): return -## FileUnicodeMap -## class FileUnicodeMap(UnicodeMap): def add_cid2unichr(self, cid, code): @@ -201,8 +188,6 @@ class FileUnicodeMap(UnicodeMap): return -## PyCMap -## class PyCMap(CMap): def __init__(self, name, module): @@ -213,8 +198,6 @@ class PyCMap(CMap): return -## PyUnicodeMap -## class PyUnicodeMap(UnicodeMap): def __init__(self, name, module, vertical): @@ -227,8 +210,6 @@ class PyUnicodeMap(UnicodeMap): return -## CMapDB -## class CMapDB(object): _cmap_cache = {} @@ -284,8 +265,6 @@ class CMapDB(object): return umaps[vertical] -## CMapParser -## class CMapParser(PSStackParser): def __init__(self, cmap, fp): @@ -373,7 +352,6 @@ class CMapParser(PSStackParser): s1 = nunpack(svar) e1 = nunpack(evar) vlen = len(svar) - #assert s1 <= e1, str((s1, e1)) for i in range(e1-s1+1): x = sprefix+struct.pack('>L', s1+i)[-vlen:] self.cmap.add_code2cid(x, cid+i) @@ -400,7 +378,6 @@ class CMapParser(PSStackParser): continue s1 = nunpack(s) e1 = nunpack(e) - #assert s1 <= e1, str((s1, e1)) if isinstance(code, list): for i in range(e1-s1+1): self.cmap.add_cid2unichr(s1+i, code[i]) @@ -435,17 +412,16 @@ class CMapParser(PSStackParser): return -# test def main(argv): args = argv[1:] for fname in args: fp = open(fname, 'rb') cmap = FileUnicodeMap() - #cmap = FileCMap() CMapParser(cmap, fp).run() fp.close() cmap.dump() return + if __name__ == '__main__': sys.exit(main(sys.argv)) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 4bfd6ed..e94b383 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -35,7 +35,6 @@ from .utils import nunpack log = logging.getLogger(__name__) - def get_widths(seq): widths = {} r = [] @@ -54,10 +53,6 @@ def get_widths(seq): widths[i] = w r = [] return widths -#assert get_widths([1]) == {} -#assert get_widths([1,2,3]) == {1:3, 2:3} -#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8} - def get_widths2(seq): widths = {} @@ -77,13 +72,8 @@ def get_widths2(seq): widths[i] = (w, (vx, vy)) r = [] return widths -#assert get_widths2([1]) == {} -#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))} -#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))} -## FontMetricsDB -## class FontMetricsDB(object): @classmethod @@ -91,8 +81,6 @@ class FontMetricsDB(object): return FONT_METRICS[fontname] -## Type1FontHeaderParser -## class Type1FontHeaderParser(PSStackParser): KEYWORD_BEGIN = KWD(b'begin') @@ -142,6 +130,10 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') +""" +Note: DLIdent-* isn't found in PDF Reference but is been kept as +it is harmless and have possibility of been a type. (induced from bug report/PR) +""" IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'Identity-V':'Identity-V', 'DLIdent-H':'Identity-H', @@ -150,10 +142,6 @@ IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'OneByteIdentityV':'OneByteIdentityV', } -## CFFFont -## (Format specified in Adobe Technical Note: #5176 -## "The Compact Font Format Specification") -## def getdict(data): d = {} fp = BytesIO(data) @@ -281,6 +269,7 @@ class CFFFont(object): 'Light', 'Medium', 'Regular', 'Roman', 'Semibold', ) + class INDEX(object): def __init__(self, fp): @@ -381,9 +370,6 @@ class CFFFont(object): assert False, str(('Unhandled', format)) else: raise ValueError('unsupported charset format: %r' % format) - #print self.code2gid - #print self.name2gid - #assert 0 return def getstr(self, sid): @@ -392,8 +378,6 @@ class CFFFont(object): return self.string_index[sid-len(self.STANDARD_STRINGS)] -## TrueTypeFont -## class TrueTypeFont(object): class CMapNotFound(Exception): @@ -479,8 +463,6 @@ class TrueTypeFont(object): return unicode_map -## Fonts -## class PDFFontError(PDFException): pass @@ -492,7 +474,6 @@ LITERAL_STANDARD_ENCODING = LIT('StandardEncoding') LITERAL_TYPE1C = LIT('Type1C') -# PDFFont class PDFFont(object): def __init__(self, descriptor, widths, default_width=None): @@ -557,7 +538,6 @@ class PDFFont(object): return sum(self.char_width(cid) for cid in self.decode(s)) -# PDFSimpleFont class PDFSimpleFont(PDFFont): def __init__(self, descriptor, widths, spec): @@ -594,7 +574,6 @@ class PDFSimpleFont(PDFFont): raise PDFUnicodeNotDefined(None, cid) -# PDFType1Font class PDFType1Font(PDFSimpleFont): def __init__(self, rsrcmgr, spec): @@ -626,14 +605,12 @@ class PDFType1Font(PDFSimpleFont): return '' % self.basefont -# PDFTrueTypeFont class PDFTrueTypeFont(PDFType1Font): def __repr__(self): return '' % self.basefont -# PDFType3Font class PDFType3Font(PDFSimpleFont): def __init__(self, rsrcmgr, spec): @@ -656,7 +633,6 @@ class PDFType3Font(PDFSimpleFont): return '' -# PDFCIDFont class PDFCIDFont(PDFFont): def __init__(self, rsrcmgr, spec, strict=settings.STRICT): @@ -721,9 +697,9 @@ class PDFCIDFont(PDFFont): """ For certain PDFs, Encoding Type isn't mentioned as an attribute of Encoding but as an attribute of CMapName, where CMapName is an - attribure of spec['Encoding']. - The horizaontal/vertical modes are mentioned with diffrent name - such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V' + attribute of spec['Encoding']. + The horizontal/vertical modes are mentioned with different name + such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'. """ try: spec_encoding = spec['Encoding'] @@ -771,16 +747,14 @@ class PDFCIDFont(PDFFont): except KeyError: raise PDFUnicodeNotDefined(self.cidcoding, cid) - -# main def main(argv): for fname in argv[1:]: fp = open(fname, 'rb') - #font = TrueTypeFont(fname, fp) font = CFFFont(fname, fp) print (font) fp.close() return + if __name__ == '__main__': sys.exit(main(sys.argv)) From abd685fdc6853eac2df7b01a65f98f5264db6f08 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Tue, 20 Aug 2019 17:13:27 +0530 Subject: [PATCH 12/13] Corrects Code Comment --- pdfminer/pdffont.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index e94b383..5217071 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -130,9 +130,9 @@ class Type1FontHeaderParser(PSStackParser): NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') -""" -Note: DLIdent-* isn't found in PDF Reference but is been kept as -it is harmless and have possibility of been a type. (induced from bug report/PR) + +#Note: DLIdent-* isn't found in PDF Reference but is been kept as +#it is harmless and have possibility of been a type. (induced from bug report/PR) """ IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'Identity-V':'Identity-V', From 7c03d96d25c2a06a5cec4f2506d8bf36f3441158 Mon Sep 17 00:00:00 2001 From: Fakabbir Amin Date: Tue, 20 Aug 2019 17:16:10 +0530 Subject: [PATCH 13/13] Corrects Comment --- pdfminer/pdffont.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 5217071..be9ef8b 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -133,7 +133,6 @@ NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', Non #Note: DLIdent-* isn't found in PDF Reference but is been kept as #it is harmless and have possibility of been a type. (induced from bug report/PR) -""" IDENTITY_ENCODER = {'Identity-H':'Identity-H', 'Identity-V':'Identity-V', 'DLIdent-H':'Identity-H',