Use charset-normalizer instead of chardet (#744)

* Use charset-normalizer instead of chardet

* Ignore charset_normalizer type stub

* Add CHANGELOG.md
pull/749/head
Pieter Marsman 2022-04-20 21:42:50 +02:00 committed by GitHub
parent 617e4c8388
commit 1bf3c42b59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 13 additions and 6 deletions

View File

@@ -19,6 +19,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Exporting images without any specific encoding ([#737](https://github.com/pdfminer/pdfminer.six/pull/737))
### Changed
- Using charset-normalizer instead of chardet for a less restrictive license ([#744](https://github.com/pdfminer/pdfminer.six/pull/744))
## [20220319]
### Added

View File

@@ -23,8 +23,11 @@ ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
[mypy-setuptools]
[mypy-setuptools.*]
ignore_missing_imports = True
[mypy-nox]
[mypy-nox.*]
ignore_missing_imports = True
[mypy-charset_normalizer.*]
ignore_missing_imports = True

View File

@@ -28,7 +28,7 @@ from typing import (
if TYPE_CHECKING:
from .layout import LTComponent
import chardet # For str encoding detection
import charset_normalizer # For str encoding detection
# `from sys import maxint as INF` no longer works under Python 3, but PDF
# still uses 32-bit ints
@@ -75,7 +75,7 @@ def make_compat_bytes(in_str: str) -> bytes:
def make_compat_str(o: object) -> str:
"""Converts everything to string, if bytes guessing the encoding."""
if isinstance(o, bytes):
enc = chardet.detect(o)
enc = charset_normalizer.detect(o)
try:
return o.decode(enc["encoding"])
except UnicodeDecodeError:

View File

@@ -17,8 +17,8 @@ setup(
packages=["pdfminer"],
package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]},
install_requires=[
'chardet ; python_version > "3.0"',
"cryptography",
"charset-normalizer~=2.0.0",
"cryptography~=36.0.0",
],
extras_require={
"dev": ["pytest", "nox", "black", "mypy == 0.931"],