Initial commit

2023-08-07 18:10:10 +06:00 · 2023-08-07 18:10:10 +06:00 · ec5b4ea6ee
commit ec5b4ea6ee
1535 changed files with 467951 additions and 0 deletions
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 88
+extend-ignore = 
+    # See https://github.com/PyCQA/pycodestyle/issues/373
+    E203,
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@ -0,0 +1,20 @@
+---
+name: Bug report
+about: Report a bug
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+**Bug report**
+
+Thanks for finding the bug! To help us fix it, please make sure that you 
+include the following information:
+
+- A description of the bug
+- Steps to reproduce the bug. Try to minimize the number of steps needed. 
+  Include the command and/or script that you use. Also include the PDF that 
+  you use.
+- If relevant, include the output and/or error stacktrace. 
+
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,18 @@
+---
+name: Feature request
+about: Request a new feature
+title: ''
+labels: enhancement
+assignees: ''
+
+---
+
+**Feature request**
+
+Thanks for your suggestion on improving pdfminer.six. To help us discuss and
+implement this request, please make sure to include the following information:
+
+- A description of the feature you would like to have
+- If relevant, the context that you are in. What are you trying to achieve?
+- If possible, an example of what you want to achieve. Include the PDF that
+  you are working on. Include the output that you would like to have. 
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,15 @@
+**Pull request**
+
+Please *remove* this paragraph and replace it with a description of your PR. Also include the issue that it fixes. 
+
+**How Has This Been Tested?**
+
+Please *remove* this paragraph and replace it with a description of how this PR has been tested.
+
+**Checklist**
+
+- [ ] I have read [CONTRIBUTING.md](../CONTRIBUTING.md). 
+- [ ] I have added a concise human-readable description of the change to [CHANGELOG.md](../CHANGELOG.md).
+- [ ] I have tested that this fix is effective or that this feature works.
+- [ ] I have added docstrings to newly created methods and classes.
+- [ ] I have updated the [README.md](../README.md) and the [readthedocs](../docs/source) documentation. Or verified that this is not necessary.
--- a/.github/workflows/actions.yml
+++ b/.github/workflows/actions.yml
@ -0,0 +1,164 @@
+name: Continuous integration
+
+on:
+  push:  # run when commits are pushed to master or a version tag is pushed
+    branches:
+      - master
+    tags:
+      - '[0-9]+'  # match version tags that consist of digits only
+  pull_request:  # run on pull requests that target master
+    branches:
+      - master
+
+env:
+  default-python: "3.10"  # interpreter for the non-matrix jobs; quoted so it stays the string "3.10"
+
+jobs:
+
+  check-code-formatting:  # runs the nox "format" session on the default Python version
+    name: Check code formatting
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ env.default-python }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.default-python }}
+      - name: Upgrade pip, Install nox
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install nox
+      - name: Check code formatting  # fails instead of skipping when an interpreter is missing
+        run: |
+          nox --error-on-missing-interpreters --non-interactive --session format
+
+  check-coding-style:  # runs the nox "lint" session on the default Python version
+    name: Check coding style
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ env.default-python }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.default-python }}  # workflow-level env value
+      - name: Upgrade pip, Install nox
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install nox
+      - name: Check coding style  # fails instead of skipping when an interpreter is missing
+        run: |
+          nox --error-on-missing-interpreters --non-interactive --session lint
+
+  check-static-types:  # runs the nox "types" session on the default Python version
+    name: Check static types
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ env.default-python }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.default-python }}  # workflow-level env value
+      - name: Upgrade pip, Install nox
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install nox
+      - name: Check static types  # fails instead of skipping when an interpreter is missing
+        run: |
+          nox --error-on-missing-interpreters --non-interactive --session types
+
+  tests:  # runs the matching nox "tests-<version>" session once per matrix entry
+    name: Run tests
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ ubuntu-latest ]
+        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10" ]  # quoted so "3.10" is not read as 3.1
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Determine pip cache directory
+        id: pip-cache  # read by the cache step as steps.pip-cache.outputs.dir
+        run: |
+          echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"
+      - name: Cache pip cache  # one cache entry per runner OS and Python version
+        uses: actions/cache@v4
+        with:
+          path: ${{ steps.pip-cache.outputs.dir }}
+          key: ${{ runner.os }}-pip${{ matrix.python-version }}
+      - name: Upgrade pip and install nox
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install nox
+      - name: Run tests
+        run: |
+          nox --non-interactive --session tests-${{ matrix.python-version }}
+
+  build-docs:  # runs the nox "docs" session on the default Python version
+    name: Test building docs
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Set up Python ${{ env.default-python }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ env.default-python }}  # workflow-level env value
+      - name: Upgrade pip and install nox
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install nox
+      - name: Build docs  # fails instead of skipping when an interpreter is missing
+        run: |
+          nox --error-on-missing-interpreters --non-interactive --session docs
+
+  publish:  # builds the package on every run; the two upload steps only run for tag refs
+    name: Publish to PyPi
+    runs-on: ubuntu-latest
+    needs:  # only start after every check job has succeeded
+      - check-code-formatting
+      - check-coding-style
+      - check-static-types
+      - tests
+      - build-docs
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Install dependencies  # NOTE(review): this job has no setup-python step, so the runner's preinstalled Python is used — confirm
+        run: python -m pip install wheel
+      - name: Set version  # tag refs: tag name with any leading "v" stripped; other refs: a date.time stamp; replaces __VERSION__ in pdfminer/__init__.py
+        run: |
+          if [[ "${{ github.ref }}" == "refs/tags/"* ]]
+          then
+            VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,' | sed -e 's/^v//')
+          else
+            VERSION=$(date +%Y%m%d).$(date +%H%M%S)
+          fi
+          echo ${VERSION}
+          sed -i "s/__VERSION__/${VERSION}/g" pdfminer/__init__.py
+      - name: Build package  # source distribution and wheel
+        run: python setup.py sdist bdist_wheel
+      - name: Generate changelog  # keeps only the lines between the first and second "## [" headings of CHANGELOG.md
+        run: sed '1,/## \[/d;/## \[/Q' CHANGELOG.md > ${{ github.workspace }}-CHANGELOG.md
+      - name: Publish package to PyPi
+        if: startsWith(github.ref, 'refs/tags')  # skipped unless the run was triggered by a tag
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}
+      - name: Create GitHub release
+        if: startsWith(github.ref, 'refs/tags')  # skipped unless the run was triggered by a tag
+        uses: softprops/action-gh-release@v1
+        id: create_release
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
+          body_path: ${{ github.workspace }}-CHANGELOG.md  # file written by the "Generate changelog" step
+          files: |
+            dist/*.tar.gz
+            dist/*.whl
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,28 @@
+*.class
+*.pyc
+*.pyo
+.svn
+.env
+_svn
+.pythoscope
+.ipynb_checkpoints
+.settings
+_update.bat
+docs/_build
+/Goulib.egg-info/
+/build/
+/dist/
+/pdfminer.six.egg-info/
+tests/*.xml
+tests/*.txt
+.idea/
+.tox/
+.nox/
+
+# python venv management tools
+Pipfile
+Pipfile.lock
+.noseids
+.vscode/
+pyproject.toml
+poetry.lock
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,256 @@
+# Changelog
+All notable changes in pdfminer.six will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
+
+## [Unreleased]
+
+### Added
+
+- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
+- Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790))
+- Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829))
+- Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801))
+
+### Fixed
+
+- `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
+- `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766))
+- Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760))
+- `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720))
+- Installing typing-extensions on Python 3.6 and 3.7 ([#775](https://github.com/pdfminer/pdfminer.six/pull/775))
+- `TypeError` in cmapdb.py when parsing null characters ([#768](https://github.com/pdfminer/pdfminer.six/pull/768))
+- Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794))
+- `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827))
+- Small typos and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828))
+
+### Deprecated
+
+- Usage of `if __name__ == "__main__"` where it was only intended for testing purposes ([#756](https://github.com/pdfminer/pdfminer.six/pull/756))
+
+## [20220524]
+
+### Fixed
+
+- Ignoring (invalid) path constructors that do not begin with `m` ([#749](https://github.com/pdfminer/pdfminer.six/pull/749))
+
+### Changed
+
+- Removed upper version bounds ([#755](https://github.com/pdfminer/pdfminer.six/pull/755))
+
+## [20220506]
+
+### Fixed
+
+- `IndexError` when handling invalid bfrange code map in
+  CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731))
+- `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
+- `TypeError` in encodingdb.py when name of unicode is not
+  str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
+- `TypeError` in HTMLConverter when using a bytes fontname ([#734](https://github.com/pdfminer/pdfminer.six/pull/734))
+
+### Added
+
+- Exporting images without any specific encoding ([#737](https://github.com/pdfminer/pdfminer.six/pull/737))
+
+### Changed
+
+- Using charset-normalizer instead of chardet for less restrictive license ([#744](https://github.com/pdfminer/pdfminer.six/pull/744))
+
+## [20220319]
+
+### Added
+
+- Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
+- Support for identity cmaps ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
+- Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
+- Installation of Pillow as an optional extra dependency ([#714](https://github.com/pdfminer/pdfminer.six/pull/714))
+
+### Fixed
+
+- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
+- Regression (since 20191107) in `LTLayoutContainer.group_textboxes` that returned some text lines out of order ([#659](https://github.com/pdfminer/pdfminer.six/pull/659))
+- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
+- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
+- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
+- Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684))
+- Ignore empty characters when analyzing layout ([#499](https://github.com/pdfminer/pdfminer.six/pull/499))
+
+### Changed
+- Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
+- Switched from nose to pytest, from tox to nox and from Travis CI to GitHub Actions ([#704](https://github.com/pdfminer/pdfminer.six/pull/704))
+
+### Removed
+- Unnecessary return statements without argument at the end of functions ([#707](https://github.com/pdfminer/pdfminer.six/pull/707))
+
+## [20211012]
+
+### Added
+- Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614))
+- Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537))
+- Type annotations ([#661](https://github.com/pdfminer/pdfminer.six/pull/661))
+
+### Fixed
+- `KeyError` when `'Encrypt'` but not `'ID'` present in `trailer` ([#594](https://github.com/pdfminer/pdfminer.six/pull/594))
+- Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
+- Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
+- Fix `PermissionError` when creating temporary filepaths on windows when running tests ([#484](https://github.com/pdfminer/pdfminer.six/pull/484))
+- Fix `AttributeError` when dumping a TOC with bytes destinations ([#600](https://github.com/pdfminer/pdfminer.six/pull/600))
+- Fix issue of some Chinese characters can not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
+- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
+- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
+- Raising `UnboundLocalError` when a bad `--output-type`  is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
+- `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
+- Using `io.TextIOBase` as the file to write to ([#616](https://github.com/pdfminer/pdfminer.six/pull/616))
+- Parsing \r\n after the escape character in a literal string ([#616](https://github.com/pdfminer/pdfminer.six/pull/616))
+
+### Removed
+- Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
+- Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525))
+- Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523))
+- Dependency on typing-extensions introduced by [#661](https://github.com/pdfminer/pdfminer.six/pull/661) ([#677](https://github.com/pdfminer/pdfminer.six/pull/677))
+
+## [20201018]
+
+### Deprecated
+- Support for Python 3.4 and 3.5 ([#507](https://github.com/pdfminer/pdfminer.six/pull/507))
+
+### Added
+
+- Option to disable boxes flow layout analysis when using pdf2txt ([#479](https://github.com/pdfminer/pdfminer.six/pull/479))
+- Support for `pathlib.PurePath` in `open_filename` ([#492](https://github.com/pdfminer/pdfminer.six/pull/492))
+
+### Fixed
+- Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475))
+- Fix `.paint_path` logic for handling non-rect quadrilaterals and decomposing complex paths ([#512](https://github.com/pdfminer/pdfminer.six/pull/512))
+- Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483))
+
+### Removed
+- Remove unused rijndael encryption implementation ([#465](https://github.com/pdfminer/pdfminer.six/pull/465))
+
+## [20200726]
+
+### Fixed
+- Rename PDFTextExtractionNotAllowedError to PDFTextExtractionNotAllowed to revert breaking change ([#461](https://github.com/pdfminer/pdfminer.six/pull/461))
+- Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
+
+## [20200720]
+
+### Added
+- Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
+
+### Fixed
+- Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
+
+### Changed
+- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
+- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#453](https://github.com/pdfminer/pdfminer.six/pull/453))
+- Switched from pycryptodome to cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))
+  
+## [20200517]
+
+### Added
+- Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408))
+
+### Fixed
+- Fix ordering of textlines within a textbox when `boxes_flow=None` ([#412](https://github.com/pdfminer/pdfminer.six/pull/412))
+
+## [20200402]
+
+### Added
+- Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#396](https://github.com/pdfminer/pdfminer.six/pull/396))
+- Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#393](https://github.com/pdfminer/pdfminer.six/pull/393))
+
+### Fixed
+- Text no longer comes in reverse order when advanced layout analysis is disabled ([#399](https://github.com/pdfminer/pdfminer.six/pull/399))
+- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
+- Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
+- Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
+
+### Changed
+- Group text lines if they are centered ([#384](https://github.com/pdfminer/pdfminer.six/pull/384))
+
+## [20200124]
+
+### Security
+- Removed samples/issue-00152-embedded-pdf.pdf because it contains a possible security threat: a JavaScript-enabled object ([#364](https://github.com/pdfminer/pdfminer.six/pull/364))
+
+## [20200121]
+
+### Fixed
+- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
+- Fix font name in html output such that it is recognized by browser ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
+- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
+- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
+
+### Removed
+- The command-line utility latin2ascii.py ([#360](https://github.com/pdfminer/pdfminer.six/pull/360))
+
+## [20200104]
+
+### Removed
+- Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346))
+
+### Changed
+- Enforce pep8 coding style by adding flake8 to CI ([#345](https://github.com/pdfminer/pdfminer.six/pull/345))
+
+## [20191110]
+
+### Fixed
+- Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335))
+
+## [20191107]
+
+### Deprecated
+- The argument `_py2_no_more_posargs` because Python 2 support is removed on
+  January 1st, 2020 ([#328](https://github.com/pdfminer/pdfminer.six/pull/328) and
+  [#307](https://github.com/pdfminer/pdfminer.six/pull/307))
+
+### Added
+- Simple wrapper to easily extract text from a PDF file [#330](https://github.com/pdfminer/pdfminer.six/pull/330)
+- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
+- Sphinx documentation that is published on 
+  [Read the Docs](https://pdfminersix.readthedocs.io/)
+  ([#329](https://github.com/pdfminer/pdfminer.six/pull/329))
+
+### Fixed
+- Unhandled AssertionError when dumping pdf containing reference to object id 0 
+ ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
+- Debug flag actually changes logging level to debug for pdf2txt.py and
+ dumppdf.py ([#325](https://github.com/pdfminer/pdfminer.six/pull/325))
+
+### Changed
+- Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
+- Refactor `LTLayoutContainer.group_textboxes` for a significant speed up in layout analysis ([#315](https://github.com/pdfminer/pdfminer.six/pull/315))
+
+### Removed
+- Files for external applications such as django, cgi and pyinstaller ([#320](https://github.com/pdfminer/pdfminer.six/pull/320))
+
+## [20191020]
+
+### Deprecated
+- Support for Python 2 is dropped at January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307))
+
+### Added
+- Contribution guidelines in [CONTRIBUTING.md](CONTRIBUTING.md) ([#259](https://github.com/pdfminer/pdfminer.six/pull/259))
+- Support new encodings OneByteEncoding and DLIdent for CMaps ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
+
+### Fixed
+- Use `six.iteritems()` instead of `dict().iteritems()` to ensure Python2 and Python3 compatibility ([#274](https://github.com/pdfminer/pdfminer.six/pull/274))
+- Properly convert Adobe Glyph names to unicode characters ([#263](https://github.com/pdfminer/pdfminer.six/pull/263))
+- Allow CMap to be a content stream ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
+- Resolve indirect objects for width and bounding boxes for fonts ([#273](https://github.com/pdfminer/pdfminer.six/pull/273))
+- Actually updating stroke color in graphic state ([#298](https://github.com/pdfminer/pdfminer.six/pull/298))
+- Interpret (invalid) negative font descent as a positive descent ([#203](https://github.com/pdfminer/pdfminer.six/pull/203))
+- Correct colorspace comparison for images ([#132](https://github.com/pdfminer/pdfminer.six/pull/132))
+- Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))
+
+### Changed
+- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
+
+## [20181108]
+
+### Changed
+- Speedup layout analysis ([#141](https://github.com/pdfminer/pdfminer.six/pull/141))
+- Use argparse instead of the deprecated getopt ([#173](https://github.com/pdfminer/pdfminer.six/pull/173))
+- Allow pdfminer.six to be compiled with cython ([#142](https://github.com/pdfminer/pdfminer.six/pull/142))
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,75 @@
+# Contributing guidelines
+
+Any contribution is appreciated! You might want to:
+
+* Fix spelling errors
+* Improve documentation
+* Add tests for untested code
+* Add new features
+* Fix bugs
+
+## How can I contribute?
+
+* Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and request features
+    - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
+     issue. 
+* Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request).
+* Help others by sharing your thoughts in comments on issues and pull requests.
+* Join the chat on [gitter](https://gitter.im/pdfminer-six/Lobby)
+
+## Guidelines for creating issues
+
+* Search previous issues, as yours might be a duplicate.
+* When creating a new issue for a bug, include a minimal reproducible example.
+* When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This
+  will help others to see the importance of your feature request. 
+
+## Guidelines for creating pull requests
+
+* A pull request should close an existing issue. For example, use "Fix #123" to indicate that your PR fixes issue 123. 
+* Pull requests should be merged to master.
+* Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
+  of features, this will show that your code works correctly.
+* Code should work for Python 3.6+.
+* Test your code by using nox (see below). 
+* New features should be well documented using docstrings.
+* Check if the [README.md](README.md) or [readthedocs](docs/source) documentation needs to be updated.
+* Check spelling and grammar.
+* Don't forget to update the [CHANGELOG.md](CHANGELOG.md#unreleased).
+
+## Guidelines for posting comments
+
+* [Be cordial and positive](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way)
+
+## Guidelines for publishing
+
+* Publishing is automated. Add a YYYYMMDD version tag and GitHub workflows will do the rest. 
+
+## Getting started
+
+1. Clone the repository
+
+    ```sh
+    git clone https://github.com/pdfminer/pdfminer.six
+    cd pdfminer.six
+    ```
+
+2. Install dev dependencies
+
+    ```sh
+    pip install -e .[dev]
+    ```
+
+3. Run the tests
+
+    On all Python versions:
+
+    ```sh
+    nox
+    ```
+
+    Or on a single Python version:
+
+    ```sh
+    nox -e py36
+    ```
--- a/22
+++ b/22
@ -0,0 +1,22 @@
+Copyright (c) 2004-2016  Yusuke Shinyama <yusuke at shinyama dot jp>
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,11 @@
+include Makefile
+include LICENSE
+include *.txt
+include *.md
+include *.py
+graft cmaprsrc
+graft pdfminer
+graft tools
+global-exclude *.pyc
+prune samples
+prune docs
--- a/29
+++ b/29
@ -0,0 +1,29 @@
+##  Makefile (for maintenance purpose)
+##
+##  Regenerates the to-unicode pickle files under $(CMAPDST) from the
+##  cid2code tables under $(CMAPSRC) by running tools/conv_cmap.py.
+##
+
+PYTHON=python
+RM=rm -f
+CP=cp -f
+MKDIR=mkdir
+
+CONV_CMAP=$(PYTHON) tools/conv_cmap.py
+CMAPSRC=cmaprsrc
+CMAPDST=pdfminer/cmap
+
+# cmap and cmap_clean name actions, not files, so a stray file with one of
+# these names must never make the target look up to date.
+.PHONY: cmap cmap_clean
+
+# Build the to-unicode table of every character collection listed below.
+cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
+	$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
+# Remove the whole output directory; the leading '-' ignores a failing rm.
+cmap_clean:
+	-$(RM) -r $(CMAPDST)
+# Create the output directory.
+$(CMAPDST):
+	$(MKDIR) $(CMAPDST)
+# Each table is rebuilt when its cid2code source changes. The output directory
+# is an order-only prerequisite (after '|'): it has to exist, but its
+# timestamp, which changes whenever a file is written into it, must not mark
+# already-built tables as out of date.
+$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPSRC)/cid2code_Adobe_CNS1.txt | $(CMAPDST)
+	$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
+		$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
+$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPSRC)/cid2code_Adobe_GB1.txt | $(CMAPDST)
+	$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
+		$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
+$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPSRC)/cid2code_Adobe_Japan1.txt | $(CMAPDST)
+	$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
+		$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
+$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPSRC)/cid2code_Adobe_Korea1.txt | $(CMAPDST)
+	$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
+		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
--- a/README.md
+++ b/README.md
@ -0,0 +1,72 @@
+pdfminer.six
+============
+
+[![Continuous integration](https://github.com/pdfminer/pdfminer.six/actions/workflows/actions.yml/badge.svg)](https://github.com/pdfminer/pdfminer.six/actions/workflows/actions.yml)
+[![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/)
+[![gitter](https://badges.gitter.im/pdfminer-six/Lobby.svg)](https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium)
+
+*We fathom PDF*
+
+Pdfminer.six is a community maintained fork of the original PDFMiner. It is a tool for extracting information from PDF
+documents. It focuses on getting and analyzing text data. Pdfminer.six extracts the text from a page directly from the
+source code of the PDF. It can also be used to get the exact location, font or color of the text.
+
+It is built in a modular way such that each component of pdfminer.six can be replaced easily. You can implement your own
+interpreter or rendering device that uses the power of pdfminer.six for other purposes than text analysis.
+
+Check out the full documentation on
+[Read the Docs](https://pdfminersix.readthedocs.io).
+
+
+Features
+--------
+
+* Written entirely in Python.
+* Parse, analyze, and convert PDF documents.
+* Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
+* PDF-1.7 specification support (well, almost).
+* CJK languages and vertical writing scripts support.
+* Various font types (Type1, TrueType, Type3, and CID) support.
+* Support for extracting images (JPG, JBIG2, Bitmaps).
+* Support for various compressions (ASCIIHexDecode, ASCII85Decode, LZWDecode, FlateDecode, RunLengthDecode,
+  CCITTFaxDecode)
+* Support for RC4 and AES encryption.
+* Support for AcroForm interactive form extraction.
+* Table of contents extraction.
+* Tagged contents extraction.
+* Automatic layout analysis.
+
+How to use
+----------
+
+* Install Python 3.6 or newer.
+* Install pdfminer.six.
+
+  `pip install pdfminer.six`
+
+* (Optionally) install extra dependencies for extracting images.
+
+  `pip install 'pdfminer.six[image]'`
+
+* Use the command-line interface to extract text from pdf.
+
+  `pdf2txt.py example.pdf`
+
+* Or use it with Python. 
+
+```python
+from pdfminer.high_level import extract_text
+
+text = extract_text("example.pdf")
+print(text)
+```
+
+Contributing
+------------
+
+Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). 
+
+Acknowledgement
+---------------
+
+This repository includes code from `pyHanko`; the original license has been included [here](/docs/licenses/LICENSE.pyHanko).
--- a/cmaprsrc/README.txt
+++ b/cmaprsrc/README.txt
@ -0,0 +1,60 @@
+README.txt for cmaprsrc
+
+This directory contains Adobe CMap resources. CMaps are required 
+to decode text data written in CJK (Chinese, Japanese, Korean) language.
+CMap resources are now available freely from Adobe web site:
+http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
+
+The following files were extracted from the downloadable tarballs:
+
+cid2code_Adobe_CNS1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
+
+cid2code_Adobe_GB1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
+
+cid2code_Adobe_Japan1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
+
+cid2code_Adobe_Korea1.txt:
+	http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
+
+
+Here is the license information in the original files:
+
+%%Copyright: -----------------------------------------------------------
+%%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
+%%Copyright: All rights reserved.
+%%Copyright:
+%%Copyright: Redistribution and use in source and binary forms, with or
+%%Copyright: without modification, are permitted provided that the
+%%Copyright: following conditions are met:
+%%Copyright:
+%%Copyright: Redistributions of source code must retain the above
+%%Copyright: copyright notice, this list of conditions and the following
+%%Copyright: disclaimer.
+%%Copyright:
+%%Copyright: Redistributions in binary form must reproduce the above
+%%Copyright: copyright notice, this list of conditions and the following
+%%Copyright: disclaimer in the documentation and/or other materials
+%%Copyright: provided with the distribution.
+%%Copyright:
+%%Copyright: Neither the name of Adobe Systems Incorporated nor the names
+%%Copyright: of its contributors may be used to endorse or promote
+%%Copyright: products derived from this software without specific prior
+%%Copyright: written permission.
+%%Copyright:
+%%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+%%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+%%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+%%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+%%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+%%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+%%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+%%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+%%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+%%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+%%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+%%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+%%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%%Copyright: -----------------------------------------------------------
--- a/cmaprsrc/cid2code_Adobe_CNS1.txt
+++ b/cmaprsrc/cid2code_Adobe_CNS1.txt
--- a/cmaprsrc/cid2code_Adobe_GB1.txt
+++ b/cmaprsrc/cid2code_Adobe_GB1.txt
--- a/cmaprsrc/cid2code_Adobe_Japan1.txt
+++ b/cmaprsrc/cid2code_Adobe_Japan1.txt
--- a/cmaprsrc/cid2code_Adobe_Korea1.txt
+++ b/cmaprsrc/cid2code_Adobe_Korea1.txt
--- a/docs/.gitignore
+++ b/docs/.gitignore
@ -0,0 +1 @@
+build/
--- a/docs/Makefile
+++ b/docs/Makefile
@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/licenses/LICENSE.pyHanko
+++ b/docs/licenses/LICENSE.pyHanko
@ -0,0 +1,23 @@
+This package contains various elements based on code from the pyHanko project, of which we reproduce the license below.
+
+MIT License
+
+Copyright (c) 2020 Matthias Valvekens 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/docs/make.bat
+++ b/docs/make.bat
@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -0,0 +1 @@
+sphinx-argparse
--- a/docs/source/_static/layout_analysis.html
+++ b/docs/source/_static/layout_analysis.html
@ -0,0 +1,28 @@
+<style>
+    td {
+        text-align: center;
+    }
+</style>
+<table style="margin: 10px; padding: 10px;">
+    <tr>
+        <td style="text-align: right; border-right:1px red solid">&rarr;</td>
+        <td colspan="4"
+            style="text-align: left; border-left:1px red solid">&larr; <em><font
+                color="red">M</font></em></td>
+    </tr>
+    <tr>
+        <td style="border:1px solid"><code>Q u i</code></td>
+        <td style="border:1px solid"><code>c k</code></td>
+        <td width="10px"></td>
+        <td style="border:1px solid"><code>b r o w n</code></td>
+    </tr>
+    <tr>
+        <td colspan="2" style="text-align: right; border-right:1px green solid">
+            &rarr;
+        </td>
+        <td></td>
+        <td colspan="2"
+            style="text-align: left; border-left:1px green solid">&larr;
+            <em><font color="green">W</font></em></td>
+    </tr>
+</table>
--- a/docs/source/_static/layout_analysis_group_boxes.html
+++ b/docs/source/_static/layout_analysis_group_boxes.html
@ -0,0 +1,23 @@
+<style>
+    .background-blue {
+        background-color: lightblue;
+        border: 2px solid lightblue;
+    }
+</style>
+<table style="margin: 10px; padding: 10px;">
+    <tr>
+        <td style="border:1px solid; text-align: left">
+            <code>
+                Q u i c k &nbsp; b r o w n<br/> f o x
+            </code>
+        </td>
+        <td class="background-blue" colspan="3"></td>
+    </tr>
+    <tr style="height: 10px;">
+        <td class="background-blue" colspan="4"></td>
+    </tr>
+    <tr>
+        <td class="background-blue" colspan="3"></td>
+        <td style="border:1px solid"><code>j u m p s ...</code></td>
+    </tr>
+</table>
--- a/docs/source/_static/layout_analysis_group_lines.html
+++ b/docs/source/_static/layout_analysis_group_lines.html
@ -0,0 +1,45 @@
+<style>
+    td {
+        text-align: center;
+    }
+</style>
+<table style="margin: 10px; padding: 10px;">
+    <tr>
+        <td></td>
+        <td></td>
+        <td align=right style="border-bottom:1px blue solid">&darr;</td>
+        <td></td>
+    </tr>
+    <tr>
+        <td colspan="2" style="border:1px solid"><code>Q u i c k &nbsp; b r o w
+            n</code></td>
+        <td></td>
+        <td align=right style="border-bottom:1px blue solid">&darr;</td>
+    </tr>
+    <tr>
+        <td></td>
+        <td></td>
+        <td align=center valign=center><em><font color="blue">
+            L<sub>1</sub>
+        </font></em></td>
+        <td></td>
+    </tr>
+    <tr>
+        <td style="border:1px solid;">
+            <code>f o x</code>
+        </td>
+        <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+            &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+        </td>
+        <td align=right style="border-top:1px blue solid">&uarr;</td>
+        <td align=center valign=center><em><font color="blue">
+            L<sub>2</sub>
+        </font></em></td>
+    </tr>
+    <tr>
+        <td></td>
+        <td></td>
+        <td></td>
+        <td align=right style="border-top:1px blue solid">&uarr;</td>
+    </tr>
+</table>
--- a/docs/source/_static/layout_analysis_output.png
+++ b/docs/source/_static/layout_analysis_output.png
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -0,0 +1,64 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+
+import os
+import sys
+from typing import List
+
+import pdfminer
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../"))
+
+# -- Project information -----------------------------------------------------
+
+project = "pdfminer.six"
+copyright = "2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
+author = "Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
+
+# The full version, including alpha/beta/rc tags
+release = pdfminer.__version__
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinxarg.ext",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.doctest",
+]
+
+# Root rst file
+master_doc = "index"
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns: List[str] = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "alabaster"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
--- a/docs/source/faq.rst
+++ b/docs/source/faq.rst
@ -0,0 +1,68 @@
+.. _faq:
+
+Frequently asked questions
+**************************
+
+Why is it called pdfminer.six?
+==============================
+
+Pdfminer.six is a fork of the `original pdfminer created by Euske
+<https://github.com/euske>`_. Almost all of the code and architecture are in
+fact created by Euske. But, for a long time, this original pdfminer did not
+support Python 3. Until 2020 the original pdfminer only supported Python 2.
+The original goal of pdfminer.six was to add support for Python 3. This was
+done with the `six` package. The `six` package helps to write code that is
+compatible with both Python 2 and Python 3. Hence, pdfminer.six.
+
+As of 2020, pdfminer.six dropped the support for Python 2 because it was
+`end-of-life <https://www.python.org/doc/sunset-python-2/>`_. While the .six
+part is no longer applicable, we kept the name to prevent breaking changes for
+existing users.
+
+The current punchline "We fathom PDF" is a `whimsical reference
+<https://github.com/pdfminer/pdfminer.six/issues/197#issuecomment-655091942>`_
+to the six. Fathom means both deeply understanding something, and a fathom is
+also equal to six feet.
+
+How does pdfminer.six compare to other forks of pdfminer?
+==========================================================
+
+Pdfminer.six is now an independent and community-maintained package for
+extracting text from PDFs with Python. We actively fix bugs (also for PDFs
+that don't strictly follow the PDF Reference), add new features and improve
+the usability of pdfminer.six. This community separates pdfminer.six from the
+other forks of the original pdfminer. PDF as a format is very diverse and
+there are countless deviations from the official format. The only way to
+support all the PDFs out there is to have a community that actively uses and
+improves pdfminer.
+
+Since 2020, the original pdfminer is `dormant
+<https://github.com/euske/pdfminer#pdfminer>`_, and pdfminer.six is the fork
+which Euske recommends if you need an actively maintained version of pdfminer.
+
+Why are there `(cid:x)` values in the textual output?
+=====================================================
+
+One of the most common issues with pdfminer.six is that the textual output
+contains raw character id's `(cid:x)`. This is often experienced as confusing
+because the text is shown fine in a PDF viewer and other text from the same
+PDF is extracted properly.
+
+The underlying problem is that a PDF has two different representations
+of each character. Each character is mapped to a glyph that determines
+how the character is shown in a PDF viewer. And each character is also
+mapped to its unicode value that is used when copy-pasting the character.
+Some PDF's have incomplete unicode mappings and therefore it is impossible
+to convert the character to unicode. In these cases pdfminer.six defaults
+to showing the raw character id `(cid:x)`.
+
+A quick test to see if pdfminer.six should be able to do better is to
+copy-paste the text from a PDF viewer to a text editor. If the result
+is proper text, pdfminer.six should also be able to extract proper text.
+If the result is gibberish, pdfminer.six will also not be able to convert
+the characters to unicode.
+
+References: 
+
+#. `Chapter 5: Text, PDF Reference 1.7 <https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/index.html#pdf-reference>`_
+#. `Text: PDF, Wikipedia <https://en.wikipedia.org/wiki/PDF#Text>`_
--- a/docs/source/howto/acro_forms.rst
+++ b/docs/source/howto/acro_forms.rst
@ -0,0 +1,148 @@
+.. _acro_forms:
+
+How to extract AcroForm interactive form fields from a PDF using PDFMiner
+*************************************************************************
+
+Before you start, make sure you have :ref:`installed pdfminer.six<install>`.
+
+The second thing you need is a PDF with AcroForms (as found in PDF files with fillable forms or multiple choices). There are some examples of these in the GitHub repository under `samples/acroform`.
+
+Only AcroForm interactive forms are supported, XFA forms are not supported.
+
+.. code-block:: python
+
+    from pdfminer.pdfparser import PDFParser
+    from pdfminer.pdfdocument import PDFDocument
+    from pdfminer.pdftypes import resolve1
+    from pdfminer.psparser import PSLiteral, PSKeyword
+    from pdfminer.utils import decode_text    
+    
+    
+    data = {}
+ 
+ 
+    def decode_value(value):
+
+        # decode PSLiteral, PSKeyword
+        if isinstance(value, (PSLiteral, PSKeyword)):
+            value = value.name
+
+        # decode bytes
+        if isinstance(value, bytes):
+            value = decode_text(value)
+
+        return value
+
+
+    with open(file_path, 'rb') as fp:
+        parser = PDFParser(fp)
+        
+        doc = PDFDocument(parser)
+        res = resolve1(doc.catalog)
+
+        if 'AcroForm' not in res:
+            raise ValueError("No AcroForm Found")
+            
+        fields = resolve1(doc.catalog['AcroForm'])['Fields']  # may need further resolving
+
+        for f in fields:
+            field = resolve1(f)
+            name, values = field.get('T'), field.get('V')
+
+            # decode name
+            name = decode_text(name)
+
+            # resolve indirect obj
+            values = resolve1(values)
+            
+            # decode value(s)
+            if isinstance(values, list):
+                values = [decode_value(v) for v in values]
+            else:
+                values = decode_value(values)
+
+            data.update({name: values})    
+              
+            print(name, values)
+
+This code snippet will print all the fields' names and values and save them in the "data" dictionary.
+
+
+How it works:
+
+- Initialize the parser and the PDFDocument objects
+
+.. code-block:: python
+
+    parser = PDFParser(fp)
+    doc = PDFDocument(parser)
+
+- Get the Catalog
+
+  (the catalog contains references to other objects defining the document structure, see section 7.7.2 of PDF 32000-1:2008 specs: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/index.html#pdf-reference)
+
+.. code-block:: python
+
+    res = resolve1(doc.catalog)
+
+- Check if the catalog contains the AcroForm key and raise ValueError if not 
+
+  (the PDF does not contain AcroForm-type interactive forms if this key is missing in the catalog, see section 12.7.2 of PDF 32000-1:2008 specs)
+
+.. code-block:: python
+
+    if 'AcroForm' not in res:
+        raise ValueError("No AcroForm Found")
+
+- Get the field list resolving the entry in the catalog
+
+.. code-block:: python
+
+    fields = resolve1(doc.catalog['AcroForm'])['Fields']
+    for f in fields:
+        field = resolve1(f)
+
+- Get field name and field value(s)
+
+.. code-block:: python
+
+    name, values = field.get('T'), field.get('V')
+
+- Decode field name.
+
+.. code-block:: python
+
+    name = decode_text(name)
+
+- Resolve indirect field value objects
+
+.. code-block:: python
+
+    values = resolve1(values)
+
+- Call the value(s) decoding method as needed
+
+  (a single field can hold multiple values, for example, a combo box can hold more than one value at a time)
+
+.. code-block:: python
+
+    if isinstance(values, list):
+        values = [decode_value(v) for v in values]
+    else:
+        values = decode_value(values)
+        
+(the decode_value method takes care of decoding the field's value, returning a string)
+
+- Decode PSLiteral and PSKeyword field values
+
+.. code-block:: python
+
+    if isinstance(value, (PSLiteral, PSKeyword)):
+        value = value.name
+
+- Decode bytes field values
+
+.. code-block:: python
+
+    if isinstance(value, bytes):
+        value = decode_text(value)
--- a/docs/source/howto/images.rst
+++ b/docs/source/howto/images.rst
@ -0,0 +1,19 @@
+.. _images:
+
+How to extract images from a PDF
+********************************
+
+Before you start, make sure you have :ref:`installed pdfminer.six<install>`.
+The second thing you need is a PDF with images. If you don't have one,
+you can download `this research paper
+<https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf>`_
+with images of cats and dogs and save it as `example.pdf`::
+
+    $ curl https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf --output example.pdf
+
+Then run the :ref:`pdf2txt<api_pdf2txt>` command::
+
+    $ pdf2txt.py example.pdf --output-dir cats-and-dogs
+
+This command extracts all the images from the PDF and saves them into the
+`cats-and-dogs` directory.
--- a/docs/source/howto/index.rst
+++ b/docs/source/howto/index.rst
@ -0,0 +1,12 @@
+.. _howto:
+
+How-to guides
+*************
+
+How-to guides help you to solve specific problems with pdfminer.six.
+
+.. toctree::
+    :maxdepth: 1
+
+    images
+    acro_forms
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -0,0 +1,94 @@
+Welcome to pdfminer.six's documentation!
+****************************************
+
+.. image:: https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master
+    :target: https://travis-ci.org/pdfminer/pdfminer.six
+    :alt: Travis-ci build badge
+
+.. image:: https://img.shields.io/pypi/v/pdfminer.six.svg
+    :target: https://pypi.python.org/pypi/pdfminer.six/
+    :alt: PyPi version badge
+
+.. image:: https://badges.gitter.im/pdfminer-six/Lobby.svg
+    :target: https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium
+    :alt: gitter badge
+
+We fathom PDF.
+
+Pdfminer.six is a python package for extracting information from PDF documents.
+
+Check out the source on `github <https://github.com/pdfminer/pdfminer.six>`_.
+
+Content
+=======
+
+This documentation is organized into four sections (according to the `Diátaxis
+documentation framework <https://diataxis.fr>`_). The
+:ref:`tutorial` section helps you set up and use pdfminer.six for the first
+time. Read this section if this is your first time working with pdfminer.six.
+The :ref:`howto` offers specific recipes for solving common problems.
+Take a look at the :ref:`topic` if you want more background information on
+how pdfminer.six works internally. The :ref:`reference` provides
+detailed api documentation for all the common classes and functions in
+pdfminer.six.
+
+.. toctree::
+    :maxdepth: 2
+
+    tutorial/index
+    howto/index
+    topic/index
+    reference/index
+    faq
+
+
+Features
+========
+
+* Parse all objects from a PDF document into Python objects.
+* Analyze and group text in a human-readable way.
+* Extract text, images (JPG, JBIG2 and Bitmaps), table-of-contents, tagged
+  contents and more.
+* Support for (almost all) features from the PDF-1.7 specification
+* Support for Chinese, Japanese and Korean (CJK) languages as well as vertical writing.
+* Support for various font types (Type1, TrueType, Type3, and CID).
+* Support for RC4 and AES encryption.
+* Support for AcroForm interactive form extraction.
+
+
+Installation instructions
+=========================
+
+* Install Python 3.6 or newer.
+* Install pdfminer.six.
+
+::
+
+    $ pip install pdfminer.six
+
+* (Optionally) install extra dependencies for extracting images.
+
+::
+
+    $ pip install 'pdfminer.six[image]'
+
+* Use the command-line interface to extract text from pdf.
+
+::
+
+    $ pdf2txt.py example.pdf
+
+* Or use it with Python.
+
+.. code-block:: python
+
+    from pdfminer.high_level import extract_text
+
+    text = extract_text("example.pdf")
+    print(text)
+
+
+
+Contributing
+============
+
+We welcome any contributors to pdfminer.six! But, before doing anything, take
+a look at the `contribution guide
+<https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md>`_.
--- a/docs/source/reference/commandline.rst
+++ b/docs/source/reference/commandline.rst
@ -0,0 +1,25 @@
+.. _api_commandline:
+
+
+Command-line API
+****************
+
+.. _api_pdf2txt:
+
+pdf2txt.py
+==========
+
+.. argparse::
+    :module: tools.pdf2txt
+    :func: create_parser
+    :prog: python tools/pdf2txt.py
+
+.. _api_dumppdf:
+
+dumppdf.py
+==========
+
+.. argparse::
+    :module: tools.dumppdf
+    :func: create_parser
+    :prog: python tools/dumppdf.py
--- a/docs/source/reference/composable.rst
+++ b/docs/source/reference/composable.rst
@ -0,0 +1,20 @@
+.. _api_composable:
+
+Composable API
+**************
+
+.. _api_laparams:
+
+LAParams
+========
+
+.. currentmodule:: pdfminer.layout
+.. autoclass:: LAParams
+
+Todo:
+=====
+
+- `PDFDevice`
+    - `TextConverter`
+    - `PDFPageAggregator`
+- `PDFPageInterpreter`
--- a/docs/source/reference/highlevel.rst
+++ b/docs/source/reference/highlevel.rst
@ -0,0 +1,30 @@
+.. _api_highlevel:
+
+High-level functions API
+************************
+
+.. _api_extract_text:
+
+extract_text
+============
+
+.. currentmodule:: pdfminer.high_level
+.. autofunction:: extract_text
+
+
+.. _api_extract_text_to_fp:
+
+extract_text_to_fp
+==================
+
+.. currentmodule:: pdfminer.high_level
+.. autofunction:: extract_text_to_fp
+
+
+.. _api_extract_pages:
+
+extract_pages
+=============
+
+.. currentmodule:: pdfminer.high_level
+.. autofunction:: extract_pages
--- a/docs/source/reference/index.rst
+++ b/docs/source/reference/index.rst
@ -0,0 +1,11 @@
+.. _reference:
+
+API Reference
+*************
+
+.. toctree::
+    :maxdepth: 2
+
+    commandline
+    highlevel
+    composable
--- a/docs/source/topic/converting_pdf_to_text.rst
+++ b/docs/source/topic/converting_pdf_to_text.rst
@ -0,0 +1,131 @@
+.. _topic_pdf_to_text:
+
+Converting a PDF file to text
+*****************************
+
+Most PDF files look like they contain well-structured text. But the reality is
+that a PDF file does not contain anything that resembles paragraphs,
+sentences or even words. When it comes to text, a PDF file is only aware of
+the characters and their placement.
+
+This makes extracting meaningful pieces of text from PDF files difficult.
+The characters that compose a paragraph are no different from those that
+compose the table, the page footer or the description of a figure. Unlike
+other document formats, like a `.txt` file or a word document, the PDF format
+does not contain a stream of text.
+
+A PDF document consists of a collection of objects that together describe
+the appearance of one or more pages, possibly accompanied by additional
+interactive elements and higher-level application data. A PDF file contains
+the objects making up a PDF document along with associated structural
+information, all represented as a single self-contained sequence of bytes. [1]_
+
+.. _topic_pdf_to_text_layout:
+
+Layout analysis algorithm
+=========================
+
+PDFMiner attempts to reconstruct some of those structures by using heuristics
+on the positioning of characters. This works well for sentences and
+paragraphs because meaningful groups of nearby characters can be made.
+
+The layout analysis consists of three different stages: it groups characters
+into words and lines, then it groups lines into boxes and finally it groups
+textboxes hierarchically. These stages are discussed in the following
+sections. The resulting output of the layout analysis is an ordered hierarchy
+of layout objects on a PDF page.
+
+.. figure:: ../_static/layout_analysis_output.png
+    :align: center
+
+    The output of the layout analysis is a hierarchy of layout objects.
+
+The output of the layout analysis heavily depends on a couple of parameters.
+All these parameters are part of the :ref:`api_laparams` class.
+
+Grouping characters into words and lines
+----------------------------------------
+
+The first step in going from characters to text is to group characters in a
+meaningful way. Each character has an x-coordinate and a y-coordinate for its
+bottom-left corner and upper-right corner, i.e. its bounding box. Pdfminer.six 
+uses these bounding boxes to decide which characters belong together.
+
+Characters that are both horizontally and vertically close are grouped onto
+one line. How close they should be is determined by the `char_margin`
+(M in the figure) and the `line_overlap` (not in figure) parameter. The horizontal
+*distance* between the bounding boxes of two characters should be smaller than
+the `char_margin` and the vertical *overlap* between the bounding boxes should
+be larger than the `line_overlap`.
+
+.. raw:: html
+    :file: ../_static/layout_analysis.html
+
+The values of `char_margin` and `line_overlap` are relative to the size of
+the bounding boxes of the characters. The `char_margin` is relative to the
+maximum width of either one of the bounding boxes, and the `line_overlap` is
+relative to the minimum height of either one of the bounding boxes.
+
+Spaces need to be inserted between characters because the PDF format has no
+notion of the space character. A space is inserted if the characters are
+further apart than the `word_margin` (W in the figure). The `word_margin` is
+relative to the maximum width or height of the new character. Having a smaller
+`word_margin` creates smaller words. Note that the `word_margin` should at
+least be smaller than the `char_margin` otherwise none of the characters will
+be separated by a space.
+
+The result of this stage is a list of lines. Each line consists of a list of
+characters. These characters are either original `LTChar` characters that
+originate from the PDF file or inserted `LTAnno` characters that
+represent spaces between words or newlines at the end of each line.
+
+Grouping lines into boxes
+-------------------------
+
+The second step is grouping lines in a meaningful way. Each line has a
+bounding box that is determined by the bounding boxes of the characters that
+it contains. Like grouping characters, pdfminer.six uses the bounding boxes
+to group the lines.
+
+Lines that are both horizontally overlapping and vertically close are grouped.
+How vertically close the lines should be is determined by the `line_margin`.
+This margin is specified relative to the height of the bounding box. Lines
+are close if the gap between the tops (see L :sub:`1` in the figure) and bottoms
+(see L :sub:`2` in the figure) of the bounding boxes are closer together
+than the absolute line margin, i.e. the `line_margin` multiplied by the
+height of the bounding box.
+
+.. raw:: html
+    :file: ../_static/layout_analysis_group_lines.html
+
+The result of this stage is a list of text boxes. Each box consists of a list
+of lines.
+
+Grouping textboxes hierarchically
+---------------------------------
+
+The last step is to group the text boxes in a meaningful way. This step
+repeatedly merges the two text boxes that are closest to each other.
+
+The closeness of bounding boxes is computed as the area that is between the
+two text boxes (the blue area in the figure). In other words, it is the area of
+the bounding box that surrounds both lines, minus the area of the bounding
+boxes of the individual lines.
+
+.. raw:: html
+    :file: ../_static/layout_analysis_group_boxes.html
+
+Working with rotated characters
+===============================
+
+The algorithm described above assumes that all characters have the same
+orientation. However, any writing direction is possible in a PDF. To
+accommodate for this, pdfminer.six allows detecting vertical writing with the
+`detect_vertical` parameter. This will apply all the grouping steps as if the
+pdf was rotated 90 (or 270) degrees.
+
+References
+==========
+
+.. [1] Adobe System Inc. (2007). *Pdf reference: Adobe portable document
+  format, version 1.7.*
--- a/docs/source/topic/index.rst
+++ b/docs/source/topic/index.rst
@ -0,0 +1,9 @@
+.. _topic:
+
+Topics
+******
+
+.. toctree::
+    :maxdepth: 2
+
+    converting_pdf_to_text
--- a/docs/source/tutorial/commandline.rst
+++ b/docs/source/tutorial/commandline.rst
@ -0,0 +1,41 @@
+.. _tutorial_commandline:
+
+Extract text from a PDF using the commandline
+*********************************************
+
+pdfminer.six has several tools that can be used from the command line. The
+command-line tools are aimed at users that occasionally want to extract text
+from a pdf.
+
+Take a look at the high-level or composable interface if you want to use
+pdfminer.six programmatically.
+
+Examples
+========
+
+pdf2txt.py
+----------
+
+::
+
+    $ pdf2txt.py example.pdf
+    all the text from the pdf appears on the command line
+
+The :ref:`api_pdf2txt` tool extracts all the text from a PDF. It uses layout
+analysis with sensible defaults to order and group the text in a sensible way.
+
+dumppdf.py
+----------
+
+::
+
+    $ dumppdf.py -a example.pdf
+    <pdf><object id="1">
+    ...
+    </object>
+    ...
+    </pdf>
+
+The :ref:`api_dumppdf` tool can be used to extract the internal structure from a
+PDF. This tool is primarily for debugging purposes, but it can be useful to
+anybody working with PDFs.
--- a/docs/source/tutorial/composable.rst
+++ b/docs/source/tutorial/composable.rst
@ -0,0 +1,33 @@
+.. _tutorial_composable:
+
+Extract text from a PDF using Python - part 2
+*********************************************
+
+The command line tools and the high-level API are just shortcuts for often
+used combinations of pdfminer.six components. You can use these components to
+modify pdfminer.six to your own needs.
+
+For example, to extract the text from a PDF file and save it in a python
+variable::
+
+    from io import StringIO
+
+    from pdfminer.converter import TextConverter
+    from pdfminer.layout import LAParams
+    from pdfminer.pdfdocument import PDFDocument
+    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+    from pdfminer.pdfpage import PDFPage
+    from pdfminer.pdfparser import PDFParser
+
+    output_string = StringIO()
+    with open('samples/simple1.pdf', 'rb') as in_file:
+        parser = PDFParser(in_file)
+        doc = PDFDocument(parser)
+        rsrcmgr = PDFResourceManager()
+        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        for page in PDFPage.create_pages(doc):
+            interpreter.process_page(page)
+
+    print(output_string.getvalue())
+
--- a/docs/source/tutorial/extract_pages.rst
+++ b/docs/source/tutorial/extract_pages.rst
@ -0,0 +1,47 @@
+.. _tutorial_extract_pages:
+
+Extract elements from a PDF using Python
+****************************************
+
+The high level functions can be used to achieve common tasks. In this case,
+we can use :ref:`api_extract_pages`:
+
+.. code-block:: python
+
+   from pdfminer.high_level import extract_pages
+   for page_layout in extract_pages("test.pdf"):
+       for element in page_layout:
+           print(element)
+
+
+Each ``element`` will be an ``LTTextBox``, ``LTFigure``, ``LTLine``, ``LTRect``
+or an ``LTImage``. Some of these can be iterated further, for example iterating
+through an ``LTTextBox`` will give you an ``LTTextLine``, and these in turn can
+be iterated through to get an ``LTChar``. See the diagram here:
+:ref:`topic_pdf_to_text_layout`.
+
+Let's say we want to extract all of the text. We could do:
+
+.. code-block:: python
+
+   from pdfminer.high_level import extract_pages
+   from pdfminer.layout import LTTextContainer
+   for page_layout in extract_pages("test.pdf"):
+       for element in page_layout:
+           if isinstance(element, LTTextContainer):
+               print(element.get_text())
+
+Or, we could extract the fontname or size of each individual character:
+
+.. code-block:: python
+
+   from pdfminer.high_level import extract_pages
+   from pdfminer.layout import LTTextContainer, LTChar
+   for page_layout in extract_pages("test.pdf"):
+       for element in page_layout:
+           if isinstance(element, LTTextContainer):
+               for text_line in element:
+                   for character in text_line:
+                       if isinstance(character, LTChar):
+                           print(character.fontname)
+                           print(character.size)
--- a/docs/source/tutorial/highlevel.rst
+++ b/docs/source/tutorial/highlevel.rst
@ -0,0 +1,59 @@
+.. _tutorial_highlevel:
+
+Extract text from a PDF using Python
+************************************
+
+The high-level API can be used to do common tasks.
+
+The simplest way to extract text from a PDF is to use
+:ref:`api_extract_text`:
+
+.. doctest::
+
+    >>> from pdfminer.high_level import extract_text
+    >>> text = extract_text('samples/simple1.pdf')
+    >>> print(repr(text))
+    'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\x0c'
+    >>> print(text)
+    ... # doctest: +NORMALIZE_WHITESPACE
+    Hello
+    <BLANKLINE>
+    World
+    <BLANKLINE>
+    Hello
+    <BLANKLINE>
+    World
+    <BLANKLINE>
+    H e l l o
+    <BLANKLINE>
+    W o r l d
+    <BLANKLINE>
+    H e l l o
+    <BLANKLINE>
+    W o r l d
+    <BLANKLINE>
+
+
+To read text from a PDF and print it on the command line:
+
+.. doctest::
+
+    >>> from io import StringIO
+    >>> from pdfminer.high_level import extract_text_to_fp
+    >>> output_string = StringIO()
+    >>> with open('samples/simple1.pdf', 'rb') as fin:
+    ...     extract_text_to_fp(fin, output_string)
+    >>> print(output_string.getvalue().strip())
+    Hello WorldHello WorldHello WorldHello World
+
+Or to convert it to html and use layout analysis:
+
+.. doctest::
+
+    >>> from io import StringIO
+    >>> from pdfminer.high_level import extract_text_to_fp
+    >>> from pdfminer.layout import LAParams
+    >>> output_string = StringIO()
+    >>> with open('samples/simple1.pdf', 'rb') as fin:
+    ...     extract_text_to_fp(fin, output_string, laparams=LAParams(),
+    ...                        output_type='html', codec=None)
--- a/docs/source/tutorial/index.rst
+++ b/docs/source/tutorial/index.rst
@ -0,0 +1,15 @@
+.. _tutorial:
+
+Tutorials
+*********
+
+Tutorials help you get started with specific parts of pdfminer.six.
+
+.. toctree::
+    :maxdepth: 1
+
+    install
+    commandline
+    highlevel
+    composable
+    extract_pages
--- a/docs/source/tutorial/install.rst
+++ b/docs/source/tutorial/install.rst
@ -0,0 +1,39 @@
+.. _install:
+
+Install pdfminer.six as a Python package
+****************************************
+
+To use pdfminer.six for the first time, you need to install the Python
+package in your Python environment.
+
+This tutorial requires you to have a system with a working Python and pip
+installation. If you don't have one and don't know how to install it, take a
+look at `The Hitchhiker's Guide to Python! <https://docs.python-guide.org/>`_.
+
+Install using pip
+=================
+
+Run the following command on the commandline to install pdfminer.six as a
+Python package::
+
+    pip install pdfminer.six
+
+
+Test pdfminer.six installation
+==============================
+
+You can test the pdfminer.six installation by importing it in Python.
+
+Open an interactive Python session from the commandline and import
+pdfminer.six::
+
+    >>> import pdfminer
+    >>> print(pdfminer.__version__)  # doctest: +IGNORE_RESULT
+    '<installed version>'
+
+Now you can use pdfminer.six as a Python package. But pdfminer.six also
+comes with a couple of useful commandline tools. To test if these tools are
+correctly installed, run the following on your commandline::
+
+    $ pdf2txt.py --version
+    pdfminer.six <installed version>
--- a/mypy.ini
+++ b/mypy.ini
@ -0,0 +1,33 @@
+[mypy]
+warn_unused_configs = True
+disallow_any_generics = True
+disallow_subclassing_any = True
+disallow_untyped_calls = True
+disallow_incomplete_defs = True
+disallow_untyped_decorators = True
+no_implicit_optional = True
+warn_redundant_casts = True
+warn_return_any = True
+no_implicit_reexport = True
+strict_equality = True
+
+# This seems impossible to turn on in a version-independent manner
+warn_unused_ignores = False
+
+[mypy-pdfminer.*]
+disallow_untyped_defs = True
+
+[mypy-cryptography.hazmat.*]
+ignore_missing_imports = True
+
+[mypy-pytest.*]
+ignore_missing_imports = True
+
+[mypy-setuptools.*]
+ignore_missing_imports = True
+
+[mypy-nox.*]
+ignore_missing_imports = True
+
+[mypy-charset_normalizer.*]
+ignore_missing_imports = True
--- a/noxfile.py
+++ b/noxfile.py
@ -0,0 +1,52 @@
+import os
+
+import nox
+
+
+PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"]
+PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"]
+
+
+@nox.session
+def format(session):
+    session.install("black")
+    # Format files locally with black, but only check in cicd
+    if "CI" in os.environ:
+        session.run("black", "--check", *PYTHON_MODULES)
+    else:
+        session.run("black", *PYTHON_MODULES)
+
+
+@nox.session
+def lint(session):
+    session.install("flake8")
+    session.run("flake8", *PYTHON_MODULES, "--count", "--statistics")
+
+
+@nox.session
+def types(session):
+    session.install("mypy")
+    session.run(
+        "mypy",
+        "--install-types",
+        "--non-interactive",
+        "--show-error-codes",
+        *PYTHON_MODULES,
+    )
+
+
+@nox.session(python=PYTHON_ALL_VERSIONS)
+def tests(session):
+    session.install("-e", ".[dev]")
+    session.run("pytest")
+
+
+@nox.session
+def docs(session):
+    session.install("-e", ".[docs]")
+    session.run(
+        "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"
+    )
+    session.run(
+        "python", "-m", "sphinx", "-b", "doctest", "docs/source", "docs/build/doctest"
+    )
--- a/pdfminer/__init__.py
+++ b/pdfminer/__init__.py
@ -0,0 +1,4 @@
+__version__ = "__VERSION__"  # auto replaced with tag in github actions
+
+# Print the version string when this file is executed directly as a script.
+if __name__ == "__main__":
+    print(__version__)
--- a/pdfminer/_saslprep.py
+++ b/pdfminer/_saslprep.py
@ -0,0 +1,95 @@
+# Copyright 2016-present MongoDB, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Some changes copyright 2021-present Matthias Valvekens,
+# licensed under the license of the pyHanko project (see LICENSE file).
+
+
+"""An implementation of RFC4013 SASLprep."""
+
+__all__ = ["saslprep"]
+
+import stringprep
+from typing import Callable, Tuple
+import unicodedata
+
+# RFC4013 section 2.3 prohibited output.
+_PROHIBITED: Tuple[Callable[[str], bool], ...] = (
+    # A strict reading of RFC 4013 requires table c12 here, but
+    # characters from it are mapped to SPACE in the Map step. Can
+    # normalization reintroduce them somehow?
+    stringprep.in_table_c12,
+    stringprep.in_table_c21_c22,
+    stringprep.in_table_c3,
+    stringprep.in_table_c4,
+    stringprep.in_table_c5,
+    stringprep.in_table_c6,
+    stringprep.in_table_c7,
+    stringprep.in_table_c8,
+    stringprep.in_table_c9,
+)
+
+
+def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
+    """An implementation of RFC4013 SASLprep.
+    :param data:
+        The string to SASLprep.
+    :param prohibit_unassigned_code_points:
+        RFC 3454 and RFCs for various SASL mechanisms distinguish between
+        `queries` (unassigned code points allowed) and
+        `stored strings` (unassigned code points prohibited). Defaults
+        to ``True`` (unassigned code points are prohibited).
+    :return: The SASLprep'ed version of `data`.
+    """
+    if prohibit_unassigned_code_points:
+        prohibited = _PROHIBITED + (stringprep.in_table_a1,)
+    else:
+        prohibited = _PROHIBITED
+
+    # RFC3454 section 2, step 1 - Map
+    # RFC4013 section 2.1 mappings
+    # Map Non-ASCII space characters to SPACE (U+0020). Map
+    # commonly mapped to nothing characters to, well, nothing.
+    in_table_c12 = stringprep.in_table_c12
+    in_table_b1 = stringprep.in_table_b1
+    data = "".join(
+        ["\u0020" if in_table_c12(elt) else elt for elt in data if not in_table_b1(elt)]
+    )
+
+    # RFC3454 section 2, step 2 - Normalize
+    # RFC4013 section 2.2 normalization
+    data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
+
+    in_table_d1 = stringprep.in_table_d1
+    if in_table_d1(data[0]):
+        if not in_table_d1(data[-1]):
+            # RFC3454, Section 6, #3. If a string contains any
+            # RandALCat character, the first and last characters
+            # MUST be RandALCat characters.
+            raise ValueError("SASLprep: failed bidirectional check")
+        # RFC3454, Section 6, #2. If a string contains any RandALCat
+        # character, it MUST NOT contain any LCat character.
+        prohibited = prohibited + (stringprep.in_table_d2,)
+    else:
+        # RFC3454, Section 6, #3. Following the logic of #3, if
+        # the first character is not a RandALCat, no other character
+        # can be either.
+        prohibited = prohibited + (in_table_d1,)
+
+    # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
+    for char in data:
+        if any(in_table(char) for in_table in prohibited):
+            raise ValueError("SASLprep: failed prohibited character check")
+
+    return data
--- a/pdfminer/arcfour.py
+++ b/pdfminer/arcfour.py
@ -0,0 +1,36 @@
+""" Python implementation of Arcfour encryption algorithm.
+See https://en.wikipedia.org/wiki/RC4
+This code is in the public domain.
+
+"""
+
+
+from typing import Sequence
+
+
+class Arcfour:
+    def __init__(self, key: Sequence[int]) -> None:
+        # because Py3 range is not indexable
+        s = [i for i in range(256)]
+        j = 0
+        klen = len(key)
+        for i in range(256):
+            j = (j + s[i] + key[i % klen]) % 256
+            (s[i], s[j]) = (s[j], s[i])
+        self.s = s
+        (self.i, self.j) = (0, 0)
+
+    def process(self, data: bytes) -> bytes:
+        (i, j) = (self.i, self.j)
+        s = self.s
+        r = b""
+        for c in iter(data):
+            i = (i + 1) % 256
+            j = (j + s[i]) % 256
+            (s[i], s[j]) = (s[j], s[i])
+            k = s[(s[i] + s[j]) % 256]
+            r += bytes((c ^ k,))
+        (self.i, self.j) = (i, j)
+        return r
+
+    encrypt = decrypt = process
--- a/pdfminer/ascii85.py
+++ b/pdfminer/ascii85.py
@ -0,0 +1,72 @@
+""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
+
+This code is in the public domain.
+
+"""
+
+import re
+import struct
+
+
+# ascii85decode(data)
+def ascii85decode(data: bytes) -> bytes:
+    """
+    In ASCII85 encoding, every four bytes are encoded with five ASCII
+    letters, using 85 different types of characters (as 256**4 < 85**5).
+    When the length of the original bytes is not a multiple of 4, a special
+    rule is used for round up.
+
+    The Adobe's ASCII85 implementation is slightly different from
+    its original in handling the last characters.
+
+    """
+    n = b = 0
+    out = b""
+    for i in iter(data):
+        c = bytes((i,))
+        if b"!" <= c and c <= b"u":
+            n += 1
+            b = b * 85 + (ord(c) - 33)
+            if n == 5:
+                out += struct.pack(">L", b)
+                n = b = 0
+        elif c == b"z":
+            assert n == 0, str(n)
+            out += b"\0\0\0\0"
+        elif c == b"~":
+            if n:
+                for _ in range(5 - n):
+                    b = b * 85 + 84
+                out += struct.pack(">L", b)[: n - 1]
+            break
+    return out
+
+
+# asciihexdecode(data)
+hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
+trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
+
+
+def asciihexdecode(data: bytes) -> bytes:
+    """
+    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
+    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
+    ASCIIHexDecode filter produces one byte of binary data. All white-space
+    characters are ignored. A right angle bracket character (>) indicates
+    EOD. Any other characters will cause an error. If the filter encounters
+    the EOD marker after reading an odd number of hexadecimal digits, it
+    will behave as if a 0 followed the last digit.
+    """
+
+    def decode(x: bytes) -> bytes:
+        i = int(x, 16)
+        return bytes((i,))
+
+    out = b""
+    for x in hex_re.findall(data):
+        out += decode(x)
+
+    m = trail_re.search(data)
+    if m:
+        out += decode(m.group(1) + b"0")
+    return out
--- a/pdfminer/ccitt.py
+++ b/pdfminer/ccitt.py
@ -0,0 +1,629 @@
+# CCITT Fax decoder
+#
+# Bugs: uncompressed mode untested.
+#
+# cf.
+#  ITU-T Recommendation T.4
+#    "Standardization of Group 3 facsimile terminals
+#    for document transmission"
+#  ITU-T Recommendation T.6
+#    "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
+#    FOR GROUP 4 FACSIMILE APPARATUS"
+
+
+import array
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    MutableSequence,
+    Optional,
+    Sequence,
+    Union,
+    cast,
+)
+
+
+def get_bytes(data: bytes) -> Iterator[int]:
+    yield from data
+
+
+# Workaround https://github.com/python/mypy/issues/731
+BitParserState = MutableSequence[Any]
+# A better definition (not supported by mypy) would be:
+# BitParserState = MutableSequence[Union["BitParserState", int, str, None]]
+
+
+class BitParser:
+    _state: BitParserState
+
+    # _accept is declared Optional solely as a workaround for
+    # https://github.com/python/mypy/issues/708
+    _accept: Optional[Callable[[Any], BitParserState]]
+
+    def __init__(self) -> None:
+        self._pos = 0
+
+    @classmethod
+    def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None:
+        p: BitParserState = root
+        b = None
+        for i in range(len(bits)):
+            if 0 < i:
+                assert b is not None
+                if p[b] is None:
+                    p[b] = [None, None]
+                p = p[b]
+            if bits[i] == "1":
+                b = 1
+            else:
+                b = 0
+        assert b is not None
+        p[b] = v
+
+    def feedbytes(self, data: bytes) -> None:
+        for byte in get_bytes(data):
+            for m in (128, 64, 32, 16, 8, 4, 2, 1):
+                self._parse_bit(byte & m)
+
+    def _parse_bit(self, x: object) -> None:
+        if x:
+            v = self._state[1]
+        else:
+            v = self._state[0]
+        self._pos += 1
+        if isinstance(v, list):
+            self._state = v
+        else:
+            assert self._accept is not None
+            self._state = self._accept(v)
+
+
+class CCITTG4Parser(BitParser):
+
+    MODE = [None, None]
+    BitParser.add(MODE, 0, "1")
+    BitParser.add(MODE, +1, "011")
+    BitParser.add(MODE, -1, "010")
+    BitParser.add(MODE, "h", "001")
+    BitParser.add(MODE, "p", "0001")
+    BitParser.add(MODE, +2, "000011")
+    BitParser.add(MODE, -2, "000010")
+    BitParser.add(MODE, +3, "0000011")
+    BitParser.add(MODE, -3, "0000010")
+    BitParser.add(MODE, "u", "0000001111")
+    BitParser.add(MODE, "x1", "0000001000")
+    BitParser.add(MODE, "x2", "0000001001")
+    BitParser.add(MODE, "x3", "0000001010")
+    BitParser.add(MODE, "x4", "0000001011")
+    BitParser.add(MODE, "x5", "0000001100")
+    BitParser.add(MODE, "x6", "0000001101")
+    BitParser.add(MODE, "x7", "0000001110")
+    BitParser.add(MODE, "e", "000000000001000000000001")
+
+    WHITE = [None, None]
+    BitParser.add(WHITE, 0, "00110101")
+    BitParser.add(WHITE, 1, "000111")
+    BitParser.add(WHITE, 2, "0111")
+    BitParser.add(WHITE, 3, "1000")
+    BitParser.add(WHITE, 4, "1011")
+    BitParser.add(WHITE, 5, "1100")
+    BitParser.add(WHITE, 6, "1110")
+    BitParser.add(WHITE, 7, "1111")
+    BitParser.add(WHITE, 8, "10011")
+    BitParser.add(WHITE, 9, "10100")
+    BitParser.add(WHITE, 10, "00111")
+    BitParser.add(WHITE, 11, "01000")
+    BitParser.add(WHITE, 12, "001000")
+    BitParser.add(WHITE, 13, "000011")
+    BitParser.add(WHITE, 14, "110100")
+    BitParser.add(WHITE, 15, "110101")
+    BitParser.add(WHITE, 16, "101010")
+    BitParser.add(WHITE, 17, "101011")
+    BitParser.add(WHITE, 18, "0100111")
+    BitParser.add(WHITE, 19, "0001100")
+    BitParser.add(WHITE, 20, "0001000")
+    BitParser.add(WHITE, 21, "0010111")
+    BitParser.add(WHITE, 22, "0000011")
+    BitParser.add(WHITE, 23, "0000100")
+    BitParser.add(WHITE, 24, "0101000")
+    BitParser.add(WHITE, 25, "0101011")
+    BitParser.add(WHITE, 26, "0010011")
+    BitParser.add(WHITE, 27, "0100100")
+    BitParser.add(WHITE, 28, "0011000")
+    BitParser.add(WHITE, 29, "00000010")
+    BitParser.add(WHITE, 30, "00000011")
+    BitParser.add(WHITE, 31, "00011010")
+    BitParser.add(WHITE, 32, "00011011")
+    BitParser.add(WHITE, 33, "00010010")
+    BitParser.add(WHITE, 34, "00010011")
+    BitParser.add(WHITE, 35, "00010100")
+    BitParser.add(WHITE, 36, "00010101")
+    BitParser.add(WHITE, 37, "00010110")
+    BitParser.add(WHITE, 38, "00010111")
+    BitParser.add(WHITE, 39, "00101000")
+    BitParser.add(WHITE, 40, "00101001")
+    BitParser.add(WHITE, 41, "00101010")
+    BitParser.add(WHITE, 42, "00101011")
+    BitParser.add(WHITE, 43, "00101100")
+    BitParser.add(WHITE, 44, "00101101")
+    BitParser.add(WHITE, 45, "00000100")
+    BitParser.add(WHITE, 46, "00000101")
+    BitParser.add(WHITE, 47, "00001010")
+    BitParser.add(WHITE, 48, "00001011")
+    BitParser.add(WHITE, 49, "01010010")
+    BitParser.add(WHITE, 50, "01010011")
+    BitParser.add(WHITE, 51, "01010100")
+    BitParser.add(WHITE, 52, "01010101")
+    BitParser.add(WHITE, 53, "00100100")
+    BitParser.add(WHITE, 54, "00100101")
+    BitParser.add(WHITE, 55, "01011000")
+    BitParser.add(WHITE, 56, "01011001")
+    BitParser.add(WHITE, 57, "01011010")
+    BitParser.add(WHITE, 58, "01011011")
+    BitParser.add(WHITE, 59, "01001010")
+    BitParser.add(WHITE, 60, "01001011")
+    BitParser.add(WHITE, 61, "00110010")
+    BitParser.add(WHITE, 62, "00110011")
+    BitParser.add(WHITE, 63, "00110100")
+    BitParser.add(WHITE, 64, "11011")
+    BitParser.add(WHITE, 128, "10010")
+    BitParser.add(WHITE, 192, "010111")
+    BitParser.add(WHITE, 256, "0110111")
+    BitParser.add(WHITE, 320, "00110110")
+    BitParser.add(WHITE, 384, "00110111")
+    BitParser.add(WHITE, 448, "01100100")
+    BitParser.add(WHITE, 512, "01100101")
+    BitParser.add(WHITE, 576, "01101000")
+    BitParser.add(WHITE, 640, "01100111")
+    BitParser.add(WHITE, 704, "011001100")
+    BitParser.add(WHITE, 768, "011001101")
+    BitParser.add(WHITE, 832, "011010010")
+    BitParser.add(WHITE, 896, "011010011")
+    BitParser.add(WHITE, 960, "011010100")
+    BitParser.add(WHITE, 1024, "011010101")
+    BitParser.add(WHITE, 1088, "011010110")
+    BitParser.add(WHITE, 1152, "011010111")
+    BitParser.add(WHITE, 1216, "011011000")
+    BitParser.add(WHITE, 1280, "011011001")
+    BitParser.add(WHITE, 1344, "011011010")
+    BitParser.add(WHITE, 1408, "011011011")
+    BitParser.add(WHITE, 1472, "010011000")
+    BitParser.add(WHITE, 1536, "010011001")
+    BitParser.add(WHITE, 1600, "010011010")
+    BitParser.add(WHITE, 1664, "011000")
+    BitParser.add(WHITE, 1728, "010011011")
+    BitParser.add(WHITE, 1792, "00000001000")
+    BitParser.add(WHITE, 1856, "00000001100")
+    BitParser.add(WHITE, 1920, "00000001101")
+    BitParser.add(WHITE, 1984, "000000010010")
+    BitParser.add(WHITE, 2048, "000000010011")
+    BitParser.add(WHITE, 2112, "000000010100")
+    BitParser.add(WHITE, 2176, "000000010101")
+    BitParser.add(WHITE, 2240, "000000010110")
+    BitParser.add(WHITE, 2304, "000000010111")
+    BitParser.add(WHITE, 2368, "000000011100")
+    BitParser.add(WHITE, 2432, "000000011101")
+    BitParser.add(WHITE, 2496, "000000011110")
+    BitParser.add(WHITE, 2560, "000000011111")
+
+    BLACK = [None, None]
+    BitParser.add(BLACK, 0, "0000110111")
+    BitParser.add(BLACK, 1, "010")
+    BitParser.add(BLACK, 2, "11")
+    BitParser.add(BLACK, 3, "10")
+    BitParser.add(BLACK, 4, "011")
+    BitParser.add(BLACK, 5, "0011")
+    BitParser.add(BLACK, 6, "0010")
+    BitParser.add(BLACK, 7, "00011")
+    BitParser.add(BLACK, 8, "000101")
+    BitParser.add(BLACK, 9, "000100")
+    BitParser.add(BLACK, 10, "0000100")
+    BitParser.add(BLACK, 11, "0000101")
+    BitParser.add(BLACK, 12, "0000111")
+    BitParser.add(BLACK, 13, "00000100")
+    BitParser.add(BLACK, 14, "00000111")
+    BitParser.add(BLACK, 15, "000011000")
+    BitParser.add(BLACK, 16, "0000010111")
+    BitParser.add(BLACK, 17, "0000011000")
+    BitParser.add(BLACK, 18, "0000001000")
+    BitParser.add(BLACK, 19, "00001100111")
+    BitParser.add(BLACK, 20, "00001101000")
+    BitParser.add(BLACK, 21, "00001101100")
+    BitParser.add(BLACK, 22, "00000110111")
+    BitParser.add(BLACK, 23, "00000101000")
+    BitParser.add(BLACK, 24, "00000010111")
+    BitParser.add(BLACK, 25, "00000011000")
+    BitParser.add(BLACK, 26, "000011001010")
+    BitParser.add(BLACK, 27, "000011001011")
+    BitParser.add(BLACK, 28, "000011001100")
+    BitParser.add(BLACK, 29, "000011001101")
+    BitParser.add(BLACK, 30, "000001101000")
+    BitParser.add(BLACK, 31, "000001101001")
+    BitParser.add(BLACK, 32, "000001101010")
+    BitParser.add(BLACK, 33, "000001101011")
+    BitParser.add(BLACK, 34, "000011010010")
+    BitParser.add(BLACK, 35, "000011010011")
+    BitParser.add(BLACK, 36, "000011010100")
+    BitParser.add(BLACK, 37, "000011010101")
+    BitParser.add(BLACK, 38, "000011010110")
+    BitParser.add(BLACK, 39, "000011010111")
+    BitParser.add(BLACK, 40, "000001101100")
+    BitParser.add(BLACK, 41, "000001101101")
+    BitParser.add(BLACK, 42, "000011011010")
+    BitParser.add(BLACK, 43, "000011011011")
+    BitParser.add(BLACK, 44, "000001010100")
+    BitParser.add(BLACK, 45, "000001010101")
+    BitParser.add(BLACK, 46, "000001010110")
+    BitParser.add(BLACK, 47, "000001010111")
+    BitParser.add(BLACK, 48, "000001100100")
+    BitParser.add(BLACK, 49, "000001100101")
+    BitParser.add(BLACK, 50, "000001010010")
+    BitParser.add(BLACK, 51, "000001010011")
+    BitParser.add(BLACK, 52, "000000100100")
+    BitParser.add(BLACK, 53, "000000110111")
+    BitParser.add(BLACK, 54, "000000111000")
+    BitParser.add(BLACK, 55, "000000100111")
+    BitParser.add(BLACK, 56, "000000101000")
+    BitParser.add(BLACK, 57, "000001011000")
+    BitParser.add(BLACK, 58, "000001011001")
+    BitParser.add(BLACK, 59, "000000101011")
+    BitParser.add(BLACK, 60, "000000101100")
+    BitParser.add(BLACK, 61, "000001011010")
+    BitParser.add(BLACK, 62, "000001100110")
+    BitParser.add(BLACK, 63, "000001100111")
+    BitParser.add(BLACK, 64, "0000001111")
+    BitParser.add(BLACK, 128, "000011001000")
+    BitParser.add(BLACK, 192, "000011001001")
+    BitParser.add(BLACK, 256, "000001011011")
+    BitParser.add(BLACK, 320, "000000110011")
+    BitParser.add(BLACK, 384, "000000110100")
+    BitParser.add(BLACK, 448, "000000110101")
+    BitParser.add(BLACK, 512, "0000001101100")
+    BitParser.add(BLACK, 576, "0000001101101")
+    BitParser.add(BLACK, 640, "0000001001010")
+    BitParser.add(BLACK, 704, "0000001001011")
+    BitParser.add(BLACK, 768, "0000001001100")
+    BitParser.add(BLACK, 832, "0000001001101")
+    BitParser.add(BLACK, 896, "0000001110010")
+    BitParser.add(BLACK, 960, "0000001110011")
+    BitParser.add(BLACK, 1024, "0000001110100")
+    BitParser.add(BLACK, 1088, "0000001110101")
+    BitParser.add(BLACK, 1152, "0000001110110")
+    BitParser.add(BLACK, 1216, "0000001110111")
+    BitParser.add(BLACK, 1280, "0000001010010")
+    BitParser.add(BLACK, 1344, "0000001010011")
+    BitParser.add(BLACK, 1408, "0000001010100")
+    BitParser.add(BLACK, 1472, "0000001010101")
+    BitParser.add(BLACK, 1536, "0000001011010")
+    BitParser.add(BLACK, 1600, "0000001011011")
+    BitParser.add(BLACK, 1664, "0000001100100")
+    BitParser.add(BLACK, 1728, "0000001100101")
+    BitParser.add(BLACK, 1792, "00000001000")
+    BitParser.add(BLACK, 1856, "00000001100")
+    BitParser.add(BLACK, 1920, "00000001101")
+    BitParser.add(BLACK, 1984, "000000010010")
+    BitParser.add(BLACK, 2048, "000000010011")
+    BitParser.add(BLACK, 2112, "000000010100")
+    BitParser.add(BLACK, 2176, "000000010101")
+    BitParser.add(BLACK, 2240, "000000010110")
+    BitParser.add(BLACK, 2304, "000000010111")
+    BitParser.add(BLACK, 2368, "000000011100")
+    BitParser.add(BLACK, 2432, "000000011101")
+    BitParser.add(BLACK, 2496, "000000011110")
+    BitParser.add(BLACK, 2560, "000000011111")
+
+    UNCOMPRESSED = [None, None]
+    BitParser.add(UNCOMPRESSED, "1", "1")
+    BitParser.add(UNCOMPRESSED, "01", "01")
+    BitParser.add(UNCOMPRESSED, "001", "001")
+    BitParser.add(UNCOMPRESSED, "0001", "0001")
+    BitParser.add(UNCOMPRESSED, "00001", "00001")
+    BitParser.add(UNCOMPRESSED, "00000", "000001")
+    BitParser.add(UNCOMPRESSED, "T00", "00000011")
+    BitParser.add(UNCOMPRESSED, "T10", "00000010")
+    BitParser.add(UNCOMPRESSED, "T000", "000000011")
+    BitParser.add(UNCOMPRESSED, "T100", "000000010")
+    BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
+    BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
+    BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
+    BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
+
+    # Raised by _parse_mode for mode "e"; feedbytes catches it and stops
+    # consuming input.
+    class EOFB(Exception):
+        pass
+
+    # Raised by the _parse_* callbacks when a decoded value is not usable.
+    class InvalidData(Exception):
+        pass
+
+    # Caught in feedbytes: the remaining bits of the current byte are dropped
+    # and parsing restarts from the MODE table.
+    class ByteSkip(Exception):
+        pass
+
+    _color: int
+
+    def __init__(self, width: int, bytealign: bool = False) -> None:
+        BitParser.__init__(self)
+        self.width = width
+        self.bytealign = bytealign
+        self.reset()
+        return
+
+    def feedbytes(self, data: bytes) -> None:
+        for byte in get_bytes(data):
+            try:
+                for m in (128, 64, 32, 16, 8, 4, 2, 1):
+                    self._parse_bit(byte & m)
+            except self.ByteSkip:
+                self._accept = self._parse_mode
+                self._state = self.MODE
+            except self.EOFB:
+                break
+        return
+
+    def _parse_mode(self, mode: object) -> BitParserState:
+        if mode == "p":
+            self._do_pass()
+            self._flush_line()
+            return self.MODE
+        elif mode == "h":
+            self._n1 = 0
+            self._accept = self._parse_horiz1
+            if self._color:
+                return self.WHITE
+            else:
+                return self.BLACK
+        elif mode == "u":
+            self._accept = self._parse_uncompressed
+            return self.UNCOMPRESSED
+        elif mode == "e":
+            raise self.EOFB
+        elif isinstance(mode, int):
+            self._do_vertical(mode)
+            self._flush_line()
+            return self.MODE
+        else:
+            raise self.InvalidData(mode)
+
+    def _parse_horiz1(self, n: Any) -> BitParserState:
+        if n is None:
+            raise self.InvalidData
+        self._n1 += n
+        if n < 64:
+            self._n2 = 0
+            self._color = 1 - self._color
+            self._accept = self._parse_horiz2
+        if self._color:
+            return self.WHITE
+        else:
+            return self.BLACK
+
+    def _parse_horiz2(self, n: Any) -> BitParserState:
+        if n is None:
+            raise self.InvalidData
+        self._n2 += n
+        if n < 64:
+            self._color = 1 - self._color
+            self._accept = self._parse_mode
+            self._do_horizontal(self._n1, self._n2)
+            self._flush_line()
+            return self.MODE
+        elif self._color:
+            return self.WHITE
+        else:
+            return self.BLACK
+
+    def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
+        if not bits:
+            raise self.InvalidData
+        if bits.startswith("T"):
+            self._accept = self._parse_mode
+            self._color = int(bits[1])
+            self._do_uncompressed(bits[2:])
+            return self.MODE
+        else:
+            self._do_uncompressed(bits)
+            return self.UNCOMPRESSED
+
+    def _get_bits(self) -> str:
+        return "".join(str(b) for b in self._curline[: self._curpos])
+
+    def _get_refline(self, i: int) -> str:
+        if i < 0:
+            return "[]" + "".join(str(b) for b in self._refline)
+        elif len(self._refline) <= i:
+            return "".join(str(b) for b in self._refline) + "[]"
+        else:
+            return (
+                "".join(str(b) for b in self._refline[:i])
+                + "["
+                + str(self._refline[i])
+                + "]"
+                + "".join(str(b) for b in self._refline[i + 1 :])
+            )
+
+    def reset(self) -> None:
+        self._y = 0
+        self._curline = array.array("b", [1] * self.width)
+        self._reset_line()
+        self._accept = self._parse_mode
+        self._state = self.MODE
+        return
+
+    def output_line(self, y: int, bits: Sequence[int]) -> None:
+        print(y, "".join(str(b) for b in bits))
+        return
+
+    def _reset_line(self) -> None:
+        self._refline = self._curline
+        self._curline = array.array("b", [1] * self.width)
+        self._curpos = -1
+        self._color = 1
+        return
+
+    def _flush_line(self) -> None:
+        if self.width <= self._curpos:
+            self.output_line(self._y, self._curline)
+            self._y += 1
+            self._reset_line()
+            if self.bytealign:
+                raise self.ByteSkip
+        return
+
+    def _do_vertical(self, dx: int) -> None:
+        """Apply a vertical-mode code with offset ``dx``.
+
+        Scans the reference line to the right of the cursor for the first
+        position where the pixel on the left has the current colour and the
+        pixel itself does not, shifts that position by ``dx``, fills the
+        current line between the cursor and the shifted position with the
+        current colour, moves the cursor there and flips the colour.
+        """
+        x1 = self._curpos + 1
+        while 1:
+            if x1 == 0:
+                # No pixel exists to the left of index 0; a change is only
+                # accepted here when the current colour is 1 (this behaves
+                # as if an imaginary pixel of value 1 preceded the line).
+                if self._color == 1 and self._refline[x1] != self._color:
+                    break
+            elif x1 == len(self._refline):
+                # Ran off the end of the reference line: use the line end.
+                break
+            elif (
+                self._refline[x1 - 1] == self._color
+                and self._refline[x1] != self._color
+            ):
+                break
+            x1 += 1
+        x1 += dx
+        # The cursor is -1 at the start of a line; clamp both ends into the
+        # valid index range before filling.
+        x0 = max(0, self._curpos)
+        x1 = max(0, min(self.width, x1))
+        if x1 < x0:
+            # A negative offset moved the target left of the cursor.
+            for x in range(x1, x0):
+                self._curline[x] = self._color
+        elif x0 < x1:
+            for x in range(x0, x1):
+                self._curline[x] = self._color
+        self._curpos = x1
+        self._color = 1 - self._color
+        return
+
+    def _do_pass(self) -> None:
+        x1 = self._curpos + 1
+        while 1:
+            if x1 == 0:
+                if self._color == 1 and self._refline[x1] != self._color:
+                    break
+            elif x1 == len(self._refline):
+                break
+            elif (
+                self._refline[x1 - 1] == self._color
+                and self._refline[x1] != self._color
+            ):
+                break
+            x1 += 1
+        while 1:
+            if x1 == 0:
+                if self._color == 0 and self._refline[x1] == self._color:
+                    break
+            elif x1 == len(self._refline):
+                break
+            elif (
+                self._refline[x1 - 1] != self._color
+                and self._refline[x1] == self._color
+            ):
+                break
+            x1 += 1
+        for x in range(self._curpos, x1):
+            self._curline[x] = self._color
+        self._curpos = x1
+        return
+
+    def _do_horizontal(self, n1: int, n2: int) -> None:
+        if self._curpos < 0:
+            self._curpos = 0
+        x = self._curpos
+        for _ in range(n1):
+            if len(self._curline) <= x:
+                break
+            self._curline[x] = self._color
+            x += 1
+        for _ in range(n2):
+            if len(self._curline) <= x:
+                break
+            self._curline[x] = 1 - self._color
+            x += 1
+        self._curpos = x
+        return
+
+    def _do_uncompressed(self, bits: str) -> None:
+        for c in bits:
+            self._curline[self._curpos] = int(c)
+            self._curpos += 1
+            self._flush_line()
+        return
+
+
+class CCITTFaxDecoder(CCITTG4Parser):
+    def __init__(
+        self, width: int, bytealign: bool = False, reversed: bool = False
+    ) -> None:
+        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
+        self.reversed = reversed
+        self._buf = b""
+        return
+
+    def close(self) -> bytes:
+        return self._buf
+
+    def output_line(self, y: int, bits: Sequence[int]) -> None:
+        arr = array.array("B", [0] * ((len(bits) + 7) // 8))
+        if self.reversed:
+            bits = [1 - b for b in bits]
+        for (i, b) in enumerate(bits):
+            if b:
+                arr[i // 8] += (128, 64, 32, 16, 8, 4, 2, 1)[i % 8]
+        self._buf += arr.tobytes()
+        return
+
+
+def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
+    K = params.get("K")
+    if K == -1:
+        cols = cast(int, params.get("Columns"))
+        bytealign = cast(bool, params.get("EncodedByteAlign"))
+        reversed = cast(bool, params.get("BlackIs1"))
+        parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
+    else:
+        raise ValueError(K)
+    parser.feedbytes(data)
+    return parser.close()
+
+
+# test
+def main(argv: List[str]) -> None:
+    if not argv[1:]:
+        import unittest
+
+        unittest.main()
+        return
+
+    class Parser(CCITTG4Parser):
+        def __init__(self, width: int, bytealign: bool = False) -> None:
+            import pygame  # type: ignore[import]
+
+            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
+            self.img = pygame.Surface((self.width, 1000))
+            return
+
+        def output_line(self, y: int, bits: Sequence[int]) -> None:
+            for (x, b) in enumerate(bits):
+                if b:
+                    self.img.set_at((x, y), (255, 255, 255))
+                else:
+                    self.img.set_at((x, y), (0, 0, 0))
+            return
+
+        def close(self) -> None:
+            import pygame
+
+            pygame.image.save(self.img, "out.bmp")
+            return
+
+    for path in argv[1:]:
+        fp = open(path, "rb")
+        (_, _, k, w, h, _) = path.split(".")
+        parser = Parser(int(w))
+        parser.feedbytes(fp.read())
+        parser.close()
+        fp.close()
+    return
--- a/pdfminer/cmap/78-EUC-H.pickle.gz
+++ b/pdfminer/cmap/78-EUC-H.pickle.gz
--- a/pdfminer/cmap/78-EUC-V.pickle.gz
+++ b/pdfminer/cmap/78-EUC-V.pickle.gz
--- a/pdfminer/cmap/78-H.pickle.gz
+++ b/pdfminer/cmap/78-H.pickle.gz
--- a/pdfminer/cmap/78-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/78-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/78-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/78-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/78-V.pickle.gz
+++ b/pdfminer/cmap/78-V.pickle.gz
--- a/pdfminer/cmap/78ms-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/78ms-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/78ms-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/78ms-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/83pv-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/83pv-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/83pv-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/83pv-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/90ms-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/90ms-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/90ms-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/90ms-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/90msp-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/90msp-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/90msp-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/90msp-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/90pv-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/90pv-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/90pv-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/90pv-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/Add-H.pickle.gz
+++ b/pdfminer/cmap/Add-H.pickle.gz
--- a/pdfminer/cmap/Add-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/Add-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/Add-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/Add-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/Add-V.pickle.gz
+++ b/pdfminer/cmap/Add-V.pickle.gz
--- a/pdfminer/cmap/B5-H.pickle.gz
+++ b/pdfminer/cmap/B5-H.pickle.gz
--- a/pdfminer/cmap/B5-V.pickle.gz
+++ b/pdfminer/cmap/B5-V.pickle.gz
--- a/pdfminer/cmap/B5pc-H.pickle.gz
+++ b/pdfminer/cmap/B5pc-H.pickle.gz
--- a/pdfminer/cmap/B5pc-V.pickle.gz
+++ b/pdfminer/cmap/B5pc-V.pickle.gz
--- a/pdfminer/cmap/CNS-EUC-H.pickle.gz
+++ b/pdfminer/cmap/CNS-EUC-H.pickle.gz
--- a/pdfminer/cmap/CNS-EUC-V.pickle.gz
+++ b/pdfminer/cmap/CNS-EUC-V.pickle.gz
--- a/pdfminer/cmap/CNS1-H.pickle.gz
+++ b/pdfminer/cmap/CNS1-H.pickle.gz
--- a/pdfminer/cmap/CNS1-V.pickle.gz
+++ b/pdfminer/cmap/CNS1-V.pickle.gz
--- a/pdfminer/cmap/CNS2-H.pickle.gz
+++ b/pdfminer/cmap/CNS2-H.pickle.gz
--- a/pdfminer/cmap/CNS2-V.pickle.gz
+++ b/pdfminer/cmap/CNS2-V.pickle.gz
--- a/pdfminer/cmap/ETHK-B5-H.pickle.gz
+++ b/pdfminer/cmap/ETHK-B5-H.pickle.gz
--- a/pdfminer/cmap/ETHK-B5-V.pickle.gz
+++ b/pdfminer/cmap/ETHK-B5-V.pickle.gz
--- a/pdfminer/cmap/ETen-B5-H.pickle.gz
+++ b/pdfminer/cmap/ETen-B5-H.pickle.gz
--- a/pdfminer/cmap/ETen-B5-V.pickle.gz
+++ b/pdfminer/cmap/ETen-B5-V.pickle.gz
--- a/pdfminer/cmap/ETenms-B5-H.pickle.gz
+++ b/pdfminer/cmap/ETenms-B5-H.pickle.gz
--- a/pdfminer/cmap/ETenms-B5-V.pickle.gz
+++ b/pdfminer/cmap/ETenms-B5-V.pickle.gz
--- a/pdfminer/cmap/EUC-H.pickle.gz
+++ b/pdfminer/cmap/EUC-H.pickle.gz
--- a/pdfminer/cmap/EUC-V.pickle.gz
+++ b/pdfminer/cmap/EUC-V.pickle.gz
--- a/pdfminer/cmap/Ext-H.pickle.gz
+++ b/pdfminer/cmap/Ext-H.pickle.gz
--- a/pdfminer/cmap/Ext-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/Ext-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/Ext-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/Ext-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/Ext-V.pickle.gz
+++ b/pdfminer/cmap/Ext-V.pickle.gz
--- a/pdfminer/cmap/GB-EUC-H.pickle.gz
+++ b/pdfminer/cmap/GB-EUC-H.pickle.gz
--- a/pdfminer/cmap/GB-EUC-V.pickle.gz
+++ b/pdfminer/cmap/GB-EUC-V.pickle.gz
--- a/pdfminer/cmap/GB-H.pickle.gz
+++ b/pdfminer/cmap/GB-H.pickle.gz
--- a/pdfminer/cmap/GB-V.pickle.gz
+++ b/pdfminer/cmap/GB-V.pickle.gz
--- a/pdfminer/cmap/GBK-EUC-H.pickle.gz
+++ b/pdfminer/cmap/GBK-EUC-H.pickle.gz
--- a/pdfminer/cmap/GBK-EUC-V.pickle.gz
+++ b/pdfminer/cmap/GBK-EUC-V.pickle.gz
--- a/pdfminer/cmap/GBK2K-H.pickle.gz
+++ b/pdfminer/cmap/GBK2K-H.pickle.gz
--- a/Show More
+++ b/Show More