Initial commit

2023-08-07 18:10:10 +06:00 · 2023-08-07 18:10:10 +06:00 · ec5b4ea6ee
commit ec5b4ea6ee
1535 changed files with 467951 additions and 0 deletions
--- a/.flake8
+++ b/.flake8
@ -0,0 +1,5 @@
 [flake8]
 max-line-length = 88
 extend-ignore = 
    # See https://github.com/PyCQA/pycodestyle/issues/373
    E203,
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@ -0,0 +1,20 @@
 ---
 name: Bug report
 about: Report a bug
 title: ''
 labels: bug
 assignees: ''
 ---
 **Bug report**
 Thanks for finding the bug! To help us fix it, please make sure that you 
 include the following information:
 - A description of the bug
 - Steps to reproduce the bug. Try to minimize the number of steps needed. 
  Include the command and/or script that you use. Also include the PDF that 
  you use.
 - If relevant, include the output and/or error stacktrace. 
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,18 @@
 ---
 name: Feature request
 about: Request a new feature
 title: ''
 labels: enhancement
 assignees: ''
 ---
 **Feature request**
 Thanks for your suggestion on improving pdfminer.six. To help us discuss and
 implement this request, please make sure to include the following information:
 - A description of the feature you would like to have
 - If relevant, the context that you are in. What are you trying to achieve?
 - If possible, an example of what you want to achieve. Include the PDF that
  you are working on. Include the output that you would like to have. 
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@ -0,0 +1,15 @@
 **Pull request**
 Please *remove* this paragraph and replace it with a description of your PR. Also include the issue that it fixes. 
 **How Has This Been Tested?**
 Please *remove* this paragraph and replace it with a description of how this PR has been tested.
 **Checklist**
 - [ ] I have read [CONTRIBUTING.md](../CONTRIBUTING.md). 
 - [ ] I have added a concise human-readable description of the change to [CHANGELOG.md](../CHANGELOG.md).
 - [ ] I have tested that this fix is effective or that this feature works.
 - [ ] I have added docstrings to newly created methods and classes.
 - [ ] I have updated the [README.md](../README.md) and the [readthedocs](../docs/source) documentation. Or verified that this is not necessary.
--- a/.github/workflows/actions.yml
+++ b/.github/workflows/actions.yml
@ -0,0 +1,164 @@
 # CI workflow: style, lint, type, test and docs jobs, followed by a publish job.
 name: Continuous integration
 # Events that start this workflow.
 on:
  push: # run when commits are added to master
    branches:
      - master
    tags:
      - '[0-9]+'  # match version tags with only numbers
  pull_request: # run on pr's against master
    branches:
      - master
 env:
  # Python version used by the jobs that run on a single interpreter
  # (formatting, lint, static types, docs). Quoted so it stays the string
  # "3.10" instead of being read as the number 3.1.
  default-python: "3.10"
 jobs:
  check-code-formatting:
    # Runs the nox "format" session on the default Python version.
    # The display name is kept distinct from the lint job ("Check coding
    # style") so the two checks can be told apart in the Actions UI.
    name: Check code formatting
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
      - name: Set up Python ${{ env.default-python }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.default-python }}
      - name: Upgrade pip, Install nox
        run: |
          python -m pip install --upgrade pip
          python -m pip install nox
      - name: Check code formatting
        run: |
          nox --error-on-missing-interpreters --non-interactive --session format
  check-coding-style:
    # Lint job: runs the nox "lint" session on the default Python version.
    name: Check coding style
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository contents.
      - name: Checkout code
        uses: actions/checkout@v2
      # Install the interpreter selected by the workflow-level env setting.
      - name: Set up Python ${{ env.default-python }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.default-python }}
      # nox is the only tool installed directly; it runs the session below.
      - name: Upgrade pip, Install nox
        run: |
          python -m pip install --upgrade pip
          python -m pip install nox
      # Fail (rather than skip) if the requested interpreter is missing.
      - name: Check coding style
        run: nox --error-on-missing-interpreters --non-interactive --session lint
  check-static-types:
    # Type-check job: runs the nox "types" session on the default Python version.
    name: Check static types
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository contents.
      - name: Checkout code
        uses: actions/checkout@v2
      # Install the interpreter selected by the workflow-level env setting.
      - name: Set up Python ${{ env.default-python }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.default-python }}
      # nox is the only tool installed directly; it runs the session below.
      - name: Upgrade pip, Install nox
        run: |
          python -m pip install --upgrade pip
          python -m pip install nox
      # Fail (rather than skip) if the requested interpreter is missing.
      - name: Check static types
        run: nox --error-on-missing-interpreters --non-interactive --session types
  tests:
    # Runs the nox test session once for every Python version in the matrix.
    name: Run tests
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ ubuntu-latest ]
        # Versions are quoted so that "3.10" is not parsed as the float 3.1.
        python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10" ]
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Determine pip cache directory
        id: pip-cache
        # Step outputs are written to the file named by $GITHUB_OUTPUT; the
        # older `::set-output` workflow command is deprecated by GitHub.
        run: |
          echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"
      - name: Cache pip cache
        # actions/cache v1 and v2 have been retired by GitHub; v4 accepts the
        # same `path` and `key` inputs.
        uses: actions/cache@v4
        with:
          # Directory reported by the pip-cache step above.
          path: ${{ steps.pip-cache.outputs.dir }}
          # One cache entry per runner OS and Python version.
          key: ${{ runner.os }}-pip${{ matrix.python-version }}
      - name: Upgrade pip and install nox
        run: |
          python -m pip install --upgrade pip
          python -m pip install nox
      - name: Run tests
        # Selects the nox session named after the matrix entry, e.g. tests-3.8.
        run: |
          nox --non-interactive --session tests-${{ matrix.python-version }}
  build-docs:
    # Docs job: runs the nox "docs" session on the default Python version.
    name: Test building docs
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository contents.
      - name: Checkout code
        uses: actions/checkout@v2
      # Install the interpreter selected by the workflow-level env setting.
      - name: Set up Python ${{ env.default-python }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ env.default-python }}
      # nox is the only tool installed directly; it runs the session below.
      - name: Upgrade pip and install nox
        run: |
          python -m pip install --upgrade pip
          python -m pip install nox
      # Fail (rather than skip) if the requested interpreter is missing.
      - name: Build docs
        run: nox --error-on-missing-interpreters --non-interactive --session docs
  # Builds the sdist and wheel on every run; the upload to PyPI and the GitHub
  # release are only performed when the workflow was triggered by a tag ref.
  publish:
    name: Publish to PyPi
    runs-on: ubuntu-latest
    # Only start once every check, test and docs job has succeeded.
    needs:
      - check-code-formatting
      - check-coding-style
      - check-static-types
      - tests
      - build-docs
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
      - name: Install dependencies
        run: python -m pip install wheel
      # For a tag ref the version is the last path segment of the ref with any
      # leading "v" removed; otherwise it is a YYYYMMDD.HHMMSS timestamp. The
      # result replaces the __VERSION__ placeholder in pdfminer/__init__.py.
      - name: Set version
        run: |
          if [[ "${{ github.ref }}" == "refs/tags/"* ]]
          then
            VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,' | sed -e 's/^v//')
          else
            VERSION=$(date +%Y%m%d).$(date +%H%M%S)
          fi
          echo ${VERSION}
          sed -i "s/__VERSION__/${VERSION}/g" pdfminer/__init__.py
      - name: Build package
        run: python setup.py sdist bdist_wheel
      # Extracts the body of the first "## [" section of CHANGELOG.md: sed drops
      # everything up to and including the first "## [" heading, then quits
      # without printing when it reaches the next one. The output path is the
      # workspace path with "-CHANGELOG.md" appended, i.e. next to the
      # workspace directory rather than inside it.
      - name: Generate changelog
        run: sed '1,/## \[/d;/## \[/Q' CHANGELOG.md > ${{ github.workspace }}-CHANGELOG.md
      - name: Publish package to PyPi
        # Skipped for branch pushes and pull requests.
        if: startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
      - name: Create GitHub release
        # Skipped for branch pushes and pull requests.
        if: startsWith(github.ref, 'refs/tags')
        uses: softprops/action-gh-release@v1
        id: create_release
        with:
          token: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
          # Release notes come from the file written by "Generate changelog".
          body_path: ${{ github.workspace }}-CHANGELOG.md
          # Attach the artifacts produced by "Build package".
          files: |
            dist/*.tar.gz
            dist/*.whl
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,28 @@
 *.class
 *.pyc
 *.pyo
 .svn
 .env
 _svn
 .pythoscope
 .ipynb_checkpoints
 .settings
 _update.bat
 docs/_build
 /Goulib.egg-info/
 /build/
 /dist/
 /pdfminer.six.egg-info/
 tests/*.xml
 tests/*.txt
 .idea/
 .tox/
 .nox/
 # python venv management tools
 Pipfile
 Pipfile.lock
 .noseids
 .vscode/
 pyproject.toml
 poetry.lock
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,256 @@
 # Changelog
 All notable changes in pdfminer.six will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ## [Unreleased]
 ### Added
 - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
 - Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790))
 - Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829))
 - Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801))
 ### Fixed
 - `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
 - `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766))
 - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760))
 - `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720))
 - Installing typing-extensions on Python 3.6 and 3.7 ([#775](https://github.com/pdfminer/pdfminer.six/pull/775))
 - `TypeError` in cmapdb.py when parsing null characters ([#768](https://github.com/pdfminer/pdfminer.six/pull/768))
 - Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794))
 - `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827))
 - Small typos and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828))
 ### Deprecated
 - Usage of `if __name__ == "__main__"` where it was only intended for testing purposes ([#756](https://github.com/pdfminer/pdfminer.six/pull/756))
 ## [20220524]
 ### Fixed
 - Ignoring (invalid) path constructors that do not begin with `m` ([#749](https://github.com/pdfminer/pdfminer.six/pull/749))
 ### Changed
 - Removed upper version bounds ([#755](https://github.com/pdfminer/pdfminer.six/pull/755))
 ## [20220506]
 ### Fixed
 - `IndexError` when handling invalid bfrange code map in
  CMap ([#731](https://github.com/pdfminer/pdfminer.six/pull/731))
 - `TypeError` in lzw.py when `self.table` is not set ([#732](https://github.com/pdfminer/pdfminer.six/pull/732))
 - `TypeError` in encodingdb.py when name of unicode is not
  str ([#733](https://github.com/pdfminer/pdfminer.six/pull/733))
 - `TypeError` in HTMLConverter when using a bytes fontname ([#734](https://github.com/pdfminer/pdfminer.six/pull/734))
 ### Added
 - Exporting images without any specific encoding ([#737](https://github.com/pdfminer/pdfminer.six/pull/737))
 ### Changed
 - Using charset-normalizer instead of chardet for less restrictive license ([#744](https://github.com/pdfminer/pdfminer.six/pull/744))
 ## [20220319]
 ### Added
 - Export type annotations from pypi package per PEP561 ([#679](https://github.com/pdfminer/pdfminer.six/pull/679))
 - Support for identity cmap's ([#626](https://github.com/pdfminer/pdfminer.six/pull/626))
 - Add support for PDF page labels ([#680](https://github.com/pdfminer/pdfminer.six/pull/680))
 - Installation of Pillow as an optional extra dependency ([#714](https://github.com/pdfminer/pdfminer.six/pull/714))
 ### Fixed
 - Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
 - Regression (since 20191107) in `LTLayoutContainer.group_textboxes` that returned some text lines out of order ([#659](https://github.com/pdfminer/pdfminer.six/pull/659))
 - Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
 - Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
 - Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))
 - Only use xref fallback if `PDFNoValidXRef` is raised and `fallback` is True ([#684](https://github.com/pdfminer/pdfminer.six/pull/684))
 - Ignore empty characters when analyzing layout ([#499](https://github.com/pdfminer/pdfminer.six/pull/499))
 ### Changed
 - Replace warnings.warn with logging.Logger.warning in line with [recommended use](https://docs.python.org/3/howto/logging.html#when-to-use-logging) ([#673](https://github.com/pdfminer/pdfminer.six/pull/673))
 - Switched from nose to pytest, from tox to nox and from Travis CI to GitHub Actions ([#704](https://github.com/pdfminer/pdfminer.six/pull/704))
 ### Removed
 - Unnecessary return statements without argument at the end of functions ([#707](https://github.com/pdfminer/pdfminer.six/pull/707))
 ## [20211012]
 ### Added
 - Add support for PDF 2.0 (ISO 32000-2) AES-256 encryption ([#614](https://github.com/pdfminer/pdfminer.six/pull/614))
 - Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537))
 - Type annotations ([#661](https://github.com/pdfminer/pdfminer.six/pull/661))
 ### Fixed
 - `KeyError` when `'Encrypt'` but not `'ID'` present in `trailer` ([#594](https://github.com/pdfminer/pdfminer.six/pull/594))
 - Fix issue of ValueError and KeyError raised in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
 - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
 - Fix `PermissionError` when creating temporary filepaths on windows when running tests ([#484](https://github.com/pdfminer/pdfminer.six/pull/484))
 - Fix `AttributeError` when dumping a TOC with bytes destinations ([#600](https://github.com/pdfminer/pdfminer.six/pull/600))
 - Fix issue of some Chinese characters can not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
 - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
 - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
 - Raising `UnboundLocalError` when a bad `--output-type`  is used ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
 - `TypeError` when using `TagExtractor` with non-string or non-bytes tag values ([#610](https://github.com/pdfminer/pdfminer.six/pull/610))
 - Using `io.TextIOBase` as the file to write to ([#616](https://github.com/pdfminer/pdfminer.six/pull/616))
 - Parsing \r\n after the escape character in a literal string ([#616](https://github.com/pdfminer/pdfminer.six/pull/616))
 ### Removed
 - Support for Python 3.4 and 3.5 ([#522](https://github.com/pdfminer/pdfminer.six/pull/522))
 - Unused dependency on `sortedcontainers` package ([#525](https://github.com/pdfminer/pdfminer.six/pull/525))
 - Support for non-standard output streams that are not binary ([#523](https://github.com/pdfminer/pdfminer.six/pull/523))
 - Dependency on typing-extensions introduced by [#661](https://github.com/pdfminer/pdfminer.six/pull/661) ([#677](https://github.com/pdfminer/pdfminer.six/pull/677))
 ## [20201018]
 ### Deprecated
 - Support for Python 3.4 and 3.5 ([#507](https://github.com/pdfminer/pdfminer.six/pull/507))
 ### Added
 - Option to disable boxes flow layout analysis when using pdf2txt ([#479](https://github.com/pdfminer/pdfminer.six/pull/479))
 - Support for `pathlib.PurePath` in `open_filename` ([#492](https://github.com/pdfminer/pdfminer.six/pull/492))
 ### Fixed
 - Pass caching parameter to PDFResourceManager in `high_level` functions ([#475](https://github.com/pdfminer/pdfminer.six/pull/475))
 - Fix `.paint_path` logic for handling non-rect quadrilaterals and decomposing complex paths ([#512](https://github.com/pdfminer/pdfminer.six/pull/512))
 - Fix out-of-bound access on some PDFs ([#483](https://github.com/pdfminer/pdfminer.six/pull/483))
 ### Removed
 - Remove unused rijndael encryption implementation ([#465](https://github.com/pdfminer/pdfminer.six/pull/465))
 ## [20200726]
 ### Fixed
 - Rename PDFTextExtractionNotAllowedError to PDFTextExtractionNotAllowed to revert breaking change ([#461](https://github.com/pdfminer/pdfminer.six/pull/461))
 - Always try to get CMap, not only for identity encodings ([#438](https://github.com/pdfminer/pdfminer.six/pull/438))
 ## [20200720]
 ### Added
 - Support for painting multiple rectangles at once ([#371](https://github.com/pdfminer/pdfminer.six/pull/371))
 ### Fixed
 - Validate image object in do_EI is a PDFStream ([#451](https://github.com/pdfminer/pdfminer.six/pull/451))
 ### Changed
 - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
 - Raise a warning instead of an error when extracting text from a non-extractable PDF ([#453](https://github.com/pdfminer/pdfminer.six/pull/453))
 - Switched from pycryptodome to cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))
 ## [20200517]
 ### Added
 - Python3 shebang line to script in tools ([#408](https://github.com/pdfminer/pdfminer.six/pull/408))
 ### Fixed
 - Fix ordering of textlines within a textbox when `boxes_flow=None` ([#412](https://github.com/pdfminer/pdfminer.six/pull/412))
 ## [20200402]
 ### Added
 - Allow boxes_flow LAParam to be passed as None, validate the input, and update documentation ([#396](https://github.com/pdfminer/pdfminer.six/pull/396))
 - Also accept file-like objects in high level functions `extract_text` and `extract_pages` ([#393](https://github.com/pdfminer/pdfminer.six/pull/393))
 ### Fixed
 - Text no longer comes in reverse order when advanced layout analysis is disabled ([#399](https://github.com/pdfminer/pdfminer.six/pull/399))
 - Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407))
 - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389))
 - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386))
 ### Changed
 - Group text lines if they are centered ([#384](https://github.com/pdfminer/pdfminer.six/pull/384))
 ## [20200124]
 ### Security
 - Removed samples/issue-00152-embedded-pdf.pdf because it contains a possible security threat; a javascript enabled object ([#364](https://github.com/pdfminer/pdfminer.six/pull/364))
 ## [20200121]
 ### Fixed
 - Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
 - Fix font name in html output such that it is recognized by browser ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
 - Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
 - KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))
 ### Removed
 - The command-line utility latin2ascii.py ([#360](https://github.com/pdfminer/pdfminer.six/pull/360))
 ## [20200104]
 ### Removed
 - Support for Python 2 ([#346](https://github.com/pdfminer/pdfminer.six/pull/346))
 ### Changed
 - Enforce pep8 coding style by adding flake8 to CI ([#345](https://github.com/pdfminer/pdfminer.six/pull/345))
 ## [20191110]
 ### Fixed
 - Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335))
 ## [20191107]
 ### Deprecated
 - The argument `_py2_no_more_posargs` because Python2 is removed on January 1,
 2020 ([#328](https://github.com/pdfminer/pdfminer.six/pull/328) and
 [#307](https://github.com/pdfminer/pdfminer.six/pull/307))
 ### Added
 - Simple wrapper to easily extract text from a PDF file [#330](https://github.com/pdfminer/pdfminer.six/pull/330)
 - Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
 - Sphinx documentation that is published on 
  [Read the Docs](https://pdfminersix.readthedocs.io/)
  ([#329](https://github.com/pdfminer/pdfminer.six/pull/329))
 ### Fixed
 - Unhandled AssertionError when dumping pdf containing reference to object id 0 
 ([#318](https://github.com/pdfminer/pdfminer.six/pull/318))
 - Debug flag actually changes logging level to debug for pdf2txt.py and
 dumppdf.py ([#325](https://github.com/pdfminer/pdfminer.six/pull/325))
 ### Changed
 - Using argparse instead of getopt for command line interface of dumppdf.py ([#321](https://github.com/pdfminer/pdfminer.six/pull/321))
 - Refactor `LTLayoutContainer.group_textboxes` for a significant speed up in layout analysis ([#315](https://github.com/pdfminer/pdfminer.six/pull/315))
 ### Removed
 - Files for external applications such as django, cgi and pyinstaller ([#320](https://github.com/pdfminer/pdfminer.six/pull/320))
 ## [20191020]
 ### Deprecated
 - Support for Python 2 is dropped at January 1st, 2020 ([#307](https://github.com/pdfminer/pdfminer.six/pull/307))
 ### Added
 - Contribution guidelines in [CONTRIBUTING.md](CONTRIBUTING.md) ([#259](https://github.com/pdfminer/pdfminer.six/pull/259))
 - Support new encodings OneByteEncoding and DLIdent for CMaps ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
 ### Fixed
 - Use `six.iteritems()` instead of `dict().iteritems()` to ensure Python2 and Python3 compatibility ([#274](https://github.com/pdfminer/pdfminer.six/pull/274))
 - Properly convert Adobe Glyph names to unicode characters ([#263](https://github.com/pdfminer/pdfminer.six/pull/263))
 - Allow CMap to be a content stream ([#283](https://github.com/pdfminer/pdfminer.six/pull/283))
 - Resolve indirect objects for width and bounding boxes for fonts ([#273](https://github.com/pdfminer/pdfminer.six/pull/273))
 - Actually updating stroke color in graphic state ([#298](https://github.com/pdfminer/pdfminer.six/pull/298))
 - Interpret (invalid) negative font descent as a positive descent ([#203](https://github.com/pdfminer/pdfminer.six/pull/203))
 - Correct colorspace comparison for images ([#132](https://github.com/pdfminer/pdfminer.six/pull/132))
 - Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))
 ### Changed
 - All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
 ## [20181108]
 ### Changed
 - Speedup layout analysis ([#141](https://github.com/pdfminer/pdfminer.six/pull/141))
 - Use argparse instead of the deprecated getopt ([#173](https://github.com/pdfminer/pdfminer.six/pull/173))
 - Allow pdfminer.six to be compiled with cython ([#142](https://github.com/pdfminer/pdfminer.six/pull/142))
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,75 @@
 # Contributing guidelines
 Any contribution is appreciated! You might want to:
 * Fix spelling errors
 * Improve documentation
 * Add tests for untested code
 * Add new features
 * Fix bugs
 ## How can I contribute?
 * Use [issues](https://github.com/pdfminer/pdfminer.six/issues) to report bugs and features
    - If you report a bug in the results for a particular pdf, include that pdf. This allows others to replicate the
     issue. 
 * Fix issues by [creating pull requests](https://help.github.com/en/articles/creating-a-pull-request).
 * Help others by sharing your thoughts in comments on issues and pull requests.
 * Join the chat on [gitter](https://gitter.im/pdfminer-six/Lobby)
 ## Guidelines for creating issues
 * Search previous issues, as yours might be a duplicate.
 * When creating a new issue for a bug, include a minimal reproducible example.
 * When creating a new issue for a feature, be sure to describe the context of the problem you are trying to solve. This
  will help others to see the importance of your feature request. 
 ## Guidelines for creating pull requests
 * A pull request should close an existing issue. For example, use "Fix #123" to indicate that your PR fixes issue 123. 
 * Pull requests should be merged to master.
 * Include unit tests when possible. In case of bugs, this will help to prevent the same mistake in the future. In case
  of features, this will show that your code works correctly.
 * Code should work for Python 3.6+.
 * Test your code by using nox (see below). 
 * New features should be well documented using docstrings.
 * Check if the [README.md](README.md) or [readthedocs](docs/source) documentation needs to be updated.
 * Check spelling and grammar.
 * Don't forget to update the [CHANGELOG.md](CHANGELOG.md#unreleased).
 ## Guidelines for posting comments
 * [Be cordial and positive](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way)
 ## Guidelines for publishing
 * Publishing is automated. Add a YYYYMMDD version tag and GitHub workflows will do the rest. 
 ## Getting started
 1. Clone the repository
    ```sh
    git clone https://github.com/pdfminer/pdfminer.six
    cd pdfminer.six
    ```
 2. Install dev dependencies
    ```sh
    pip install -e .[dev]
    ```
 3. Run the tests
    On all Python versions:
    ```sh
    nox
    ```
    Or on a single Python version:
    ```sh
    nox -e tests-3.6
    ```
--- a/LICENSE
+++ b/LICENSE
@ -0,0 +1,22 @@
 Copyright (c) 2004-2016  Yusuke Shinyama <yusuke at shinyama dot jp>
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
 files (the "Software"), to deal in the Software without
 restriction, including without limitation the rights to use,
 copy, modify, merge, publish, distribute, sublicense, and/or
 sell copies of the Software, and to permit persons to whom the
 Software is furnished to do so, subject to the following
 conditions:
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
 KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
 COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,11 @@
 include Makefile
 include LICENSE
 include *.txt
 include *.md
 include *.py
 graft cmaprsrc
 graft pdfminer
 graft tools
 global-exclude *.pyc
 prune samples
 prune docs
--- a/Makefile
+++ b/Makefile
@ -0,0 +1,29 @@
 ##  Makefile (for maintenance purposes)
 ##
 ##  Targets:
 ##    cmap        run tools/conv_cmap.py once per Adobe character collection
 ##                (CNS1, GB1, Japan1, Korea1), writing into $(CMAPDST)
 ##    cmap_clean  remove $(CMAPDST) and everything in it
 ##
 PYTHON=python
 RM=rm -f
 CP=cp -f
 MKDIR=mkdir
 CONV_CMAP=$(PYTHON) tools/conv_cmap.py
 # Directory holding the cid2code_*.txt inputs.
 CMAPSRC=cmaprsrc
 # Directory receiving the generated files.
 CMAPDST=pdfminer/cmap
 # Neither target creates a file of its own name, so declare them phony;
 # otherwise a stray file called "cmap" or "cmap_clean" would disable them.
 .PHONY: cmap cmap_clean
 cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
 	$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
 cmap_clean:
 	-$(RM) -r $(CMAPDST)
 # Create the output directory before any conversion runs.
 $(CMAPDST):
 	$(MKDIR) $(CMAPDST)
 # Each rule below passes -c NAME=codec pairs, the output directory, the
 # character collection name and the matching cid2code file to conv_cmap.py.
 $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
 	$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
 $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
 	$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
 		$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
 $(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
 	$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
 $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
 	$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
 		$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
--- a/README.md
+++ b/README.md
@ -0,0 +1,72 @@
 pdfminer.six
 ============
 [![Continuous integration](https://github.com/pdfminer/pdfminer.six/actions/workflows/actions.yml/badge.svg)](https://github.com/pdfminer/pdfminer.six/actions/workflows/actions.yml)
 [![PyPI version](https://img.shields.io/pypi/v/pdfminer.six.svg)](https://pypi.python.org/pypi/pdfminer.six/)
 [![gitter](https://badges.gitter.im/pdfminer-six/Lobby.svg)](https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium)
 *We fathom PDF*
 Pdfminer.six is a community maintained fork of the original PDFMiner. It is a tool for extracting information from PDF
 documents. It focuses on getting and analyzing text data. Pdfminer.six extracts the text from a page directly from the
 source code of the PDF. It can also be used to get the exact location, font or color of the text.
 It is built in a modular way such that each component of pdfminer.six can be replaced easily. You can implement your own
 interpreter or rendering device that uses the power of pdfminer.six for other purposes than text analysis.
 Check out the full documentation on
 [Read the Docs](https://pdfminersix.readthedocs.io).
 Features
 --------
 * Written entirely in Python.
 * Parse, analyze, and convert PDF documents.
 * Extract content as text, images, html or [hOCR](https://en.wikipedia.org/wiki/HOCR).
 * PDF-1.7 specification support. (well, almost).
 * CJK languages and vertical writing scripts support.
 * Various font types (Type1, TrueType, Type3, and CID) support.
 * Support for extracting images (JPG, JBIG2, Bitmaps).
 * Support for various compressions (ASCIIHexDecode, ASCII85Decode, LZWDecode, FlateDecode, RunLengthDecode,
  CCITTFaxDecode)
 * Support for RC4 and AES encryption.
 * Support for AcroForm interactive form extraction.
 * Table of contents extraction.
 * Tagged contents extraction.
 * Automatic layout analysis.
 How to use
 ----------
 * Install Python 3.6 or newer.
 * Install pdfminer.six.
  `pip install pdfminer.six`
 * (Optionally) install extra dependencies for extracting images.
  `pip install 'pdfminer.six[image]'`
 * Use the command-line interface to extract text from pdf.
  `pdf2txt.py example.pdf`
 * Or use it with Python. 
 ```python
 from pdfminer.high_level import extract_text
 text = extract_text("example.pdf")
 print(text)
 ```
 Contributing
 ------------
 Be sure to read the [contribution guidelines](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). 
 Acknowledgement
 ---------------
 This repository includes code from `pyHanko`; the original license has been included [here](/docs/licenses/LICENSE.pyHanko).
--- a/cmaprsrc/README.txt
+++ b/cmaprsrc/README.txt
@ -0,0 +1,60 @@
 README.txt for cmaprsrc
 This directory contains Adobe CMap resources. CMaps are required 
 to decode text data written in CJK (Chinese, Japanese, Korean) language.
 CMap resources are now available freely from Adobe web site:
 http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
 The following files were extracted from the downloadable tarballs:
 cid2code_Adobe_CNS1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_cns1-6.tar.z
 cid2code_Adobe_GB1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_gb1-5.tar.z
 cid2code_Adobe_Japan1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_japan1-6.tar.z
 cid2code_Adobe_Korea1.txt:
 	http://download.macromedia.com/pub/opensource/cmap/cmapresources_korean1-2.tar.z
 Here is the license information in the original files:
 %%Copyright: -----------------------------------------------------------
 %%Copyright: Copyright 1990-20xx Adobe Systems Incorporated.
 %%Copyright: All rights reserved.
 %%Copyright:
 %%Copyright: Redistribution and use in source and binary forms, with or
 %%Copyright: without modification, are permitted provided that the
 %%Copyright: following conditions are met:
 %%Copyright:
 %%Copyright: Redistributions of source code must retain the above
 %%Copyright: copyright notice, this list of conditions and the following
 %%Copyright: disclaimer.
 %%Copyright:
 %%Copyright: Redistributions in binary form must reproduce the above
 %%Copyright: copyright notice, this list of conditions and the following
 %%Copyright: disclaimer in the documentation and/or other materials
 %%Copyright: provided with the distribution.
 %%Copyright:
 %%Copyright: Neither the name of Adobe Systems Incorporated nor the names
 %%Copyright: of its contributors may be used to endorse or promote
 %%Copyright: products derived from this software without specific prior
 %%Copyright: written permission.
 %%Copyright:
 %%Copyright: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 %%Copyright: CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 %%Copyright: INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 %%Copyright: MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 %%Copyright: DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
 %%Copyright: CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 %%Copyright: SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 %%Copyright: NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 %%Copyright: LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 %%Copyright: HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 %%Copyright: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 %%Copyright: OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 %%Copyright: SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 %%Copyright: -----------------------------------------------------------
--- a/cmaprsrc/cid2code_Adobe_CNS1.txt
+++ b/cmaprsrc/cid2code_Adobe_CNS1.txt
--- a/cmaprsrc/cid2code_Adobe_GB1.txt
+++ b/cmaprsrc/cid2code_Adobe_GB1.txt
--- a/cmaprsrc/cid2code_Adobe_Japan1.txt
+++ b/cmaprsrc/cid2code_Adobe_Japan1.txt
--- a/cmaprsrc/cid2code_Adobe_Korea1.txt
+++ b/cmaprsrc/cid2code_Adobe_Korea1.txt
--- a/docs/.gitignore
+++ b/docs/.gitignore
@ -0,0 +1 @@
 build/
--- a/docs/Makefile
+++ b/docs/Makefile
@ -0,0 +1,20 @@
 # Minimal makefile for Sphinx documentation
 #
 # This file is a thin wrapper: every target is forwarded to sphinx-build's
 # "make mode" (-M), so it defines no build logic of its own.
 #
 # You can set these variables from the command line, and also
 # from the environment for the first two.
 SPHINXOPTS    ?=
 SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = source
 BUILDDIR      = build
 # Put it first so that "make" without argument is like "make help".
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 # "help" is not a file to build. "Makefile" is listed as well because the
 # catch-all rule below names it as a prerequisite.
 .PHONY: help Makefile
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 # $@ expands to the requested target name (e.g. "html"), which is passed to
 # sphinx-build as the builder to run.
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/licenses/LICENSE.pyHanko
+++ b/docs/licenses/LICENSE.pyHanko
@ -0,0 +1,23 @@
 This package contains various elements based on code from the pyHanko project, of which we reproduce the license below.
 MIT License
 Copyright (c) 2020 Matthias Valvekens 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/docs/make.bat
+++ b/docs/make.bat
@ -0,0 +1,35 @@
@ECHO OFF
 REM Work from the directory that contains this script; restored by popd at :end.
 pushd %~dp0
 REM Command file for Sphinx documentation
 REM Use sphinx-build from PATH unless the caller already set SPHINXBUILD.
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
 set SOURCEDIR=source
 set BUILDDIR=build
 REM Without a target argument, show the Sphinx help text.
 if "%1" == "" goto help
 REM Probe the Sphinx executable with all output discarded; an errorlevel of
 REM 9009 or higher is treated as "command not found" and reported below.
 %SPHINXBUILD% >NUL 2>NUL
 if errorlevel 9009 (
 	echo.
 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 	echo.installed, then set the SPHINXBUILD environment variable to point
 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 	echo.may add the Sphinx directory to PATH.
 	echo.
 	echo.If you don't have Sphinx installed, grab it from
 	echo.http://sphinx-doc.org/
 	exit /b 1
 )
 REM Forward the requested target (%1) to Sphinx "make mode".
 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 goto end
 :help
 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 :end
 popd
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -0,0 +1 @@
 sphinx-argparse
--- a/docs/source/_static/layout_analysis.html
+++ b/docs/source/_static/layout_analysis.html
@ -0,0 +1,28 @@
 <style>
    td {
        text-align: center;
    }
 </style>
 <table style="margin: 10px; padding: 10px;">
    <tr>
        <td style="text-align: right; border-right:1px red solid">&rarr;</td>
        <td colspan="4"
            style="text-align: left; border-left:1px red solid">&larr; <em><font
                color="red">M</font></em></td>
    </tr>
    <tr>
        <td style="border:1px solid"><code>Q u i</code></td>
        <td style="border:1px solid"><code>c k</code></td>
        <td width="10px"></td>
        <td style="border:1px solid"><code>b r o w n</code></td>
    </tr>
    <tr>
        <td colspan="2" style="text-align: right; border-right:1px green solid">
            &rarr;
        </td>
        <td></td>
        <td colspan="2"
            style="text-align: left; border-left:1px green solid">&larr;
            <em><font color="green">W</font></em></td>
    </tr>
 </table>
--- a/docs/source/_static/layout_analysis_group_boxes.html
+++ b/docs/source/_static/layout_analysis_group_boxes.html
@ -0,0 +1,23 @@
 <style>
    .background-blue {
        background-color: lightblue;
        border: 2px solid lightblue;
    }
 </style>
 <table style="margin: 10px; padding: 10px;">
    <tr>
        <td style="border:1px solid; text-align: left">
            <code>
                Q u i c k &nbsp; b r o w n<br/> f o x
            </code>
        </td>
        <td class="background-blue" colspan="3"></td>
    </tr>
    <tr style="height: 10px;">
        <td class="background-blue" colspan="4"></td>
    </tr>
    <tr>
        <td class="background-blue" colspan="3"></td>
        <td style="border:1px solid"><code>j u m p s ...</code></td>
    </tr>
 </table>
--- a/docs/source/_static/layout_analysis_group_lines.html
+++ b/docs/source/_static/layout_analysis_group_lines.html
@ -0,0 +1,45 @@
 <style>
    td {
        text-align: center;
    }
 </style>
 <table style="margin: 10px; padding: 10px;">
    <tr>
        <td></td>
        <td></td>
        <td align=right style="border-bottom:1px blue solid">&darr;</td>
        <td></td>
    </tr>
    <tr>
        <td colspan="2" style="border:1px solid"><code>Q u i c k &nbsp; b r o w
            n</code></td>
        <td></td>
        <td align=right style="border-bottom:1px blue solid">&darr;</td>
    </tr>
    <tr>
        <td></td>
        <td></td>
        <td align=center valign=center><em><font color="blue">
            L<sub>1</sub>
        </font></em></td>
        <td></td>
    </tr>
    <tr>
        <td style="border:1px solid;">
            <code>f o x</code>
        </td>
        <td>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
            &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
        </td>
        <td align=right style="border-top:1px blue solid">&uarr;</td>
        <td align=center valign=center><em><font color="blue">
            L<sub>2</sub>
        </font></em></td>
    </tr>
    <tr>
        <td></td>
        <td></td>
        <td></td>
        <td align=right style="border-top:1px blue solid">&uarr;</td>
    </tr>
 </table>
--- a/docs/source/_static/layout_analysis_output.png
+++ b/docs/source/_static/layout_analysis_output.png
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -0,0 +1,64 @@
 # Configuration file for the Sphinx documentation builder.
 #
 # This file only contains a selection of the most common options. For a full
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 # -- Path setup --------------------------------------------------------------
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
 import sys
 from typing import List
 # The repository root (two levels above this file) must be on sys.path *before*
 # pdfminer is imported. Inserting it at index 0 makes the in-repo package
 # importable even when pdfminer.six is not installed, and gives it precedence
 # over an installed copy so that `release` below matches the documented code.
 sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../"))
 import pdfminer  # noqa: E402  (deliberately imported after the sys.path setup)
 # -- Project information -----------------------------------------------------
 project = "pdfminer.six"
 copyright = "2019, Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
 author = "Yusuke Shinyama, Philippe Guglielmetti & Pieter Marsman"
 # The full version, including alpha/beta/rc tags
 release = pdfminer.__version__
 # -- General configuration ---------------------------------------------------
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
    "sphinxarg.ext",
    "sphinx.ext.autodoc",
    "sphinx.ext.doctest",
 ]
 # Root rst file
 master_doc = "index"
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
 exclude_patterns: List[str] = []
 # -- Options for HTML output -------------------------------------------------
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
 html_theme = "alabaster"
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ["_static"]
--- a/docs/source/faq.rst
+++ b/docs/source/faq.rst
@ -0,0 +1,68 @@
 .. _faq:
 Frequently asked questions
 **************************
 Why is it called pdfminer.six?
 ==============================
 Pdfminer.six is a fork of the `original pdfminer created by Euske
 <https://github.com/euske>`_. Almost all of the code and architecture are in
 fact created by Euske. But, for a long time, this original pdfminer did not
 support Python 3. Until 2020 the original pdfminer only supported Python 2.
 The original goal of pdfminer.six was to add support for Python 3. This was
 done with the `six` package. The `six` package helps to write code that is
 compatible with both Python 2 and Python 3. Hence, pdfminer.six.
 As of 2020, pdfminer.six dropped the support for Python 2 because it was
 `end-of-life <https://www.python.org/doc/sunset-python-2/>`_. While the .six
 part is no longer applicable, we kept the name to prevent breaking changes for
 existing users.
 The current punchline "We fathom PDF" is a `whimsical reference
 <https://github.com/pdfminer/pdfminer.six/issues/197#issuecomment-655091942>`_
 to the six. Fathom means both deeply understanding something, and a fathom is
 also equal to six feet.
 How does pdfminer.six compare to other forks of pdfminer?
 ==========================================================
 Pdfminer.six is now an independent and community-maintained package for
 extracting text from PDFs with Python. We actively fix bugs (also for PDFs
 that don't strictly follow the PDF Reference), add new features and improve
 the usability of pdfminer.six. This community separates pdfminer.six from the
 other forks of the original pdfminer. PDF as a format is very diverse and
 there are countless deviations from the official format. The only way to
 support all the PDFs out there is to have a community that actively uses and
 improves pdfminer.
 Since 2020, the original pdfminer is `dormant
 <https://github.com/euske/pdfminer#pdfminer>`_, and pdfminer.six is the fork
 which Euske recommends if you need an actively maintained version of pdfminer.
 Why are there `(cid:x)` values in the textual output?
 =====================================================
 One of the most common issues with pdfminer.six is that the textual output
 contains raw character ids `(cid:x)`. This is often experienced as confusing
 because the text is shown fine in a PDF viewer and other text from the same
 PDF is extracted properly.
 The underlying problem is that a PDF has two different representations
 of each character. Each character is mapped to a glyph that determines
 how the character is shown in a PDF viewer. And each character is also
 mapped to its unicode value that is used when copy-pasting the character.
 Some PDFs have incomplete unicode mappings and therefore it is impossible
 to convert the character to unicode. In these cases pdfminer.six defaults
 to showing the raw character id `(cid:x)`.
 A quick test to see if pdfminer.six should be able to do better is to
 copy-paste the text from a PDF viewer to a text editor. If the result
 is proper text, pdfminer.six should also be able to extract proper text.
 If the result is gibberish, pdfminer.six will also not be able to convert
 the characters to unicode.
 References: 
 #. `Chapter 5: Text, PDF Reference 1.7 <https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/index.html#pdf-reference>`_
 #. `Text: PDF, Wikipedia <https://en.wikipedia.org/wiki/PDF#Text>`_
--- a/docs/source/howto/acro_forms.rst
+++ b/docs/source/howto/acro_forms.rst
@ -0,0 +1,148 @@
 .. _acro_forms:
 How to extract AcroForm interactive form fields from a PDF using PDFMiner
 *************************************************************************
 Before you start, make sure you have :ref:`installed pdfminer.six<install>`.
 The second thing you need is a PDF with AcroForms (as found in PDF files with fillable forms or multiple choices). There are some examples of these in the GitHub repository under `samples/acroform`.
 Only AcroForm interactive forms are supported, XFA forms are not supported.
 .. code-block:: python
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdftypes import resolve1
    from pdfminer.psparser import PSLiteral, PSKeyword
    from pdfminer.utils import decode_text    
    data = {}
    def decode_value(value):
        # decode PSLiteral, PSKeyword
        if isinstance(value, (PSLiteral, PSKeyword)):
            value = value.name
        # decode bytes
        if isinstance(value, bytes):
            value = decode_text(value)
        return value
    with open(file_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        res = resolve1(doc.catalog)
        if 'AcroForm' not in res:
            raise ValueError("No AcroForm Found")
        fields = resolve1(doc.catalog['AcroForm'])['Fields']  # may need further resolving
        for f in fields:
            field = resolve1(f)
            name, values = field.get('T'), field.get('V')
            # decode name
            name = decode_text(name)
            # resolve indirect obj
            values = resolve1(values)
            # decode value(s)
            if isinstance(values, list):
                values = [decode_value(v) for v in values]
            else:
                values = decode_value(values)
            data.update({name: values})    
            print(name, values)
 This code snippet will print all the fields' names and values and save them in the "data" dictionary.
 How it works:
 - Initialize the parser and the PDFDocument objects
 .. code-block:: python
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
 - Get the Catalog
  (the catalog contains references to other objects defining the document structure, see section 7.7.2 of PDF 32000-1:2008 specs: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/index.html#pdf-reference)
 .. code-block:: python
    res = resolve1(doc.catalog)
 - Check if the catalog contains the AcroForm key and raise ValueError if not 
  (the PDF does not contain AcroForm type of interactive forms if this key is missing in the catalog, see section 12.7.2 of PDF 32000-1:2008 specs)
 .. code-block:: python
    if 'AcroForm' not in res:
        raise ValueError("No AcroForm Found")
 - Get the field list resolving the entry in the catalog
 .. code-block:: python
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for f in fields:
        field = resolve1(f)
 - Get field name and field value(s)
 .. code-block:: python
    name, values = field.get('T'), field.get('V')
 - Decode field name.
 .. code-block:: python
    name = decode_text(name)
 - Resolve indirect field value objects
 .. code-block:: python
    values = resolve1(values)
 - Call the value(s) decoding method as needed
  (a single field can hold multiple values, for example, a combo box can hold more than one value at a time)
 .. code-block:: python
    if isinstance(values, list):
        values = [decode_value(v) for v in values]
    else:
        values = decode_value(values)
 (the decode_value method takes care of decoding the field's value, returning a string)
 - Decode PSLiteral and PSKeyword field values
 .. code-block:: python
    if isinstance(value, (PSLiteral, PSKeyword)):
        value = value.name
 - Decode bytes field values
 .. code-block:: python
    if isinstance(value, bytes):
        value = decode_text(value)
--- a/docs/source/howto/images.rst
+++ b/docs/source/howto/images.rst
@ -0,0 +1,19 @@
 .. _images:
 How to extract images from a PDF
 ********************************
 Before you start, make sure you have :ref:`installed pdfminer.six<install>`.
 The second thing you need is a PDF with images. If you don't have one,
 you can download `this research paper
 <https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf>`_
 with images of cats and dogs and save it as `example.pdf`::
    $ curl https://www.robots.ox.ac.uk/~vgg/publications/2012/parkhi12a/parkhi12a.pdf --output example.pdf
 Then run the :ref:`pdf2txt<api_pdf2txt>` command::
    $ pdf2txt.py example.pdf --output-dir cats-and-dogs
 This command extracts all the images from the PDF and saves them into the
 `cats-and-dogs` directory.
--- a/docs/source/howto/index.rst
+++ b/docs/source/howto/index.rst
@ -0,0 +1,12 @@
 .. _howto:
 How-to guides
 *************
 How-to guides help you to solve specific problems with pdfminer.six.
 .. toctree::
    :maxdepth: 1
    images
    acro_forms
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -0,0 +1,94 @@
 Welcome to pdfminer.six's documentation!
 ****************************************
 .. image:: https://travis-ci.org/pdfminer/pdfminer.six.svg?branch=master
    :target: https://travis-ci.org/pdfminer/pdfminer.six
    :alt: Travis-ci build badge
 .. image:: https://img.shields.io/pypi/v/pdfminer.six.svg
    :target: https://pypi.python.org/pypi/pdfminer.six/
    :alt: PyPi version badge
 .. image:: https://badges.gitter.im/pdfminer-six/Lobby.svg
    :target: https://gitter.im/pdfminer-six/Lobby?utm_source=badge&utm_medium
    :alt: gitter badge
 We fathom PDF.
 Pdfminer.six is a python package for extracting information from PDF documents.
 Check out the source on `github <https://github.com/pdfminer/pdfminer.six>`_.
 Content
 =======
 This documentation is organized into four sections (according to the `Diátaxis
 documentation framework <https://diataxis.fr>`_). The
 :ref:`tutorial` section helps you set up and use pdfminer.six for the first
 time. Read this section if this is your first time working with pdfminer.six.
 The :ref:`howto` offers specific recipes for solving common problems.
 Take a look at the :ref:`topic` if you want more background information on
 how pdfminer.six works internally. The :ref:`reference` provides
 detailed api documentation for all the common classes and functions in
 pdfminer.six.
 .. toctree::
    :maxdepth: 2
    tutorial/index
    howto/index
    topic/index
    reference/index
    faq
 Features
 ========
 * Parse all objects from a PDF document into Python objects.
 * Analyze and group text in a human-readable way.
 * Extract text, images (JPG, JBIG2 and Bitmaps), table-of-contents, tagged
  contents and more.
 * Support for (almost all) features from the PDF-1.7 specification
 * Support for Chinese, Japanese and Korean (CJK) languages as well as vertical writing.
 * Support for various font types (Type1, TrueType, Type3, and CID).
 * Support for RC4 and AES encryption.
 * Support for AcroForm interactive form extraction.
 Installation instructions
 =========================
 * Install Python 3.6 or newer.
 * Install pdfminer.six.
 ::
    $ pip install pdfminer.six
 * (Optionally) install extra dependencies for extracting images.
 ::
    $ pip install 'pdfminer.six[image]'
 * Use the command-line interface to extract text from pdf.
 ::
    $ pdf2txt.py example.pdf
 * Or use it with Python.
 .. code-block:: python
    from pdfminer.high_level import extract_text
    text = extract_text("example.pdf")
    print(text)
 Contributing
 ============
 We welcome any contributors to pdfminer.six! But, before doing anything, take
 a look at the `contribution guide
 <https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md>`_.
--- a/docs/source/reference/commandline.rst
+++ b/docs/source/reference/commandline.rst
@ -0,0 +1,25 @@
 .. _api_commandline:
 Command-line API
 ****************
 .. _api_pdf2txt:
 pdf2txt.py
 ==========
 .. argparse::
    :module: tools.pdf2txt
    :func: create_parser
    :prog: python tools/pdf2txt.py
 .. _api_dumppdf:
 dumppdf.py
 ==========
 .. argparse::
    :module: tools.dumppdf
    :func: create_parser
    :prog: python tools/dumppdf.py
--- a/docs/source/reference/composable.rst
+++ b/docs/source/reference/composable.rst
@ -0,0 +1,20 @@
 .. _api_composable:
 Composable API
 **************
 .. _api_laparams:
 LAParams
 ========
 .. currentmodule:: pdfminer.layout
 .. autoclass:: LAParams
 Todo:
 =====
 - `PDFDevice`
    - `TextConverter`
    - `PDFPageAggregator`
 - `PDFPageInterpreter`
--- a/docs/source/reference/highlevel.rst
+++ b/docs/source/reference/highlevel.rst
@ -0,0 +1,30 @@
 .. _api_highlevel:
 High-level functions API
 ************************
 .. _api_extract_text:
 extract_text
 ============
 .. currentmodule:: pdfminer.high_level
 .. autofunction:: extract_text
 .. _api_extract_text_to_fp:
 extract_text_to_fp
 ==================
 .. currentmodule:: pdfminer.high_level
 .. autofunction:: extract_text_to_fp
 .. _api_extract_pages:
 extract_pages
 =============
 .. currentmodule:: pdfminer.high_level
 .. autofunction:: extract_pages
--- a/docs/source/reference/index.rst
+++ b/docs/source/reference/index.rst
@ -0,0 +1,11 @@
 .. _reference:
 API Reference
 *************
 .. toctree::
    :maxdepth: 2
    commandline
    highlevel
    composable
--- a/docs/source/topic/converting_pdf_to_text.rst
+++ b/docs/source/topic/converting_pdf_to_text.rst
@ -0,0 +1,131 @@
 .. _topic_pdf_to_text:
 Converting a PDF file to text
 *****************************
 Most PDF files look like they contain well-structured text. But the reality is
 that a PDF file does not contain anything that resembles paragraphs,
 sentences or even words. When it comes to text, a PDF file is only aware of
 the characters and their placement.
 This makes extracting meaningful pieces of text from PDF files difficult.
 The characters that compose a paragraph are no different from those that
 compose the table, the page footer or the description of a figure. Unlike
 other document formats, like a `.txt` file or a word document, the PDF format
 does not contain a stream of text.
 A PDF document consists of a collection of objects that together describe
 the appearance of one or more pages, possibly accompanied by additional
 interactive elements and higher-level application data. A PDF file contains
 the objects making up a PDF document along with associated structural
 information, all represented as a single self-contained sequence of bytes. [1]_
 .. _topic_pdf_to_text_layout:
 Layout analysis algorithm
 =========================
 PDFMiner attempts to reconstruct some of those structures by using heuristics
 on the positioning of characters. This works well for sentences and
 paragraphs because meaningful groups of nearby characters can be made.
 The layout analysis consists of three different stages: it groups characters
 into words and lines, then it groups lines into boxes and finally it groups
 textboxes hierarchically. These stages are discussed in the following
 sections. The resulting output of the layout analysis is an ordered hierarchy
 of layout objects on a PDF page.
 .. figure:: ../_static/layout_analysis_output.png
    :align: center
    The output of the layout analysis is a hierarchy of layout objects.
 The output of the layout analysis heavily depends on a couple of parameters.
 All these parameters are part of the :ref:`api_laparams` class.
 Grouping characters into words and lines
 ----------------------------------------
 The first step in going from characters to text is to group characters in a
 meaningful way. Each character has an x-coordinate and a y-coordinate for its
 bottom-left corner and upper-right corner, i.e. its bounding box. Pdfminer.six 
 uses these bounding boxes to decide which characters belong together.
 Characters that are both horizontally and vertically close are grouped onto
 one line. How close they should be is determined by the `char_margin`
 (M in the figure) and the `line_overlap` (not in figure) parameter. The horizontal
 *distance* between the bounding boxes of two characters should be smaller than
 the `char_margin` and the vertical *overlap* between the bounding boxes should
 be larger than the `line_overlap`.
 .. raw:: html
    :file: ../_static/layout_analysis.html
 The values of `char_margin` and `line_overlap` are relative to the size of
 the bounding boxes of the characters. The `char_margin` is relative to the
 maximum width of either one of the bounding boxes, and the `line_overlap` is
 relative to the minimum height of either one of the bounding boxes.
 Spaces need to be inserted between characters because the PDF format has no
 notion of the space character. A space is inserted if the characters are
 further apart than the `word_margin` (W in the figure). The `word_margin` is
 relative to the maximum width or height of the new character. Having a smaller
 `word_margin` creates smaller words. Note that the `word_margin` should at
 least be smaller than the `char_margin` otherwise none of the characters will
 be separated by a space.
 The result of this stage is a list of lines. Each line consists of a list of
 characters. These characters are either original `LTChar` characters that
 originate from the PDF file or inserted `LTAnno` characters that
 represent spaces between words or newlines at the end of each line.
 Grouping lines into boxes
 -------------------------
 The second step is grouping lines in a meaningful way. Each line has a
 bounding box that is determined by the bounding boxes of the characters that
 it contains. Like grouping characters, pdfminer.six uses the bounding boxes
 to group the lines.
 Lines that are both horizontally overlapping and vertically close are grouped.
 How vertically close the lines should be is determined by the `line_margin`.
 This margin is specified relative to the height of the bounding box. Lines
 are close if the gap between the tops (see L :sub:`1` in the figure) and bottoms
 (see L :sub:`2` in the figure) of the bounding boxes are closer together
 than the absolute line margin, i.e. the `line_margin` multiplied by the
 height of the bounding box.
 .. raw:: html
    :file: ../_static/layout_analysis_group_lines.html
 The result of this stage is a list of text boxes. Each box consists of a list
 of lines.
 Grouping textboxes hierarchically
 ---------------------------------
 The last step is to group the text boxes in a meaningful way. This step
 repeatedly merges the two text boxes that are closest to each other.
 The closeness of bounding boxes is computed as the area that is between the
 two text boxes (the blue area in the figure). In other words, it is the area of
 the bounding box that surrounds both lines, minus the area of the bounding
 boxes of the individual lines.
 .. raw:: html
    :file: ../_static/layout_analysis_group_boxes.html
 Working with rotated characters
 ===============================
 The algorithm described above assumes that all characters have the same
 orientation. However, any writing direction is possible in a PDF. To
 accommodate for this, pdfminer.six allows detecting vertical writing with the
 `detect_vertical` parameter. This will apply all the grouping steps as if the
 pdf was rotated 90 (or 270) degrees.
 References
 ==========
 .. [1] Adobe System Inc. (2007). *Pdf reference: Adobe portable document
  format, version 1.7.*
--- a/docs/source/topic/index.rst
+++ b/docs/source/topic/index.rst
@ -0,0 +1,9 @@
 .. _topic:
 Topics
 ******
 .. toctree::
    :maxdepth: 2
    converting_pdf_to_text
--- a/docs/source/tutorial/commandline.rst
+++ b/docs/source/tutorial/commandline.rst
@ -0,0 +1,41 @@
 .. _tutorial_commandline:
 Extract text from a PDF using the commandline
 *********************************************
 pdfminer.six has several tools that can be used from the command line. The
 command-line tools are aimed at users that occasionally want to extract text
 from a pdf.
 Take a look at the high-level or composable interface if you want to use
 pdfminer.six programmatically.
 Examples
 ========
 pdf2txt.py
 ----------
 ::
    $ pdf2txt.py example.pdf
    all the text from the pdf appears on the command line
 The :ref:`api_pdf2txt` tool extracts all the text from a PDF. It uses layout
 analysis with sensible defaults to order and group the text in a sensible way.
 dumppdf.py
 ----------
 ::
    $ dumppdf.py -a example.pdf
    <pdf><object id="1">
    ...
    </object>
    ...
    </pdf>
 The :ref:`api_dumppdf` tool can be used to extract the internal structure from a
 PDF. This tool is primarily for debugging purposes, but it can be useful to
 anybody working with PDFs.
--- a/docs/source/tutorial/composable.rst
+++ b/docs/source/tutorial/composable.rst
@ -0,0 +1,33 @@
 .. _tutorial_composable:
 Extract text from a PDF using Python - part 2
 *********************************************
 The command line tools and the high-level API are just shortcuts for often
 used combinations of pdfminer.six components. You can use these components to
 modify pdfminer.six to your own needs.
 For example, to extract the text from a PDF file and save it in a python
 variable::
    from io import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfparser import PDFParser
    output_string = StringIO()
    with open('samples/simple1.pdf', 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    print(output_string.getvalue())
--- a/docs/source/tutorial/extract_pages.rst
+++ b/docs/source/tutorial/extract_pages.rst
@ -0,0 +1,47 @@
 .. _tutorial_extract_pages:
 Extract elements from a PDF using Python
 ****************************************
 The high level functions can be used to achieve common tasks. In this case,
 we can use :ref:`api_extract_pages`:
 .. code-block:: python
   from pdfminer.high_level import extract_pages
   for page_layout in extract_pages("test.pdf"):
       for element in page_layout:
           print(element)
 Each ``element`` will be an ``LTTextBox``, ``LTFigure``, ``LTLine``, ``LTRect``
 or an ``LTImage``. Some of these can be iterated further, for example iterating
 through an ``LTTextBox`` will give you an ``LTTextLine``, and these in turn can
 be iterated through to get an ``LTChar``. See the diagram here:
 :ref:`topic_pdf_to_text_layout`.
 Let's say we want to extract all of the text. We could do:
 .. code-block:: python
   from pdfminer.high_level import extract_pages
   from pdfminer.layout import LTTextContainer
   for page_layout in extract_pages("test.pdf"):
       for element in page_layout:
           if isinstance(element, LTTextContainer):
               print(element.get_text())
 Or, we could extract the fontname or size of each individual character:
 .. code-block:: python
   from pdfminer.high_level import extract_pages
   from pdfminer.layout import LTTextContainer, LTChar
   for page_layout in extract_pages("test.pdf"):
       for element in page_layout:
           if isinstance(element, LTTextContainer):
               for text_line in element:
                   for character in text_line:
                       if isinstance(character, LTChar):
                           print(character.fontname)
                           print(character.size)
--- a/docs/source/tutorial/highlevel.rst
+++ b/docs/source/tutorial/highlevel.rst
@ -0,0 +1,59 @@
 .. _tutorial_highlevel:
 Extract text from a PDF using Python
 ************************************
 The high-level API can be used to do common tasks.
 The simplest way to extract text from a PDF is to use
 :ref:`api_extract_text`:
 .. doctest::
    >>> from pdfminer.high_level import extract_text
    >>> text = extract_text('samples/simple1.pdf')
    >>> print(repr(text))
    'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\x0c'
    >>> print(text)
    ... # doctest: +NORMALIZE_WHITESPACE
    Hello
    <BLANKLINE>
    World
    <BLANKLINE>
    Hello
    <BLANKLINE>
    World
    <BLANKLINE>
    H e l l o
    <BLANKLINE>
    W o r l d
    <BLANKLINE>
    H e l l o
    <BLANKLINE>
    W o r l d
    <BLANKLINE>
 To read text from a PDF and print it on the command line:
 .. doctest::
    >>> from io import StringIO
    >>> from pdfminer.high_level import extract_text_to_fp
    >>> output_string = StringIO()
    >>> with open('samples/simple1.pdf', 'rb') as fin:
    ...     extract_text_to_fp(fin, output_string)
    >>> print(output_string.getvalue().strip())
    Hello WorldHello WorldHello WorldHello World
 Or to convert it to html and use layout analysis:
 .. doctest::
    >>> from io import StringIO
    >>> from pdfminer.high_level import extract_text_to_fp
    >>> from pdfminer.layout import LAParams
    >>> output_string = StringIO()
    >>> with open('samples/simple1.pdf', 'rb') as fin:
    ...     extract_text_to_fp(fin, output_string, laparams=LAParams(),
    ...                        output_type='html', codec=None)
--- a/docs/source/tutorial/index.rst
+++ b/docs/source/tutorial/index.rst
@ -0,0 +1,15 @@
 .. _tutorial:
 Tutorials
 *********
 Tutorials help you get started with specific parts of pdfminer.six.
 .. toctree::
    :maxdepth: 1
    install
    commandline
    highlevel
    composable
    extract_pages
--- a/docs/source/tutorial/install.rst
+++ b/docs/source/tutorial/install.rst
@ -0,0 +1,39 @@
 .. _install:
 Install pdfminer.six as a Python package
 ****************************************
 To use pdfminer.six for the first time, you need to install the Python
 package in your Python environment.
 This tutorial requires you to have a system with a working Python and pip
 installation. If you don't have one and don't know how to install it, take a
 look at `The Hitchhiker's Guide to Python! <https://docs.python-guide.org/>`_.
 Install using pip
 =================
 Run the following command on the commandline to install pdfminer.six as a
 Python package::
    pip install pdfminer.six
 Test pdfminer.six installation
 ==============================
 You can test the pdfminer.six installation by importing it in Python.
 Open an interactive Python session from the commandline and import
 pdfminer.six::
    >>> import pdfminer
    >>> print(pdfminer.__version__)  # doctest: +IGNORE_RESULT
    '<installed version>'
 Now you can use pdfminer.six as a Python package. But pdfminer.six also
 comes with a couple of useful commandline tools. To test if these tools are
 correctly installed, run the following on your commandline::
    $ pdf2txt.py --version
    pdfminer.six <installed version>
--- a/mypy.ini
+++ b/mypy.ini
@ -0,0 +1,33 @@
 [mypy]
 warn_unused_configs = True
 disallow_any_generics = True
 disallow_subclassing_any = True
 disallow_untyped_calls = True
 disallow_incomplete_defs = True
 disallow_untyped_decorators = True
 no_implicit_optional = True
 warn_redundant_casts = True
 warn_return_any = True
 no_implicit_reexport = True
 strict_equality = True
 # This seems impossible to turn on in a version-independent manner
 warn_unused_ignores = False
 [mypy-pdfminer.*]
 disallow_untyped_defs = True
 [mypy-cryptography.hazmat.*]
 ignore_missing_imports = True
 [mypy-pytest.*]
 ignore_missing_imports = True
 [mypy-setuptools.*]
 ignore_missing_imports = True
 [mypy-nox.*]
 ignore_missing_imports = True
 [mypy-charset_normalizer.*]
 ignore_missing_imports = True
--- a/noxfile.py
+++ b/noxfile.py
@ -0,0 +1,52 @@
 import os
 import nox
 PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"]
 PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"]
@nox.session
 def format(session):
    session.install("black")
    # Format files locally with black, but only check in cicd
    if "CI" in os.environ:
        session.run("black", "--check", *PYTHON_MODULES)
    else:
        session.run("black", *PYTHON_MODULES)
@nox.session
 def lint(session):
    session.install("flake8")
    session.run("flake8", *PYTHON_MODULES, "--count", "--statistics")
@nox.session
 def types(session):
    session.install("mypy")
    session.run(
        "mypy",
        "--install-types",
        "--non-interactive",
        "--show-error-codes",
        *PYTHON_MODULES,
    )
@nox.session(python=PYTHON_ALL_VERSIONS)
 def tests(session):
    session.install("-e", ".[dev]")
    session.run("pytest")
@nox.session
 def docs(session):
    session.install("-e", ".[docs]")
    session.run(
        "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"
    )
    session.run(
        "python", "-m", "sphinx", "-b", "doctest", "docs/source", "docs/build/doctest"
    )
--- a/pdfminer/__init__.py
+++ b/pdfminer/__init__.py
@ -0,0 +1,4 @@
 # Package version string. The literal placeholder below is substituted at
 # release time, so this line must keep its exact text.
 __version__ = "__VERSION__"  # auto replaced with tag in github actions
 # Running this module as a script prints the version string.
 if __name__ == "__main__":
    print(__version__)
--- a/pdfminer/_saslprep.py
+++ b/pdfminer/_saslprep.py
@ -0,0 +1,95 @@
 # Copyright 2016-present MongoDB, Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Some changes copyright 2021-present Matthias Valvekens,
 # licensed under the license of the pyHanko project (see LICENSE file).
 """An implementation of RFC4013 SASLprep."""
 __all__ = ["saslprep"]
 import stringprep
 from typing import Callable, Tuple
 import unicodedata
 # RFC4013 section 2.3 prohibited output.
 _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
    # A strict reading of RFC 4013 requires table c12 here, but
    # characters from it are mapped to SPACE in the Map step. Can
    # normalization reintroduce them somehow?
    stringprep.in_table_c12,
    stringprep.in_table_c21_c22,
    stringprep.in_table_c3,
    stringprep.in_table_c4,
    stringprep.in_table_c5,
    stringprep.in_table_c6,
    stringprep.in_table_c7,
    stringprep.in_table_c8,
    stringprep.in_table_c9,
 )
 def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
    """An implementation of RFC4013 SASLprep.
    :param data:
        The string to SASLprep.
    :param prohibit_unassigned_code_points:
        RFC 3454 and RFCs for various SASL mechanisms distinguish between
        `queries` (unassigned code points allowed) and
        `stored strings` (unassigned code points prohibited). Defaults
        to ``True`` (unassigned code points are prohibited).
    :return: The SASLprep'ed version of `data`. The empty string is returned
        when nothing is left after the map and normalize steps.
    :raises ValueError: if the prepared string contains a prohibited
        character or fails the bidirectional check.
    """
    if prohibit_unassigned_code_points:
        prohibited = _PROHIBITED + (stringprep.in_table_a1,)
    else:
        prohibited = _PROHIBITED
    # RFC3454 section 2, step 1 - Map
    # RFC4013 section 2.1 mappings
    # Map Non-ASCII space characters to SPACE (U+0020). Map
    # commonly mapped to nothing characters to, well, nothing.
    in_table_c12 = stringprep.in_table_c12
    in_table_b1 = stringprep.in_table_b1
    data = "".join(
        ["\u0020" if in_table_c12(elt) else elt for elt in data if not in_table_b1(elt)]
    )
    # RFC3454 section 2, step 2 - Normalize
    # RFC4013 section 2.2 normalization
    data = unicodedata.ucd_3_2_0.normalize("NFKC", data)
    if not data:
        # Empty input, or input made only of "mapped to nothing" characters:
        # there is no first/last character for the bidi check and nothing
        # that could be prohibited, so return it instead of indexing data[0].
        return data
    in_table_d1 = stringprep.in_table_d1
    if in_table_d1(data[0]):
        if not in_table_d1(data[-1]):
            # RFC3454, Section 6, #3. If a string contains any
            # RandALCat character, the first and last characters
            # MUST be RandALCat characters.
            raise ValueError("SASLprep: failed bidirectional check")
        # RFC3454, Section 6, #2. If a string contains any RandALCat
        # character, it MUST NOT contain any LCat character.
        prohibited = prohibited + (stringprep.in_table_d2,)
    else:
        # RFC3454, Section 6, #3. Following the logic of #3, if
        # the first character is not a RandALCat, no other character
        # can be either.
        prohibited = prohibited + (in_table_d1,)
    # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
    for char in data:
        if any(in_table(char) for in_table in prohibited):
            raise ValueError("SASLprep: failed prohibited character check")
    return data
--- a/pdfminer/arcfour.py
+++ b/pdfminer/arcfour.py
@ -0,0 +1,36 @@
 """ Python implementation of Arcfour encryption algorithm.
 See https://en.wikipedia.org/wiki/RC4
 This code is in the public domain.
 """
 from typing import Sequence
 class Arcfour:
    """Arcfour (RC4) stream cipher.
    The cipher state is kept on the instance, so successive calls to
    :meth:`process` continue the same keystream. Encryption and decryption
    are the same operation.
    """
    def __init__(self, key: Sequence[int]) -> None:
        """Initialise the 256-entry permutation from ``key``.
        :param key: sequence of byte values (for example ``bytes``); it is
            indexed cyclically, so it must not be empty.
        """
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
    def process(self, data: bytes) -> bytes:
        """XOR ``data`` with the next ``len(data)`` keystream bytes.
        :param data: bytes to encrypt or decrypt.
        :return: the transformed bytes, same length as ``data``.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect into a bytearray: repeated ``bytes +=`` copies the whole
        # output on every byte, which is quadratic in len(data).
        out = bytearray()
        for c in data:
            i = (i + 1) % 256
            j = (j + s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i] + s[j]) % 256]
            out.append(c ^ k)
        # Persist the indices so the next call continues the keystream.
        (self.i, self.j) = (i, j)
        return bytes(out)
    encrypt = decrypt = process
--- a/pdfminer/ascii85.py
+++ b/pdfminer/ascii85.py
@ -0,0 +1,72 @@
 """ Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
 This code is in the public domain.
 """
 import re
 import struct
 # ascii85decode(data)
 def ascii85decode(data: bytes) -> bytes:
    """
    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.
    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.
    Bytes outside ``!``..``u``, ``z`` and ``~`` are skipped. Decoding stops
    at the first ``~``; a partial group is only emitted when that marker is
    reached.
    :param data: ASCII85-encoded bytes.
    :return: the decoded bytes.
    """
    n = b = 0
    # A bytearray avoids re-copying the whole output on every append.
    out = bytearray()
    for c in data:
        if 33 <= c <= 117:  # b"!" .. b"u": one base-85 digit
            n += 1
            b = b * 85 + (c - 33)
            if n == 5:
                out += struct.pack(">L", b)
                n = b = 0
        elif c == 122:  # b"z": shorthand for four zero bytes
            assert n == 0, str(n)
            out += b"\0\0\0\0"
        elif c == 126:  # b"~": end of data
            if n:
                # Pad the partial group with the highest digit, then keep
                # only the n - 1 bytes that carry real data.
                for _ in range(5 - n):
                    b = b * 85 + 84
                out += struct.pack(">L", b)[: n - 1]
            break
    return bytes(out)
 # asciihexdecode(data)
 hex_re = re.compile(rb"([a-f\d]{2})", re.IGNORECASE)
 trail_re = re.compile(rb"^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$", re.IGNORECASE)
 def asciihexdecode(data: bytes) -> bytes:
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. If the filter encounters the EOD marker (or the end of the data)
    after reading an odd number of hexadecimal digits, it will behave as if
    a 0 followed the last digit.
    This implementation is lenient: characters that are neither hexadecimal
    digits nor white-space are skipped instead of raising an error.
    :param data: ASCII-hex encoded bytes.
    :return: the decoded bytes.
    """
    # Nothing after the EOD marker belongs to the encoded data.
    eod = data.find(b">")
    if eod != -1:
        data = data[:eod]
    # Collect the digits first so that white-space between the two digits of
    # a pair cannot split the pair.
    digits = bytes(c for c in data if c in b"0123456789abcdefABCDEF")
    if len(digits) % 2:
        digits += b"0"
    return bytes.fromhex(digits.decode("ascii"))
--- a/pdfminer/ccitt.py
+++ b/pdfminer/ccitt.py
@ -0,0 +1,629 @@
 # CCITT Fax decoder
 #
 # Bugs: uncompressed mode untested.
 #
 # cf.
 #  ITU-T Recommendation T.4
 #    "Standardization of Group 3 facsimile terminals
 #    for document transmission"
 #  ITU-T Recommendation T.6
 #    "FACSIMILE CODING SCHEMES AND CODING CONTROL FUNCTIONS
 #    FOR GROUP 4 FACSIMILE APPARATUS"
 import array
 from typing import (
    Any,
    Callable,
    Dict,
    Iterator,
    List,
    MutableSequence,
    Optional,
    Sequence,
    Union,
    cast,
 )
 def get_bytes(data: bytes) -> Iterator[int]:
    """Yield the bytes of ``data`` one at a time, in order, as integers."""
    for byte in data:
        yield byte
 # Workaround https://github.com/python/mypy/issues/731
 BitParserState = MutableSequence[Any]
 # A better definition (not supported by mypy) would be:
 # BitParserState = MutableSequence[Union["BitParserState", int, str, None]]
 class BitParser:
    """Bit-at-a-time walker over a binary code tree.
    A tree is a two-element list indexed by bit value (0 or 1); each slot
    holds either a nested two-element list or a leaf value. When a leaf is
    reached it is passed to ``_accept``, whose return value becomes the tree
    used for the following bits. ``_state`` and ``_accept`` are not set here;
    they must be assigned before bits are parsed.
    """
    _state: BitParserState
    # _accept is declared Optional solely as a workaround for
    # https://github.com/python/mypy/issues/708
    _accept: Optional[Callable[[Any], BitParserState]]
    def __init__(self) -> None:
        # Number of bits consumed so far.
        self._pos = 0
    @classmethod
    def add(cls, root: BitParserState, v: Union[int, str], bits: str) -> None:
        """Store leaf ``v`` in the tree ``root`` under the code ``bits``.
        :param root: tree to extend in place.
        :param v: value placed at the end of the path.
        :param bits: path to the leaf; ``"1"`` selects slot 1, any other
            character selects slot 0.
        """
        node: BitParserState = root
        branch = None
        for index, bit in enumerate(bits):
            if index > 0:
                # Descend along the previous bit, creating the inner node
                # the first time this prefix is seen.
                assert branch is not None
                if node[branch] is None:
                    node[branch] = [None, None]
                node = node[branch]
            branch = 1 if bit == "1" else 0
        assert branch is not None
        node[branch] = v
    def feedbytes(self, data: bytes) -> None:
        """Parse every bit of ``data``, most significant bit first."""
        bit_masks = (128, 64, 32, 16, 8, 4, 2, 1)
        for byte in get_bytes(data):
            for mask in bit_masks:
                self._parse_bit(byte & mask)
    def _parse_bit(self, x: object) -> None:
        """Advance one step in the current tree using the truthiness of ``x``."""
        following = self._state[1] if x else self._state[0]
        self._pos += 1
        if not isinstance(following, list):
            # Reached a leaf (or an empty slot): let the accept callback
            # decide which tree comes next.
            assert self._accept is not None
            following = self._accept(following)
        self._state = following
 class CCITTG4Parser(BitParser):
    MODE = [None, None]
    BitParser.add(MODE, 0, "1")
    BitParser.add(MODE, +1, "011")
    BitParser.add(MODE, -1, "010")
    BitParser.add(MODE, "h", "001")
    BitParser.add(MODE, "p", "0001")
    BitParser.add(MODE, +2, "000011")
    BitParser.add(MODE, -2, "000010")
    BitParser.add(MODE, +3, "0000011")
    BitParser.add(MODE, -3, "0000010")
    BitParser.add(MODE, "u", "0000001111")
    BitParser.add(MODE, "x1", "0000001000")
    BitParser.add(MODE, "x2", "0000001001")
    BitParser.add(MODE, "x3", "0000001010")
    BitParser.add(MODE, "x4", "0000001011")
    BitParser.add(MODE, "x5", "0000001100")
    BitParser.add(MODE, "x6", "0000001101")
    BitParser.add(MODE, "x7", "0000001110")
    BitParser.add(MODE, "e", "000000000001000000000001")
    WHITE = [None, None]
    BitParser.add(WHITE, 0, "00110101")
    BitParser.add(WHITE, 1, "000111")
    BitParser.add(WHITE, 2, "0111")
    BitParser.add(WHITE, 3, "1000")
    BitParser.add(WHITE, 4, "1011")
    BitParser.add(WHITE, 5, "1100")
    BitParser.add(WHITE, 6, "1110")
    BitParser.add(WHITE, 7, "1111")
    BitParser.add(WHITE, 8, "10011")
    BitParser.add(WHITE, 9, "10100")
    BitParser.add(WHITE, 10, "00111")
    BitParser.add(WHITE, 11, "01000")
    BitParser.add(WHITE, 12, "001000")
    BitParser.add(WHITE, 13, "000011")
    BitParser.add(WHITE, 14, "110100")
    BitParser.add(WHITE, 15, "110101")
    BitParser.add(WHITE, 16, "101010")
    BitParser.add(WHITE, 17, "101011")
    BitParser.add(WHITE, 18, "0100111")
    BitParser.add(WHITE, 19, "0001100")
    BitParser.add(WHITE, 20, "0001000")
    BitParser.add(WHITE, 21, "0010111")
    BitParser.add(WHITE, 22, "0000011")
    BitParser.add(WHITE, 23, "0000100")
    BitParser.add(WHITE, 24, "0101000")
    BitParser.add(WHITE, 25, "0101011")
    BitParser.add(WHITE, 26, "0010011")
    BitParser.add(WHITE, 27, "0100100")
    BitParser.add(WHITE, 28, "0011000")
    BitParser.add(WHITE, 29, "00000010")
    BitParser.add(WHITE, 30, "00000011")
    BitParser.add(WHITE, 31, "00011010")
    BitParser.add(WHITE, 32, "00011011")
    BitParser.add(WHITE, 33, "00010010")
    BitParser.add(WHITE, 34, "00010011")
    BitParser.add(WHITE, 35, "00010100")
    BitParser.add(WHITE, 36, "00010101")
    BitParser.add(WHITE, 37, "00010110")
    BitParser.add(WHITE, 38, "00010111")
    BitParser.add(WHITE, 39, "00101000")
    BitParser.add(WHITE, 40, "00101001")
    BitParser.add(WHITE, 41, "00101010")
    BitParser.add(WHITE, 42, "00101011")
    BitParser.add(WHITE, 43, "00101100")
    BitParser.add(WHITE, 44, "00101101")
    BitParser.add(WHITE, 45, "00000100")
    BitParser.add(WHITE, 46, "00000101")
    BitParser.add(WHITE, 47, "00001010")
    BitParser.add(WHITE, 48, "00001011")
    BitParser.add(WHITE, 49, "01010010")
    BitParser.add(WHITE, 50, "01010011")
    BitParser.add(WHITE, 51, "01010100")
    BitParser.add(WHITE, 52, "01010101")
    BitParser.add(WHITE, 53, "00100100")
    BitParser.add(WHITE, 54, "00100101")
    BitParser.add(WHITE, 55, "01011000")
    BitParser.add(WHITE, 56, "01011001")
    BitParser.add(WHITE, 57, "01011010")
    BitParser.add(WHITE, 58, "01011011")
    BitParser.add(WHITE, 59, "01001010")
    BitParser.add(WHITE, 60, "01001011")
    BitParser.add(WHITE, 61, "00110010")
    BitParser.add(WHITE, 62, "00110011")
    BitParser.add(WHITE, 63, "00110100")
    BitParser.add(WHITE, 64, "11011")
    BitParser.add(WHITE, 128, "10010")
    BitParser.add(WHITE, 192, "010111")
    BitParser.add(WHITE, 256, "0110111")
    BitParser.add(WHITE, 320, "00110110")
    BitParser.add(WHITE, 384, "00110111")
    BitParser.add(WHITE, 448, "01100100")
    BitParser.add(WHITE, 512, "01100101")
    BitParser.add(WHITE, 576, "01101000")
    BitParser.add(WHITE, 640, "01100111")
    BitParser.add(WHITE, 704, "011001100")
    BitParser.add(WHITE, 768, "011001101")
    BitParser.add(WHITE, 832, "011010010")
    BitParser.add(WHITE, 896, "011010011")
    BitParser.add(WHITE, 960, "011010100")
    BitParser.add(WHITE, 1024, "011010101")
    BitParser.add(WHITE, 1088, "011010110")
    BitParser.add(WHITE, 1152, "011010111")
    BitParser.add(WHITE, 1216, "011011000")
    BitParser.add(WHITE, 1280, "011011001")
    BitParser.add(WHITE, 1344, "011011010")
    BitParser.add(WHITE, 1408, "011011011")
    BitParser.add(WHITE, 1472, "010011000")
    BitParser.add(WHITE, 1536, "010011001")
    BitParser.add(WHITE, 1600, "010011010")
    BitParser.add(WHITE, 1664, "011000")
    BitParser.add(WHITE, 1728, "010011011")
    BitParser.add(WHITE, 1792, "00000001000")
    BitParser.add(WHITE, 1856, "00000001100")
    BitParser.add(WHITE, 1920, "00000001101")
    BitParser.add(WHITE, 1984, "000000010010")
    BitParser.add(WHITE, 2048, "000000010011")
    BitParser.add(WHITE, 2112, "000000010100")
    BitParser.add(WHITE, 2176, "000000010101")
    BitParser.add(WHITE, 2240, "000000010110")
    BitParser.add(WHITE, 2304, "000000010111")
    BitParser.add(WHITE, 2368, "000000011100")
    BitParser.add(WHITE, 2432, "000000011101")
    BitParser.add(WHITE, 2496, "000000011110")
    BitParser.add(WHITE, 2560, "000000011111")
    BLACK = [None, None]
    BitParser.add(BLACK, 0, "0000110111")
    BitParser.add(BLACK, 1, "010")
    BitParser.add(BLACK, 2, "11")
    BitParser.add(BLACK, 3, "10")
    BitParser.add(BLACK, 4, "011")
    BitParser.add(BLACK, 5, "0011")
    BitParser.add(BLACK, 6, "0010")
    BitParser.add(BLACK, 7, "00011")
    BitParser.add(BLACK, 8, "000101")
    BitParser.add(BLACK, 9, "000100")
    BitParser.add(BLACK, 10, "0000100")
    BitParser.add(BLACK, 11, "0000101")
    BitParser.add(BLACK, 12, "0000111")
    BitParser.add(BLACK, 13, "00000100")
    BitParser.add(BLACK, 14, "00000111")
    BitParser.add(BLACK, 15, "000011000")
    BitParser.add(BLACK, 16, "0000010111")
    BitParser.add(BLACK, 17, "0000011000")
    BitParser.add(BLACK, 18, "0000001000")
    BitParser.add(BLACK, 19, "00001100111")
    BitParser.add(BLACK, 20, "00001101000")
    BitParser.add(BLACK, 21, "00001101100")
    BitParser.add(BLACK, 22, "00000110111")
    BitParser.add(BLACK, 23, "00000101000")
    BitParser.add(BLACK, 24, "00000010111")
    BitParser.add(BLACK, 25, "00000011000")
    BitParser.add(BLACK, 26, "000011001010")
    BitParser.add(BLACK, 27, "000011001011")
    BitParser.add(BLACK, 28, "000011001100")
    BitParser.add(BLACK, 29, "000011001101")
    BitParser.add(BLACK, 30, "000001101000")
    BitParser.add(BLACK, 31, "000001101001")
    BitParser.add(BLACK, 32, "000001101010")
    BitParser.add(BLACK, 33, "000001101011")
    BitParser.add(BLACK, 34, "000011010010")
    BitParser.add(BLACK, 35, "000011010011")
    BitParser.add(BLACK, 36, "000011010100")
    BitParser.add(BLACK, 37, "000011010101")
    BitParser.add(BLACK, 38, "000011010110")
    BitParser.add(BLACK, 39, "000011010111")
    BitParser.add(BLACK, 40, "000001101100")
    BitParser.add(BLACK, 41, "000001101101")
    BitParser.add(BLACK, 42, "000011011010")
    BitParser.add(BLACK, 43, "000011011011")
    BitParser.add(BLACK, 44, "000001010100")
    BitParser.add(BLACK, 45, "000001010101")
    BitParser.add(BLACK, 46, "000001010110")
    BitParser.add(BLACK, 47, "000001010111")
    BitParser.add(BLACK, 48, "000001100100")
    BitParser.add(BLACK, 49, "000001100101")
    BitParser.add(BLACK, 50, "000001010010")
    BitParser.add(BLACK, 51, "000001010011")
    BitParser.add(BLACK, 52, "000000100100")
    BitParser.add(BLACK, 53, "000000110111")
    BitParser.add(BLACK, 54, "000000111000")
    BitParser.add(BLACK, 55, "000000100111")
    BitParser.add(BLACK, 56, "000000101000")
    BitParser.add(BLACK, 57, "000001011000")
    BitParser.add(BLACK, 58, "000001011001")
    BitParser.add(BLACK, 59, "000000101011")
    BitParser.add(BLACK, 60, "000000101100")
    BitParser.add(BLACK, 61, "000001011010")
    BitParser.add(BLACK, 62, "000001100110")
    BitParser.add(BLACK, 63, "000001100111")
    BitParser.add(BLACK, 64, "0000001111")
    BitParser.add(BLACK, 128, "000011001000")
    BitParser.add(BLACK, 192, "000011001001")
    BitParser.add(BLACK, 256, "000001011011")
    BitParser.add(BLACK, 320, "000000110011")
    BitParser.add(BLACK, 384, "000000110100")
    BitParser.add(BLACK, 448, "000000110101")
    BitParser.add(BLACK, 512, "0000001101100")
    BitParser.add(BLACK, 576, "0000001101101")
    BitParser.add(BLACK, 640, "0000001001010")
    BitParser.add(BLACK, 704, "0000001001011")
    BitParser.add(BLACK, 768, "0000001001100")
    BitParser.add(BLACK, 832, "0000001001101")
    BitParser.add(BLACK, 896, "0000001110010")
    BitParser.add(BLACK, 960, "0000001110011")
    BitParser.add(BLACK, 1024, "0000001110100")
    BitParser.add(BLACK, 1088, "0000001110101")
    BitParser.add(BLACK, 1152, "0000001110110")
    BitParser.add(BLACK, 1216, "0000001110111")
    BitParser.add(BLACK, 1280, "0000001010010")
    BitParser.add(BLACK, 1344, "0000001010011")
    BitParser.add(BLACK, 1408, "0000001010100")
    BitParser.add(BLACK, 1472, "0000001010101")
    BitParser.add(BLACK, 1536, "0000001011010")
    BitParser.add(BLACK, 1600, "0000001011011")
    BitParser.add(BLACK, 1664, "0000001100100")
    BitParser.add(BLACK, 1728, "0000001100101")
    BitParser.add(BLACK, 1792, "00000001000")
    BitParser.add(BLACK, 1856, "00000001100")
    BitParser.add(BLACK, 1920, "00000001101")
    BitParser.add(BLACK, 1984, "000000010010")
    BitParser.add(BLACK, 2048, "000000010011")
    BitParser.add(BLACK, 2112, "000000010100")
    BitParser.add(BLACK, 2176, "000000010101")
    BitParser.add(BLACK, 2240, "000000010110")
    BitParser.add(BLACK, 2304, "000000010111")
    BitParser.add(BLACK, 2368, "000000011100")
    BitParser.add(BLACK, 2432, "000000011101")
    BitParser.add(BLACK, 2496, "000000011110")
    BitParser.add(BLACK, 2560, "000000011111")
    UNCOMPRESSED = [None, None]
    BitParser.add(UNCOMPRESSED, "1", "1")
    BitParser.add(UNCOMPRESSED, "01", "01")
    BitParser.add(UNCOMPRESSED, "001", "001")
    BitParser.add(UNCOMPRESSED, "0001", "0001")
    BitParser.add(UNCOMPRESSED, "00001", "00001")
    BitParser.add(UNCOMPRESSED, "00000", "000001")
    BitParser.add(UNCOMPRESSED, "T00", "00000011")
    BitParser.add(UNCOMPRESSED, "T10", "00000010")
    BitParser.add(UNCOMPRESSED, "T000", "000000011")
    BitParser.add(UNCOMPRESSED, "T100", "000000010")
    BitParser.add(UNCOMPRESSED, "T0000", "0000000011")
    BitParser.add(UNCOMPRESSED, "T1000", "0000000010")
    BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
    BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
    class EOFB(Exception):
        pass
    class InvalidData(Exception):
        pass
    class ByteSkip(Exception):
        pass
    _color: int
    def __init__(self, width: int, bytealign: bool = False) -> None:
        """Create a parser for lines that are ``width`` pixels wide.
        :param width: number of pixels in each decoded line.
        :param bytealign: stored on the instance; when true, completing a
            line raises ``ByteSkip`` (see ``_flush_line``).
        """
        BitParser.__init__(self)
        self.bytealign = bytealign
        self.width = width
        self.reset()
    def feedbytes(self, data: bytes) -> None:
        """Parse ``data`` bit by bit, most significant bit first.
        A ``ByteSkip`` raised while parsing drops the remaining bits of the
        current byte and restarts in mode state with the next byte. ``EOFB``
        stops parsing; any remaining bytes are ignored.
        """
        bit_masks = (128, 64, 32, 16, 8, 4, 2, 1)
        for byte in get_bytes(data):
            try:
                for mask in bit_masks:
                    self._parse_bit(byte & mask)
            except self.ByteSkip:
                self._accept = self._parse_mode
                self._state = self.MODE
            except self.EOFB:
                return
    def _parse_mode(self, mode: object) -> BitParserState:
        """Handle a leaf of the ``MODE`` tree and return the next tree.
        ``"p"`` applies pass mode, ``"h"`` switches to the run-length tables,
        ``"u"`` switches to the ``UNCOMPRESSED`` table and an integer is
        applied as a vertical-mode offset.
        :raises EOFB: for the ``"e"`` code.
        :raises InvalidData: for any other value.
        """
        if mode == "p":
            self._do_pass()
            self._flush_line()
            return self.MODE
        if mode == "h":
            # Two run lengths follow; start with the table of the current color.
            self._n1 = 0
            self._accept = self._parse_horiz1
            return self.WHITE if self._color else self.BLACK
        if mode == "u":
            self._accept = self._parse_uncompressed
            return self.UNCOMPRESSED
        if mode == "e":
            raise self.EOFB
        if not isinstance(mode, int):
            raise self.InvalidData(mode)
        self._do_vertical(mode)
        self._flush_line()
        return self.MODE
    def _parse_horiz1(self, n: Any) -> BitParserState:
        """Accumulate the first horizontal run length into ``_n1``.
        A code below 64 ends the first run: the color is flipped and the
        second run is handed to ``_parse_horiz2``. Larger codes keep
        accumulating in the same color.
        :raises InvalidData: if ``n`` is ``None`` (no code at this leaf).
        """
        if n is None:
            raise self.InvalidData
        self._n1 += n
        run_finished = n < 64
        if run_finished:
            self._n2 = 0
            self._color = 1 - self._color
            self._accept = self._parse_horiz2
        return self.WHITE if self._color else self.BLACK
    def _parse_horiz2(self, n: Any) -> BitParserState:
        """Accumulate the second horizontal run length into ``_n2``.
        A code below 64 ends the second run: the color is flipped back, both
        runs are applied with ``_do_horizontal`` and parsing returns to mode
        state. Larger codes keep accumulating in the same color.
        :raises InvalidData: if ``n`` is ``None`` (no code at this leaf).
        """
        if n is None:
            raise self.InvalidData
        self._n2 += n
        if n >= 64:
            return self.WHITE if self._color else self.BLACK
        self._color = 1 - self._color
        self._accept = self._parse_mode
        self._do_horizontal(self._n1, self._n2)
        self._flush_line()
        return self.MODE
    def _parse_uncompressed(self, bits: Optional[str]) -> BitParserState:
        """Handle a leaf of the ``UNCOMPRESSED`` tree and return the next tree.
        A value starting with ``"T"`` leaves uncompressed mode: its second
        character becomes the current color and the rest is passed to
        ``_do_uncompressed``. Any other value is passed on whole and the
        parser stays in uncompressed mode.
        :raises InvalidData: if ``bits`` is ``None`` or empty.
        """
        if not bits:
            raise self.InvalidData
        if not bits.startswith("T"):
            self._do_uncompressed(bits)
            return self.UNCOMPRESSED
        self._accept = self._parse_mode
        self._color = int(bits[1])
        self._do_uncompressed(bits[2:])
        return self.MODE
    def _get_bits(self) -> str:
        """Return the current line up to ``_curpos`` as a string of digits."""
        decoded = self._curline[: self._curpos]
        return "".join(map(str, decoded))
    def _get_refline(self, i: int) -> str:
        """Return the reference line as a string with position ``i`` bracketed.
        An ``i`` below zero puts empty brackets before the line; an ``i`` at or
        past its end puts them after it.
        """
        ref = self._refline
        if i < 0:
            return "[]" + "".join(map(str, ref))
        if len(ref) <= i:
            return "".join(map(str, ref)) + "[]"
        before = "".join(map(str, ref[:i]))
        marked = "[" + str(ref[i]) + "]"
        after = "".join(map(str, ref[i + 1 :]))
        return before + marked + after
    def reset(self) -> None:
        """Return the parser to its initial state: line counter at zero, an
        all-ones current line that becomes the reference line, and mode-state
        parsing.
        """
        self._accept = self._parse_mode
        self._state = self.MODE
        self._y = 0
        # _reset_line() moves this all-ones line into _refline.
        self._curline = array.array("b", [1] * self.width)
        self._reset_line()
    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Print line number ``y`` followed by ``bits`` joined into one string."""
        rendered = "".join(map(str, bits))
        print(y, rendered)
    def _reset_line(self) -> None:
        """Start a new line: the current line becomes the reference line, a
        fresh all-ones line becomes current, the position goes back to -1 and
        the color to 1.
        """
        self._color = 1
        self._curpos = -1
        self._refline = self._curline
        self._curline = array.array("b", [1] * self.width)
    def _flush_line(self) -> None:
        """Emit the current line once the position has reached ``width``.
        The line is passed to ``output_line``, the line counter advances and a
        new line is started. Nothing happens while the line is incomplete.
        :raises ByteSkip: after a completed line when ``bytealign`` is set.
        """
        if self._curpos < self.width:
            return
        self.output_line(self._y, self._curline)
        self._y += 1
        self._reset_line()
        if self.bytealign:
            raise self.ByteSkip
    def _do_vertical(self, dx: int) -> None:
        x1 = self._curpos + 1
        while 1:
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline):
                break
            elif (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        x1 += dx
        x0 = max(0, self._curpos)
        x1 = max(0, min(self.width, x1))
        if x1 < x0:
            for x in range(x1, x0):
                self._curline[x] = self._color
        elif x0 < x1:
            for x in range(x0, x1):
                self._curline[x] = self._color
        self._curpos = x1
        self._color = 1 - self._color
        return
    def _do_pass(self) -> None:
        x1 = self._curpos + 1
        while 1:
            if x1 == 0:
                if self._color == 1 and self._refline[x1] != self._color:
                    break
            elif x1 == len(self._refline):
                break
            elif (
                self._refline[x1 - 1] == self._color
                and self._refline[x1] != self._color
            ):
                break
            x1 += 1
        while 1:
            if x1 == 0:
                if self._color == 0 and self._refline[x1] == self._color:
                    break
            elif x1 == len(self._refline):
                break
            elif (
                self._refline[x1 - 1] != self._color
                and self._refline[x1] == self._color
            ):
                break
            x1 += 1
        for x in range(self._curpos, x1):
            self._curline[x] = self._color
        self._curpos = x1
        return
    def _do_horizontal(self, n1: int, n2: int) -> None:
        if self._curpos < 0:
            self._curpos = 0
        x = self._curpos
        for _ in range(n1):
            if len(self._curline) <= x:
                break
            self._curline[x] = self._color
            x += 1
        for _ in range(n2):
            if len(self._curline) <= x:
                break
            self._curline[x] = 1 - self._color
            x += 1
        self._curpos = x
        return
    def _do_uncompressed(self, bits: str) -> None:
        for c in bits:
            self._curline[self._curpos] = int(c)
            self._curpos += 1
            self._flush_line()
        return
 class CCITTFaxDecoder(CCITTG4Parser):
    """G4 parser that packs each decoded line into bytes, eight pixels per byte.

    Lines are appended to an internal buffer that ``close`` returns. When
    ``reversed`` is true every pixel is inverted (``1 - pixel``) before it
    is packed.
    """
    def __init__(
        self, width: int, bytealign: bool = False, reversed: bool = False
    ) -> None:
        """Create a decoder for lines of ``width`` pixels with an empty buffer."""
        CCITTG4Parser.__init__(self, width, bytealign=bytealign)
        self.reversed = reversed
        self._buf = b""
    def close(self) -> bytes:
        """Return all bytes accumulated so far."""
        return self._buf
    def output_line(self, y: int, bits: Sequence[int]) -> None:
        """Pack ``bits`` into bytes and append them to the buffer.

        ``y`` is not used. A trailing partial byte is padded with zero bits.
        """
        packed = array.array("B", [0]) * ((len(bits) + 7) // 8)
        pixels = [1 - b for b in bits] if self.reversed else bits
        for index, pixel in enumerate(pixels):
            if pixel:
                # Most significant bit first within each byte.
                packed[index >> 3] |= 0x80 >> (index & 7)
        self._buf = self._buf + packed.tobytes()
 def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
    """Decode CCITT fax ``data`` using the filter parameters in ``params``.

    Only ``K == -1`` is supported; any other value (including a missing
    ``K``) raises ``ValueError(K)``.

    ``Columns``, ``EncodedByteAlign`` and ``BlackIs1`` are read from
    ``params``. When one is absent or ``None`` the default from the PDF
    CCITTFaxDecode parameter table is used (1728, false, false) instead of
    passing ``None`` on, which previously crashed on a missing ``Columns``.
    """
    K = params.get("K")
    if K != -1:
        raise ValueError(K)
    cols = params.get("Columns")
    bytealign = params.get("EncodedByteAlign")
    black_is_1 = params.get("BlackIs1")
    parser = CCITTFaxDecoder(
        1728 if cols is None else cast(int, cols),
        bytealign=False if bytealign is None else cast(bool, bytealign),
        # BlackIs1 is what drives the decoder's pixel inversion.
        reversed=False if black_is_1 is None else cast(bool, black_is_1),
    )
    parser.feedbytes(data)
    return parser.close()
 # test
 def main(argv: List[str]) -> None:
    """Run the unit tests, or decode the files named in ``argv[1:]``.

    With no file arguments ``unittest.main()`` is run. Otherwise each path
    is decoded with a pygame-backed parser and saved to ``out.bmp``, so a
    later file overwrites the output of an earlier one. Each path must
    split into exactly six dot-separated parts; the fourth is used as the
    image width.
    """
    if not argv[1:]:
        import unittest
        unittest.main()
        return
    class Parser(CCITTG4Parser):
        """Parser that paints decoded lines onto a pygame surface."""
        def __init__(self, width: int, bytealign: bool = False) -> None:
            import pygame  # type: ignore[import]
            CCITTG4Parser.__init__(self, width, bytealign=bytealign)
            # Fixed surface height of 1000 rows.
            self.img = pygame.Surface((self.width, 1000))
            return
        def output_line(self, y: int, bits: Sequence[int]) -> None:
            for (x, b) in enumerate(bits):
                if b:
                    self.img.set_at((x, y), (255, 255, 255))
                else:
                    self.img.set_at((x, y), (0, 0, 0))
            return
        def close(self) -> None:
            import pygame
            pygame.image.save(self.img, "out.bmp")
            return
    for path in argv[1:]:
        # The context manager closes the file even when the name does not
        # have the expected shape or parsing raises.
        with open(path, "rb") as fp:
            (_, _, _k, w, _h, _) = path.split(".")
            parser = Parser(int(w))
            parser.feedbytes(fp.read())
            parser.close()
    return
--- a/pdfminer/cmap/78-EUC-H.pickle.gz
+++ b/pdfminer/cmap/78-EUC-H.pickle.gz
--- a/pdfminer/cmap/78-EUC-V.pickle.gz
+++ b/pdfminer/cmap/78-EUC-V.pickle.gz
--- a/pdfminer/cmap/78-H.pickle.gz
+++ b/pdfminer/cmap/78-H.pickle.gz
--- a/pdfminer/cmap/78-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/78-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/78-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/78-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/78-V.pickle.gz
+++ b/pdfminer/cmap/78-V.pickle.gz
--- a/pdfminer/cmap/78ms-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/78ms-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/78ms-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/78ms-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/83pv-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/83pv-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/83pv-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/83pv-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/90ms-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/90ms-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/90ms-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/90ms-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/90msp-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/90msp-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/90msp-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/90msp-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/90pv-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/90pv-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/90pv-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/90pv-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/Add-H.pickle.gz
+++ b/pdfminer/cmap/Add-H.pickle.gz
--- a/pdfminer/cmap/Add-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/Add-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/Add-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/Add-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/Add-V.pickle.gz
+++ b/pdfminer/cmap/Add-V.pickle.gz
--- a/pdfminer/cmap/B5-H.pickle.gz
+++ b/pdfminer/cmap/B5-H.pickle.gz
--- a/pdfminer/cmap/B5-V.pickle.gz
+++ b/pdfminer/cmap/B5-V.pickle.gz
--- a/pdfminer/cmap/B5pc-H.pickle.gz
+++ b/pdfminer/cmap/B5pc-H.pickle.gz
--- a/pdfminer/cmap/B5pc-V.pickle.gz
+++ b/pdfminer/cmap/B5pc-V.pickle.gz
--- a/pdfminer/cmap/CNS-EUC-H.pickle.gz
+++ b/pdfminer/cmap/CNS-EUC-H.pickle.gz
--- a/pdfminer/cmap/CNS-EUC-V.pickle.gz
+++ b/pdfminer/cmap/CNS-EUC-V.pickle.gz
--- a/pdfminer/cmap/CNS1-H.pickle.gz
+++ b/pdfminer/cmap/CNS1-H.pickle.gz
--- a/pdfminer/cmap/CNS1-V.pickle.gz
+++ b/pdfminer/cmap/CNS1-V.pickle.gz
--- a/pdfminer/cmap/CNS2-H.pickle.gz
+++ b/pdfminer/cmap/CNS2-H.pickle.gz
--- a/pdfminer/cmap/CNS2-V.pickle.gz
+++ b/pdfminer/cmap/CNS2-V.pickle.gz
--- a/pdfminer/cmap/ETHK-B5-H.pickle.gz
+++ b/pdfminer/cmap/ETHK-B5-H.pickle.gz
--- a/pdfminer/cmap/ETHK-B5-V.pickle.gz
+++ b/pdfminer/cmap/ETHK-B5-V.pickle.gz
--- a/pdfminer/cmap/ETen-B5-H.pickle.gz
+++ b/pdfminer/cmap/ETen-B5-H.pickle.gz
--- a/pdfminer/cmap/ETen-B5-V.pickle.gz
+++ b/pdfminer/cmap/ETen-B5-V.pickle.gz
--- a/pdfminer/cmap/ETenms-B5-H.pickle.gz
+++ b/pdfminer/cmap/ETenms-B5-H.pickle.gz
--- a/pdfminer/cmap/ETenms-B5-V.pickle.gz
+++ b/pdfminer/cmap/ETenms-B5-V.pickle.gz
--- a/pdfminer/cmap/EUC-H.pickle.gz
+++ b/pdfminer/cmap/EUC-H.pickle.gz
--- a/pdfminer/cmap/EUC-V.pickle.gz
+++ b/pdfminer/cmap/EUC-V.pickle.gz
--- a/pdfminer/cmap/Ext-H.pickle.gz
+++ b/pdfminer/cmap/Ext-H.pickle.gz
--- a/pdfminer/cmap/Ext-RKSJ-H.pickle.gz
+++ b/pdfminer/cmap/Ext-RKSJ-H.pickle.gz
--- a/pdfminer/cmap/Ext-RKSJ-V.pickle.gz
+++ b/pdfminer/cmap/Ext-RKSJ-V.pickle.gz
--- a/pdfminer/cmap/Ext-V.pickle.gz
+++ b/pdfminer/cmap/Ext-V.pickle.gz
--- a/pdfminer/cmap/GB-EUC-H.pickle.gz
+++ b/pdfminer/cmap/GB-EUC-H.pickle.gz
--- a/pdfminer/cmap/GB-EUC-V.pickle.gz
+++ b/pdfminer/cmap/GB-EUC-V.pickle.gz
--- a/pdfminer/cmap/GB-H.pickle.gz
+++ b/pdfminer/cmap/GB-H.pickle.gz
--- a/pdfminer/cmap/GB-V.pickle.gz
+++ b/pdfminer/cmap/GB-V.pickle.gz
--- a/pdfminer/cmap/GBK-EUC-H.pickle.gz
+++ b/pdfminer/cmap/GBK-EUC-H.pickle.gz
--- a/pdfminer/cmap/GBK-EUC-V.pickle.gz
+++ b/pdfminer/cmap/GBK-EUC-V.pickle.gz
--- a/pdfminer/cmap/GBK2K-H.pickle.gz
+++ b/pdfminer/cmap/GBK2K-H.pickle.gz
--- a/Show More
+++ b/Show More