diff --git a/CHANGELOG.md b/CHANGELOG.md index 17b8c6b..5107fbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +### Added +- Support for Paeth PNG filter compression (predictor value = 4) ([#537](https://github.com/pdfminer/pdfminer.six/pull/537)) + ### Fixed - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529)) - `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469)) diff --git a/README.md b/README.md index bed07bc..7b4ff50 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,12 @@ pdfminer.six *We fathom PDF* -Pdfminer.six is a community maintained fork of the original PDFMiner. It is a -tool for extracting information from PDF documents. It focuses on getting -and analyzing text data. Pdfminer.six extracts the text from a page directly -from the sourcecode of the PDF. It can also be used to get the exact location, -font or color of the text. +Pdfminer.six is a community maintained fork of the original PDFMiner. It is a tool for extracting information from PDF +documents. It focuses on getting and analyzing text data. Pdfminer.six extracts the text from a page directly from the +sourcecode of the PDF. It can also be used to get the exact location, font or color of the text. -It is built in a modular way such that each component of pdfminer.six can be -replaced easily. You can implement your own interpreter or rendering device -that uses the power of pdfminer.six for other purposes than text analysis. +It is built in a modular way such that each component of pdfminer.six can be replaced easily. You can implement your own +interpreter or rendering device that uses the power of pdfminer.six for other purposes than text analysis. Check out the full documentation on [Read the Docs](https://pdfminersix.readthedocs.io). @@ -24,31 +21,31 @@ Check out the full documentation on Features -------- - * Written entirely in Python. - * Parse, analyze, and convert PDF documents. - * PDF-1.7 specification support. (well, almost). - * CJK languages and vertical writing scripts support. - * Various font types (Type1, TrueType, Type3, and CID) support. - * Support for extracting images (JPG, JBIG2 and Bitmaps). - * Support for RC4 and AES encryption. - * Support for AcroForm interactive form extraction. - * Table of contents extraction. - * Tagged contents extraction. - * Automatic layout analysis. - +* Written entirely in Python. +* Parse, analyze, and convert PDF documents. +* PDF-1.7 specification support. (well, almost). +* CJK languages and vertical writing scripts support. +* Various font types (Type1, TrueType, Type3, and CID) support. +* Support for extracting images (JPG, JBIG2, Bitmaps). +* Support for various compressions (ASCIIHexDecode, ASCII85Decode, LZWDecode, FlateDecode, RunLengthDecode, + CCITTFaxDecode) +* Support for RC4 and AES encryption. +* Support for AcroForm interactive form extraction. +* Table of contents extraction. +* Tagged contents extraction. +* Automatic layout analysis. How to use ---------- - * Install Python 3.6 or newer. - * Install +* Install Python 3.6 or newer. +* Install - `pip install pdfminer.six` + `pip install pdfminer.six` - * Use command-line interface to extract text from pdf: - - `python pdf2txt.py samples/simple1.pdf` +* Use command-line interface to extract text from pdf: + `python pdf2txt.py samples/simple1.pdf` Contributing ------------ diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 0c67ff1..d6507ef 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -77,44 +77,116 @@ def compatible_encode_method(bytesorstring, encoding='utf-8', return bytesorstring.decode(encoding, erraction) +def paeth_predictor(left, above, upper_left): + # From http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html + # Initial estimate + p = left + above - upper_left + # Distances to a,b,c + pa = abs(p - left) + pb = abs(p - above) + pc = abs(p - upper_left) + + # Return nearest of a,b,c breaking ties in order a,b,c + if pa <= pb and pa <= pc: + return left + elif pb <= pc: + return above + else: + return upper_left + + def apply_png_predictor(pred, colors, columns, bitspercomponent, data): + """Reverse the effect of the PNG predictor + + Documentation: http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html + """ if bitspercomponent != 8: - # unsupported - raise ValueError("Unsupported `bitspercomponent': %d" % - bitspercomponent) + msg = "Unsupported `bitspercomponent': %d" % bitspercomponent + raise ValueError(msg) + nbytes = colors * columns * bitspercomponent // 8 + bpp = colors * bitspercomponent // 8 # number of bytes per complete pixel buf = b'' - line0 = b'\x00' * columns - for i in range(0, len(data), nbytes + 1): - ft = data[i] - i += 1 - line1 = data[i:i + nbytes] - line2 = b'' - if ft == 0: - # PNG none - line2 += line1 - elif ft == 1: - # PNG sub (UNTESTED) - c = 0 - for b in line1: - c = (c + b) & 255 - line2 += bytes((c,)) - elif ft == 2: - # PNG up - for (a, b) in zip(line0, line1): - c = (a + b) & 255 - line2 += bytes((c,)) - elif ft == 3: - # PNG average (UNTESTED) - c = 0 - for (a, b) in zip(line0, line1): - c = ((c + a + b) // 2) & 255 - line2 += bytes((c,)) + line_above = b'\x00' * columns + for scanline_i in range(0, len(data), nbytes + 1): + filter_type = data[scanline_i] + line_encoded = data[scanline_i + 1:scanline_i + 1 + nbytes] + raw = b'' + + if filter_type == 0: + # Filter type 0: None + raw += line_encoded + + elif filter_type == 1: + # Filter type 1: Sub + # To reverse the effect of the Sub() filter after decompression, + # output the following value: + # Raw(x) = Sub(x) + Raw(x - bpp) + # (computed mod 256), where Raw() refers to the bytes already + # decoded. + for j, sub_x in enumerate(line_encoded): + if j - bpp < 0: + raw_x_bpp = 0 + else: + raw_x_bpp = int(raw[j - bpp]) + raw_x = (sub_x + raw_x_bpp) & 255 + raw += bytes((raw_x,)) + + elif filter_type == 2: + # Filter type 2: Up + # To reverse the effect of the Up() filter after decompression, + # output the following value: + # Raw(x) = Up(x) + Prior(x) + # (computed mod 256), where Prior() refers to the decoded bytes of + # the prior scanline. + for (up_x, prior_x) in zip(line_encoded, line_above): + raw_x = (up_x + prior_x) & 255 + raw += bytes((raw_x,)) + + elif filter_type == 3: + # Filter type 3: Average + # To reverse the effect of the Average() filter after + # decompression, output the following value: + # Raw(x) = Average(x) + floor((Raw(x-bpp)+Prior(x))/2) + # where the result is computed mod 256, but the prediction is + # calculated in the same way as for encoding. Raw() refers to the + # bytes already decoded, and Prior() refers to the decoded bytes of + # the prior scanline. + for j, average_x in enumerate(line_encoded): + if j - bpp < 0: + raw_x_bpp = 0 + else: + raw_x_bpp = int(raw[j - bpp]) + prior_x = int(line_above[j]) + raw_x = (average_x + (raw_x_bpp + prior_x) // 2) & 255 + raw += bytes((raw_x,)) + + elif filter_type == 4: + # Filter type 4: Paeth + # To reverse the effect of the Paeth() filter after decompression, + # output the following value: + # Raw(x) = Paeth(x) + # + PaethPredictor(Raw(x-bpp), Prior(x), Prior(x-bpp)) + # (computed mod 256), where Raw() and Prior() refer to bytes + # already decoded. Exactly the same PaethPredictor() function is + # used by both encoder and decoder. + for j, paeth_x in enumerate(line_encoded): + if j - bpp < 0: + raw_x_bpp = 0 + prior_x_bpp = 0 + else: + raw_x_bpp = int(raw[j - bpp]) + prior_x_bpp = int(line_above[j - bpp]) + prior_x = int(line_above[j]) + paeth = paeth_predictor(raw_x_bpp, prior_x, prior_x_bpp) + raw_x = (paeth_x + paeth) & 255 + raw += bytes((raw_x,)) + else: - # unsupported - raise ValueError("Unsupported predictor value: %d" % ft) - buf += line2 - line0 = line2 + raise ValueError("Unsupported predictor value: %d" % filter_type) + + buf += raw + line_above = raw return buf