Add tools/pdfstats.py which counts all LT* types in a PDF (#68)

2017-05-29 03:11:58 -04:00 · 2017-05-29 03:11:58 -04:00 · 35a58ee5b5
parent 488545ddc7
commit 35a58ee5b5
1 changed files with 80 additions and 0 deletions
--- a/tools/pdfstats.py
+++ b/tools/pdfstats.py
@ -0,0 +1,80 @@
 #!/usr/bin/env python3
 # Exercise pdfminer by looking deeply into a PDF document, then print some stats to stdout
 # Usage: pdfstats.py <PDF-filename>
 import sys, os
 import collections
 from pdfminer.pdfparser import PDFParser
 from pdfminer.pdfdocument import PDFDocument
 from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfdevice import PDFDevice
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LAParams, LTContainer
 # File name of this script (directory part stripped), used in usage messages.
 SCRIPT = os.path.basename(__file__)
 def msg(*args, **kwargs):
    """Print the given values as one space-separated line.

    Each positional argument is converted with ``str`` and the results are
    joined by single spaces. Output goes to ``sys.stdout`` unless the caller
    supplies ``file=``; all keyword arguments are forwarded to ``print``.
    """
    # Use setdefault instead of a hard-coded file= so that a caller-supplied
    # ``file`` does not collide with ours (a duplicate keyword is a TypeError).
    kwargs.setdefault('file', sys.stdout)
    print(' '.join(map(str, args)), **kwargs)
 def flat_iter(obj):
    """Yield ``obj`` and then, depth-first, everything nested inside it.

    An object is descended into only when it is an ``LTContainer``; its
    children are visited in iteration order, each one followed immediately
    by its own descendants (pre-order).
    """
    # Stack of iterators still being drained; the innermost one is on top.
    pending = [iter((obj,))]
    while pending:
        try:
            node = next(pending[-1])
        except StopIteration:
            pending.pop()
            continue
        yield node
        if isinstance(node, LTContainer):
            pending.append(iter(node))
 def main(args):
    """Parse one PDF file and print pdfminer-specific statistics.

    Prints the script name and arguments, then processes every page of the
    PDF with a ``PDFPageAggregator`` device and counts, by class name, each
    object yielded by ``flat_iter`` for the resulting page layouts. Finally
    prints the page count and the per-type counts.

    Args:
        args: Command-line arguments without the program name. Must contain
            exactly one item: the PDF filename.

    Returns:
        1 if the number of arguments is wrong (after printing usage);
        otherwise None.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is false.
    """
    msg(SCRIPT, args)
    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1
    infilename, = args
    lt_types = collections.Counter()
    # Bind the counter up front: the loop below never assigns it when the
    # document yields no pages, and the summary line still needs a value.
    page_count = 0
    with open(infilename, 'rb') as pdf_file:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)
        # Create a PDF document object that stores the document structure.
        # Supply the (empty) password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(infilename)
        # Make a page iterator
        pages = PDFPage.create_pages(document)
        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
            )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # The interpreter returns nothing here; the layout for the page
            # just processed is fetched from the device afterwards.
            interpreter.process_page(page)
            layout = device.get_result()
            lt_types.update(type(item).__name__ for item in flat_iter(layout))
    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
 # Script entry point: run main() on the command-line arguments and use its
 # return value as the process exit status.
 if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)