2017-05-29 07:11:58 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
# Exercise pdfminer, looking deeply into a PDF document,
|
|
|
|
# and print some stats to stdout
|
2017-05-29 07:11:58 +00:00
|
|
|
# Usage: pdfstats.py <PDF-filename>
|
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
import sys
|
|
|
|
import os
|
2017-05-29 07:11:58 +00:00
|
|
|
import collections
|
|
|
|
|
|
|
|
from pdfminer.pdfparser import PDFParser
|
|
|
|
from pdfminer.pdfdocument import PDFDocument
|
|
|
|
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
|
|
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
|
|
from pdfminer.converter import PDFPageAggregator
|
|
|
|
from pdfminer.layout import LAParams, LTContainer
|
|
|
|
|
|
|
|
|
|
|
|
# File name of this script (directory part removed), shown in the usage text.
SCRIPT = os.path.basename(__file__)
|
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
2017-05-29 07:11:58 +00:00
|
|
|
def msg(*args, **kwargs):
    """Print all positional arguments as one space-separated line.

    Each argument is converted with ``str`` and the pieces are joined with
    a single space before printing, so the text is handed to ``print`` as
    one string.  Keyword arguments are forwarded to ``print`` unchanged
    (for example ``file=`` or ``end=``).
    """
    text = ' '.join(str(arg) for arg in args)
    print(text, **kwargs)  # noqa E999
|
|
|
|
|
2017-05-29 07:11:58 +00:00
|
|
|
|
|
|
|
def flat_iter(obj):
    """Yield ``obj`` and then everything nested inside it, depth-first.

    ``obj`` itself always comes first.  Only ``LTContainer`` instances are
    descended into; their children are yielded in iteration order, each
    child immediately followed by its own descendants (pre-order).
    """
    yield obj

    # Stack of iterators over the containers currently being walked; the
    # innermost container is on top.
    open_containers = []
    if isinstance(obj, LTContainer):
        open_containers.append(iter(obj))

    while open_containers:
        try:
            child = next(open_containers[-1])
        except StopIteration:
            # Innermost container is exhausted; resume its parent.
            open_containers.pop()
            continue

        yield child
        if isinstance(child, LTContainer):
            open_containers.append(iter(child))
|
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
2017-05-29 07:11:58 +00:00
|
|
|
def main(args):
    """Parse one PDF with pdfminer and print layout-object statistics.

    Prints the number of pages processed and, for every layout object
    found on any page (including nested ones), a count per class name.

    Args:
        args: Command-line arguments without the program name.  Exactly
            one element is expected: the path of the PDF file.

    Returns:
        1 when the number of arguments is wrong (usage text is printed
        first).  Otherwise falls off the end and returns None.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is
            false for the opened document.
    """
    msg(SCRIPT, args)

    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1

    infilename, = args

    # Class name of each layout object -> number of occurrences.
    lt_types = collections.Counter()

    # Start at zero so the summary below also works for a document that
    # yields no pages; the loop variable alone would never be bound then.
    page_count = 0

    with open(infilename, 'rb') as pdf_file:

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)

        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed(infilename)

        # Make a page iterator
        pages = PDFPage.create_pages(document)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # Stateful API: the interpreter processes the page into the
            # device, and the layout is then read back from the device.
            interpreter.process_page(page)
            layout = device.get_result()

            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
|
|
|
|
|
2019-12-29 20:20:20 +00:00
|
|
|
|
2017-05-29 07:11:58 +00:00
|
|
|
if __name__ == '__main__':
    # Run with the command-line arguments (program name stripped) and use
    # main()'s return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|