Add tools/pdfstats.py which counts all LT* types in a PDF (#68)
parent
488545ddc7
commit
35a58ee5b5
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Exercise pdfminer, looking deeply into a PDF document, print some stats to stdout
|
||||
# Usage: pdfstats.py <PDF-filename>
|
||||
|
||||
import sys, os
|
||||
import collections
|
||||
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfdevice import PDFDevice
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import LAParams, LTContainer
|
||||
|
||||
|
||||
# File name of this script without its directory part; used in the
# banner and usage messages.
SCRIPT = os.path.basename(__file__)
|
||||
|
||||
def msg(*args, **kwargs):
    """Print the string forms of *args*, joined by single spaces.

    Output goes to ``sys.stdout`` unless the caller passes ``file=...``.
    All keyword arguments are forwarded to ``print``.
    """
    # Use setdefault instead of a hard-coded file= argument: passing both
    # file=sys.stdout and a caller-supplied ``file`` in **kwargs raised
    # TypeError (multiple values for keyword argument 'file').
    kwargs.setdefault('file', sys.stdout)
    print(' '.join(map(str, args)), **kwargs)
|
||||
|
||||
def flat_iter(obj):
    """Yield *obj*, then every object nested inside it, depth-first.

    An object is descended into only when it is an ``LTContainer``; its
    children are visited in the container's own iteration order, each one
    followed immediately by its own descendants.
    """
    # Stack of live iterators: the top one is the level being walked.
    cursors = [iter((obj,))]
    while cursors:
        for node in cursors[-1]:
            yield node
            if isinstance(node, LTContainer):
                # Descend now; the parent iterator keeps its position and
                # is resumed once this child level is exhausted.
                cursors.append(iter(node))
                break
        else:
            # Current level exhausted: go back up to the parent.
            cursors.pop()
|
||||
|
||||
def main(args):
    """Count the layout object types found in one PDF and print the totals.

    Args:
        args: command-line arguments without the program name. Exactly one
            element is expected: the path of the PDF file to inspect.

    Returns:
        1 when the number of arguments is wrong (after printing usage
        text); otherwise None, which ``sys.exit`` treats as success.

    Raises:
        PDFTextExtractionNotAllowed: if the document reports that it is
            not extractable.
    """
    msg(SCRIPT, args)

    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1

    infilename, = args

    lt_types = collections.Counter()
    # Bind the name up front: enumerate() never assigns it when the
    # document has no pages, and the summary below still needs a value.
    page_count = 0

    with open(infilename, 'rb') as pdf_file:

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)

        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            # The original raised with the undefined name ``filename``,
            # which produced a NameError instead of this exception.
            raise PDFTextExtractionNotAllowed(infilename)

        # Make a page iterator
        pages = PDFPage.create_pages(document)

        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # The interpreter pushes the page into the device; the layout
            # for that page must then be fetched from the device.
            interpreter.process_page(page)
            layout = device.get_result()

            lt_types.update(type(item).__name__ for item in flat_iter(layout))

    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
|
||||
|
||||
if __name__ == '__main__':
    # Run on the arguments after the program name and hand main()'s
    # return value to the interpreter as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|
Loading…
Reference in New Issue