test/some.py

20 lines
585 B
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

from pdfminer.high_level import extract_text
import pytesseract
from pdf2image import convert_from_path
# Script: print the text of a PDF, using pdfminer first and falling back to
# Tesseract OCR when pdfminer yields no usable text.
file_path = 'Исх. № 0145-07-23 от 13.07.2023г. битум ГПК.pdf'

# First attempt: extract text directly from the PDF with pdfminer.
text = extract_text(file_path)
print(text)

# Fall back to OCR when the extracted text is empty or whitespace-only.
# `not text.strip()` is used instead of `text.isspace()` because
# `"".isspace()` is False, which would skip OCR for a completely empty result.
if not text.strip():
    # Convert the PDF to a series of images
    images = convert_from_path(file_path)
    # Extract text from each image and concatenate the results in order.
    # 'rus' is for Russian. 'eng' is for English.
    all_text = "".join(
        pytesseract.image_to_string(img, lang='rus+eng') for img in images
    )
    print(all_text)