20 lines
585 B
Python
20 lines
585 B
Python
|
from pdfminer.high_level import extract_text
|
|||
|
import pytesseract
|
|||
|
from pdf2image import convert_from_path
|
|||
|
|
|||
|
|
|||
|
# PDF to read. The same path is reused by the OCR fallback below.
file_path = 'Исх. № 0145-07-23 от 13.07.2023г. битум ГПК.pdf'

# Try direct text extraction with pdfminer first and show the result.
text = extract_text(file_path)
print(text)

# Fall back to OCR when direct extraction produced nothing usable.
# `str.isspace()` is False for the empty string, so test the stripped
# text instead: this covers both "" and whitespace-only results.
if not text.strip():
    # Convert the PDF to a series of images
    images = convert_from_path(file_path)

    # Extract text from each image and concatenate the results in order.
    # 'rus' is for Russian. 'eng' is for English.
    all_text = "".join(
        pytesseract.image_to_string(img, lang='rus+eng') for img in images
    )

    # Printed inside the branch: `all_text` only exists when OCR ran.
    print(all_text)
|