diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..71f2e06
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+cffi==1.15.1
+charset-normalizer==3.2.0
+cryptography==41.0.3
+packaging==23.1
+pdf2image==1.16.3
+pdfminer.six==20221105
+Pillow==10.0.0
+pycparser==2.21
+pytesseract==0.3.10
diff --git a/some.py b/some.py
new file mode 100644
index 0000000..c80d990
--- /dev/null
+++ b/some.py
@@ -0,0 +1,22 @@
+"""Print the text of a PDF, falling back to OCR when it has no text layer."""
+from pdfminer.high_level import extract_text
+import pytesseract
+from pdf2image import convert_from_path
+
+
+file_path = 'Исх. № 0145-07-23 от 13.07.2023г. битум ГПК.pdf'
+text = extract_text(file_path)
+print(text)
+# str.isspace() is False for an empty string, so a PDF that yields no text at
+# all would skip the OCR fallback; strip() treats empty and whitespace-only
+# results the same way.
+if not text.strip():
+    # Rasterize every page so Tesseract has images to read.
+    images = convert_from_path(file_path)
+
+    # 'rus+eng' runs the Russian and English language models together.
+    all_text = "".join(
+        pytesseract.image_to_string(img, lang='rus+eng') for img in images
+    )
+
+    print(all_text)