"""Print the text of a PDF, falling back to OCR when no text is extracted.

The script first asks pdfminer for the document's text. If that result
contains nothing but whitespace (or is empty), every page is rendered to an
image with pdf2image and recognised with Tesseract, and the recognised text
is printed as well.

Usage:
    python some.py [path/to/file.pdf]

When no path is given, ``DEFAULT_PDF_PATH`` is used.
"""

# Provenance: this script was added as ``some.py`` by the patch
# "Committed script" together with a ``requirements.txt`` pinning:
#   cffi==1.15.1
#   charset-normalizer==3.2.0
#   cryptography==41.0.3
#   packaging==23.1
#   pdf2image==1.16.3
#   pdfminer.six==20221105
#   Pillow==10.0.0
#   pycparser==2.21
#   pytesseract==0.3.10

import sys

from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
import pytesseract

# Document processed when no path is supplied on the command line.
DEFAULT_PDF_PATH = 'Исх. № 0145-07-23 от 13.07.2023г. битум ГПК.pdf'

# Tesseract language spec: 'rus' is for Russian, 'eng' is for English.
OCR_LANGUAGES = 'rus+eng'


def has_visible_text(text):
    """Return True if *text* contains at least one non-whitespace character.

    ``str.isspace()`` is False for the empty string, so testing
    ``text.isspace()`` alone would treat an empty extraction result as
    "has text" and skip the OCR fallback. Stripping first handles both the
    empty and the whitespace-only case.
    """
    return bool(text.strip())


def ocr_pdf(file_path, lang=OCR_LANGUAGES):
    """Return the OCR text of every page of the PDF at *file_path*.

    Args:
        file_path: Path of the PDF to recognise.
        lang: Language spec passed to ``pytesseract.image_to_string``.

    Returns:
        The recognised text of all pages, concatenated in page order.
    """
    # Convert the PDF to a series of images, one per page.
    page_images = convert_from_path(file_path)

    # Extract text from each image and join once instead of repeated `+=`.
    return "".join(
        pytesseract.image_to_string(image, lang=lang) for image in page_images
    )


def main(argv=None):
    """Print the extracted text of a PDF, then the OCR text if needed.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. The first argument, if present, is the PDF
            path; otherwise ``DEFAULT_PDF_PATH`` is used.
    """
    if argv is None:
        argv = sys.argv[1:]
    file_path = argv[0] if argv else DEFAULT_PDF_PATH

    text = extract_text(file_path)
    print(text)

    if not has_visible_text(text):
        print(ocr_pdf(file_path))


if __name__ == "__main__":
    main()