parent
5114acdda6
commit
82fd443cb2
|
@ -0,0 +1,9 @@
|
|||
cffi==1.15.1
|
||||
charset-normalizer==3.2.0
|
||||
cryptography==41.0.3
|
||||
packaging==23.1
|
||||
pdf2image==1.16.3
|
||||
pdfminer.six==20221105
|
||||
Pillow==10.0.0
|
||||
pycparser==2.21
|
||||
pytesseract==0.3.10
|
|
@ -0,0 +1,19 @@
|
|||
from pdfminer.high_level import extract_text
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_path
|
||||
|
||||
|
||||
# Script body: print the text of a PDF, falling back to OCR when the PDF
# yields no usable text.
file_path = 'Исх. № 0145-07-23 от 13.07.2023г. битум ГПК.pdf'

# First attempt: let pdfminer pull text directly out of the PDF.
text = extract_text(file_path)
print(text)

# Fall back to OCR when the direct extraction produced nothing usable.
# `not text.strip()` is true for both an empty string and a whitespace-only
# string; `str.isspace()` alone returns False for "" and would skip the
# fallback in that case.
if not text.strip():
    # Convert the PDF to a series of images
    images = convert_from_path(file_path)

    # Extract text from each image and concatenate the results in order.
    # 'rus' is for Russian. 'eng' is for English.
    all_text = "".join(
        pytesseract.image_to_string(img, lang='rus+eng')
        for img in images
    )

    print(all_text)
|
Loading…
Reference in New Issue