From 886018e8f0238cb23d1ff4812b86e5d5492d61cd Mon Sep 17 00:00:00 2001
From: MSI
Date: Fri, 25 Aug 2023 14:55:39 +0600
Subject: [PATCH] Add image downloader bot and URL finder script

---
 main.py       | 107 +++++++++++++++++++++++++++++++++++++++++++++++++
 url_finder.py |  40 ++++++++++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100644 main.py
 create mode 100644 url_finder.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..35df407
--- /dev/null
+++ b/main.py
@@ -0,0 +1,107 @@
+import requests
+from bs4 import BeautifulSoup
+import os
+import time
+import random
+import hashlib
+import threading
+from PIL import Image
+import io
+
+class ImageDownloaderBot:
+    def __init__(self, folder_name="downloaded_images"):
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        }
+        self.folder_name = folder_name
+        self.downloaded_images = set()
+        self.stop_bot = False
+        self.thread = None
+
+    def _get_image_hash(self, content):
+        return hashlib.md5(content).hexdigest()
+
+    def resize_image(self, content, base_width=300):
+        image = Image.open(io.BytesIO(content))
+        w_percent = base_width / float(image.size[0])
+        h_size = int(float(image.size[1]) * w_percent)
+        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the equivalent filter
+        image = image.resize((base_width, h_size), Image.LANCZOS)
+        # JPEG has no alpha channel, so normalise RGBA/P images to RGB first
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        with io.BytesIO() as output:
+            image.save(output, format="JPEG")
+            return output.getvalue()
+
+    def get_images_from_url(self, url):
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, 'html.parser')
+            img_tags = soup.find_all('img')
+
+            count = 0
+            for img_tag in img_tags:
+                if self.stop_bot:
+                    print("Stopping bot as requested.")
+                    return
+
+                img_url = img_tag.get('src', '')
+                if not img_url or not img_url.startswith(('http:', 'https:')):
+                    continue
+
+                content = requests.get(img_url, headers=self.headers, timeout=10).content
+                img_hash = self._get_image_hash(content)
+
+                if img_hash in self.downloaded_images:
+                    print(f"Skipped downloading duplicate image: {img_url}")
+                    continue
+
+                self.downloaded_images.add(img_hash)
+
+                # Resize the image
+                resized_content = self.resize_image(content)
+                # Create a directory based on the domain name
+                directory = os.path.join(self.folder_name, url.split("//")[-1].split("/")[0])
+                os.makedirs(directory, exist_ok=True)
+
+                # Drop any query string and force .jpg, since the content is re-encoded as JPEG
+                base_name = os.path.basename(img_url).split("?")[0] or f"image_{count}"
+                img_name = os.path.join(directory, os.path.splitext(base_name)[0] + ".jpg")
+                with open(img_name, 'wb') as f:
+                    f.write(resized_content)
+                count += 1
+
+                time.sleep(random.uniform(1, 3))
+
+            print(f"Downloaded {count} new images from {url}.")
+
+        except requests.RequestException as e:
+            print(f"Error during requests to {url}: {str(e)}")
+        except Exception as e:
+            print(f"Error: {str(e)}")
+
+    def start(self, url):
+        if self.thread and self.thread.is_alive():
+            print("Bot is already running.")
+            return
+
+        self.stop_bot = False
+        self.thread = threading.Thread(target=self.get_images_from_url, args=(url,))
+        self.thread.start()
+
+    def stop(self):
+        self.stop_bot = True
+        if self.thread:
+            self.thread.join()
+
+
+if __name__ == "__main__":
+    bot = ImageDownloaderBot()
+    url = "YOUR_WEBSITE_URL_HERE"
+
+    bot.start(url)
+    # To stop the bot at any point, you can call:
+    # bot.stop()
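
A quick sanity check for the resizing logic above, as a minimal sketch rather than part of the patch: it assumes main.py is importable from the working directory (the __main__ guard keeps the import side-effect free) and uses a synthetic 600x400 Pillow image in place of a real download.

# Sketch: verify that resize_image preserves aspect ratio and re-encodes as JPEG.
import io
from PIL import Image
from main import ImageDownloaderBot

bot = ImageDownloaderBot()
buf = io.BytesIO()
Image.new("RGB", (600, 400), "red").save(buf, format="PNG")  # synthetic input image

resized = bot.resize_image(buf.getvalue())  # default base_width=300
out = Image.open(io.BytesIO(resized))
assert out.size == (300, 200)  # width 300 with the 3:2 aspect ratio preserved
assert out.format == "JPEG"    # always re-encoded as JPEG, whatever the input format
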
diff --git a/url_finder.py b/url_finder.py
new file mode 100644
index 0000000..057be9a
--- /dev/null
+++ b/url_finder.py
@@ -0,0 +1,40 @@
+import time
+
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://3ddd.ru/3dmodels/"
+CHROME_DRIVER_PATH = 'C:\\Program Files\\Google\\Chrome\\Application\\chromedriver-win64\\chromedriver.exe'
+
+def get_item_urls(page_number=1):
+    options = webdriver.ChromeOptions()
+    # options.add_argument('--headless')     # Commented out for now to see the loaded page
+    # options.add_argument('--disable-gpu')  # Disable GPU acceleration, useful for headless runs
+
+    # Selenium 4 takes the driver path via a Service object, not an environment variable
+    driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=options)
+    try:
+        driver.get(f"{BASE_URL}?page={page_number}")
+
+        # Crude wait for dynamically loaded content; adjust the sleep time as needed
+        time.sleep(10)
+
+        soup = BeautifulSoup(driver.page_source, 'html.parser')
+
+        # Debugging prints
+        print(soup.prettify()[:1000])  # First 1000 characters of the page source
+        all_links = soup.find_all('a', href=True)
+        print(f"Total 'a' tags found: {len(all_links)}")
+
+        # Extract the 'a' tags that point at item detail pages
+        urls = ['https://3ddd.ru' + link['href'] for link in all_links if '/3dmodels/show/' in link['href']]
+    finally:
+        driver.quit()
+
+    return urls
+
+if __name__ == "__main__":
+    # Example: get items from the first listing page
+    for url in get_item_urls():
+        print(url)
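
The two scripts compose naturally: url_finder.py discovers the item pages and ImageDownloaderBot fetches the images on each. A minimal driver sketch, assuming both files are on the import path and that crawling 3ddd.ru is permitted by its terms of use (the folder name is illustrative):

# Sketch: feed the item URLs found by url_finder.py into the downloader.
from main import ImageDownloaderBot
from url_finder import get_item_urls

bot = ImageDownloaderBot(folder_name="3ddd_images")  # hypothetical output folder
for item_url in get_item_urls(page_number=1):
    # Call synchronously; bot.start(item_url) would instead spawn a thread per page.
    bot.get_images_from_url(item_url)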