Added new_directory

commit 886018e8f0

@@ -0,0 +1,101 @@
import requests
from bs4 import BeautifulSoup
import os
import time
import random
import hashlib
import threading
from PIL import Image
import io


class ImageDownloaderBot:
    def __init__(self, folder_name="downloaded_images"):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.folder_name = folder_name
        self.downloaded_images = set()  # content hashes of images saved so far
        self.stop_bot = False           # cooperative stop flag checked by the worker thread
        self.thread = None

    def _get_image_hash(self, content):
        # Hash the raw bytes so the same image served from different URLs
        # is only downloaded once.
        return hashlib.md5(content).hexdigest()

    def resize_image(self, content, base_width=300):
        image = Image.open(io.BytesIO(content))
        # Scale to base_width while preserving the aspect ratio.
        w_percent = base_width / float(image.size[0])
        h_size = int(float(image.size[1]) * float(w_percent))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is its replacement.
        image = image.resize((base_width, h_size), Image.LANCZOS)
        # JPEG has no alpha channel, so flatten to RGB before re-encoding.
        image = image.convert("RGB")
        with io.BytesIO() as output:
            image.save(output, format="JPEG")
            # getvalue() must be called before the buffer is closed.
            return output.getvalue()

    def get_images_from_url(self, url):
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            img_tags = soup.find_all('img')

            count = 0
            for img_tag in img_tags:
                if self.stop_bot:
                    print("Stopping bot as requested.")
                    return

                img_url = img_tag.get('src', '')
                if not img_url or not img_url.startswith(('http:', 'https:')):
                    continue

                content = requests.get(img_url, headers=self.headers, timeout=10).content
                img_hash = self._get_image_hash(content)

                if img_hash in self.downloaded_images:
                    print(f"Skipped downloading duplicate image: {img_url}")
                    continue

                self.downloaded_images.add(img_hash)

                # Resize the image
                resized_content = self.resize_image(content)
                # Create a directory based on the domain name
                directory = os.path.join(self.folder_name, url.split("//")[-1].split("/")[0])
                os.makedirs(directory, exist_ok=True)

                # Note: the file keeps its original name from the URL even though
                # the content has been re-encoded as JPEG.
                img_name = os.path.join(directory, os.path.basename(img_url))
                with open(img_name, 'wb') as f:
                    f.write(resized_content)
                count += 1

                # Random pause between downloads to avoid hammering the server.
                time.sleep(random.uniform(1, 3))

            print(f"Downloaded {count} new images from {url}.")

        except requests.RequestException as e:
            print(f"Error during requests to {url}: {str(e)}")
        except Exception as e:
            print(f"Error: {str(e)}")

    def start(self, url):
        if self.thread and self.thread.is_alive():
            print("Bot is already running.")
            return

        self.stop_bot = False
        self.thread = threading.Thread(target=self.get_images_from_url, args=(url,))
        self.thread.start()

    def stop(self):
        self.stop_bot = True
        if self.thread:
            self.thread.join()


bot = ImageDownloaderBot()
url = "YOUR_WEBSITE_URL_HERE"

bot.start(url)
# To stop the bot at any point, you can call:
# bot.stop()
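
# A minimal timed run (a sketch, not part of the original commit): start() puts
# the download loop on a background thread, so a caller can bound the runtime by
# sleeping and then calling stop(), which raises the flag and joins the worker.
# The 30-second budget and the URL below are placeholders.
#
# bot = ImageDownloaderBot(folder_name="demo_images")
# bot.start("https://example.com")
# time.sleep(30)
# bot.stop()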

@@ -0,0 +1,39 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

BASE_URL = "https://3ddd.ru/3dmodels/"
# The driver path is handed to webdriver.Chrome via a Service object below;
# setting the Java-style "webdriver.chrome.driver" environment variable has
# no effect in Python Selenium.
CHROME_DRIVER_PATH = 'C:\\Program Files\\Google\\Chrome\\Application\\chromedriver-win64\\chromedriver.exe'


def get_item_urls(page_number=1):
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Commented out for now to see the loaded page
    # options.add_argument('--disable-gpu')  # Disable GPU acceleration, useful for headless

    driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=options)
    try:
        driver.get(f"{BASE_URL}?page={page_number}")

        # Wait for the content to load. Note that implicitly_wait() only applies
        # to find_element() calls; for JavaScript-rendered content an explicit
        # wait (or a short time.sleep) may be needed before reading page_source.
        driver.implicitly_wait(10)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Debugging prints
        print(soup.prettify()[:1000])  # Print the first 1000 characters of the page source
        all_links = soup.find_all('a', href=True)
        print(f"Total 'a' tags found: {len(all_links)}")

        # Extract all 'a' tags whose href matches the item-page pattern
        urls = ['https://3ddd.ru' + link['href'] for link in all_links if '/3dmodels/show/' in link['href']]
    finally:
        driver.quit()

    return urls


# Example: Get items from the main page
urls = get_item_urls()
for url in urls:
    print(url)
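
# A hypothetical extension (not part of the original commit): crawl several
# listing pages and hand each item page to the ImageDownloaderBot from the first
# file. The module name "image_downloader_bot" and the page range are assumptions.
#
# from image_downloader_bot import ImageDownloaderBot
#
# bot = ImageDownloaderBot(folder_name="3ddd_images")
# for page in range(1, 4):
#     for item_url in get_item_urls(page):
#         bot.get_images_from_url(item_url)  # synchronous, one item page at a time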