website_bot/url_finder.py

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

BASE_URL = "https://3ddd.ru/3dmodels/"
# Path to the ChromeDriver binary, passed to the Chrome service below.
CHROME_DRIVER_PATH = 'C:\\Program Files\\Google\\Chrome\\Application\\chromedriver-win64\\chromedriver.exe'

def get_item_urls(page_number=1):
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')     # Commented out for now to see the loaded page
    # options.add_argument('--disable-gpu')  # Disable GPU acceleration, useful in headless mode
    driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=options)
    try:
        driver.get(f"{BASE_URL}?page={page_number}")
        # Wait for dynamically loaded content to render. You can adjust the sleep time.
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Debugging prints
        print(soup.prettify()[:1000])  # Print out first 1000 characters of page source
        all_links = soup.find_all('a', href=True)
        print(f"Total 'a' tags found: {len(all_links)}")

        # Keep only the 'a' tags whose href matches the model-page pattern
        urls = ['https://3ddd.ru' + link['href'] for link in all_links if '/3dmodels/show/' in link['href']]
    finally:
        driver.quit()
    return urls
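

# A possible multi-page helper (a sketch, not part of the original script):
# get_item_urls already accepts a page_number, so walking several catalogue
# pages is just a loop. The function name, the max_pages parameter, and the
# dict-based deduplication are illustrative assumptions, not existing API.
def get_all_item_urls(max_pages=3):
    """Collect model URLs from the first `max_pages` catalogue pages, keeping order and dropping duplicates."""
    seen = {}
    for page in range(1, max_pages + 1):
        for url in get_item_urls(page):
            seen[url] = None  # dict preserves insertion order and deduplicates
    return list(seen)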


if __name__ == "__main__":
    # Example: get item URLs from the first catalogue page
    urls = get_item_urls()
    for url in urls:
        print(url)