website_bot/url_finder.py

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

BASE_URL = "https://3ddd.ru/3dmodels/"
# Path to the ChromeDriver binary, passed to the Chrome service below.
CHROME_DRIVER_PATH = 'C:\\Program Files\\Google\\Chrome\\Application\\chromedriver-win64\\chromedriver.exe'

def get_item_urls(page_number=1):
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')     # Commented out for now to see the loaded page
    # options.add_argument('--disable-gpu')  # Disable GPU acceleration, useful in headless mode
    driver = webdriver.Chrome(service=Service(CHROME_DRIVER_PATH), options=options)
    try:
        driver.get(f"{BASE_URL}?page={page_number}")
        # Wait for dynamically loaded content to render. You can adjust the sleep time.
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Debugging prints
        print(soup.prettify()[:1000])  # Print out first 1000 characters of page source
        all_links = soup.find_all('a', href=True)
        print(f"Total 'a' tags found: {len(all_links)}")

        # Keep only the 'a' tags whose href matches the model-page pattern
        urls = ['https://3ddd.ru' + link['href'] for link in all_links if '/3dmodels/show/' in link['href']]
    finally:
        driver.quit()
    return urls
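

# A possible multi-page helper (a sketch, not part of the original script):
# get_item_urls already accepts a page_number, so walking several catalogue
# pages is just a loop. The function name, the max_pages parameter, and the
# dict-based deduplication are illustrative assumptions, not existing API.
def get_all_item_urls(max_pages=3):
    """Collect model URLs from the first `max_pages` catalogue pages, keeping order and dropping duplicates."""
    seen = {}
    for page in range(1, max_pages + 1):
        for url in get_item_urls(page):
            seen[url] = None  # dict preserves insertion order and deduplicates
    return list(seen)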


if __name__ == "__main__":
    # Example: get item URLs from the first catalogue page
    urls = get_item_urls()
    for url in urls:
        print(url)