# 3ddd.ru model-listing scraper (40 lines, 1.3 KiB, Python)
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
|
# Listing index of 3D models on 3ddd.ru; individual listing pages are
# selected with the ?page=N query parameter.
BASE_URL = "https://3ddd.ru/3dmodels/"

# Absolute path to a manually installed chromedriver binary (Windows layout).
CHROME_DRIVER_PATH = 'C:\\Program Files\\Google\\Chrome\\Application\\chromedriver-win64\\chromedriver.exe'

# BUGFIX: the previous `os.environ["webdriver.chrome.driver"] = ...` line is a
# Java-Selenium system-property idiom that the Python bindings ignore entirely
# (it was a no-op).  Prepending the driver's directory to PATH is how Python
# Selenium actually locates a manually installed chromedriver.
os.environ["PATH"] = os.path.dirname(CHROME_DRIVER_PATH) + os.pathsep + os.environ.get("PATH", "")
|
def get_item_urls(page_number=1):
    """Scrape one listing page of 3ddd.ru and return model-detail URLs.

    Parameters
    ----------
    page_number : int
        1-based listing page to fetch (default 1).

    Returns
    -------
    list[str]
        Absolute URLs of every link containing '/3dmodels/show/', in page
        order with duplicates removed.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')     # Commented out for now to see the loaded page
    # options.add_argument('--disable-gpu')  # Disable GPU acceleration, useful for headless

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(f"{BASE_URL}?page={page_number}")

        # BUGFIX: implicitly_wait() only affects find_element* lookups and
        # does nothing before reading page_source, so the old code could
        # snapshot the page before its JavaScript finished rendering.
        # Explicitly wait (up to 10 s) for at least one anchor tag instead.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "a"))
        )

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Debugging prints
        print(soup.prettify()[:1000])  # Print out first 1000 characters of page source
        all_links = soup.find_all('a', href=True)
        print(f"Total 'a' tags found: {len(all_links)}")

        # Keep only model-detail links; dict.fromkeys de-duplicates while
        # preserving first-seen order.
        urls = list(dict.fromkeys(
            'https://3ddd.ru' + link['href']
            for link in all_links
            if '/3dmodels/show/' in link['href']
        ))
    finally:
        driver.quit()  # Always release the browser, even when scraping fails.

    return urls
|
|
|
|
if __name__ == "__main__":
    # Example: get items from the main (first) listing page.
    # BUGFIX: guarded so that merely importing this module no longer
    # launches a Chrome browser as a side effect.
    urls = get_item_urls()
    for url in urls:
        print(url)
|