from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
def scrape_sports(url):
    """Scrape article titles and links from a supported news-section URL."""
    # Initialize the WebDriver; webdriver-manager downloads a matching
    # chromedriver binary, so no local driver path is needed
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    driver.get(url)

    # Wait until the page is fully loaded
    WebDriverWait(driver, 20).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )

    # Initialize lists to store scraped data
    titles = []
    links = []
    wait = WebDriverWait(driver, 20)
    try:
        if url == "https://www.indiatoday.in/search/sports":
            # Note: this hashed class name comes from the site's front-end build
            # and may change without notice
            divs = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'B1S3_content__wrap__9mSB6')))
            for div in divs:
                try:
                    h3_tag = div.find_element(By.TAG_NAME, 'h3')
                    titles.append(h3_tag.text)
                    link = h3_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        elif url == "https://indianexpress.com/section/technology/":
            ul_element = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'article-list')))
            li_elements = ul_element.find_elements(By.TAG_NAME, 'li')
            for li in li_elements:
                try:
                    h3_tag = li.find_element(By.TAG_NAME, 'h3')
                    titles.append(h3_tag.text)
                    link = h3_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        elif url == "https://indianexpress.com/section/entertainment/":
            divs = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'articles')))
            for div in divs:
                try:
                    h2_tag = div.find_element(By.CLASS_NAME, 'title')
                    titles.append(h2_tag.text)
                    link = h2_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        else:
            # Fallback for other sites that use an 'articles' container
            divs = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'articles')))
            for div in divs:
                try:
                    h2_tag = div.find_element(By.TAG_NAME, 'h2')
                    titles.append(h2_tag.text)
                    link = h2_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
    except TimeoutException:
        print("Elements not found using CLASS_NAME. Trying XPath.")
        try:
            divs = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//*[@class="B1S3_B1__s3_widget__1S13T"]')))
            for div in divs:
                try:
                    h3_tag = div.find_element(By.TAG_NAME, 'h3')
                    titles.append(h3_tag.text)
                    link = h3_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        except TimeoutException:
            print("Elements not found using XPath.")
    finally:
        # Close the WebDriver even if scraping raises an unexpected error
        driver.quit()

    return titles, links
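# A minimal sketch of an alternative driver setup for headless environments
# (e.g. CI or a hosted Space with no display). The helper name is mine, not
# part of the original; the flags are standard Chrome switches. Swapping this
# into scrape_sports is left to the caller.
def make_headless_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")           # run without a visible window
    options.add_argument("--no-sandbox")             # often required in containers
    options.add_argument("--disable-dev-shm-usage")  # avoid the small /dev/shm in Docker
    return webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=options,
    )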
# Usage example
# url = "https://indianexpress.com/section/technology/"
# titles, links = scrape_sports(url)
# print("Titles:", titles)
# print("Links:", links)