# News-Sentiment-Analysis / web_scrapper.py
# Selenium and webdriver-manager imports used throughout this module
# (previously pulled in via a star import from a local `imports` module)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
def scrape_sports(url):
    """Scrape headline titles and article links from a supported news listing page."""
    # Initialize the WebDriver; webdriver-manager downloads a matching
    # ChromeDriver, so no local chromedriver.exe path is needed
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    driver.get(url)

    # Wait until the page is fully loaded
    WebDriverWait(driver, 20).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )

    # Lists to store the scraped data
    titles = []
    links = []
    try:
        wait = WebDriverWait(driver, 20)
        if url == "https://www.indiatoday.in/search/sports":
            # India Today search results: each story sits in this content wrapper div
            divs = wait.until(EC.visibility_of_all_elements_located(
                (By.CLASS_NAME, 'B1S3_content__wrap__9mSB6')))
            for div in divs:
                try:
                    h3_tag = div.find_element(By.TAG_NAME, 'h3')
                    titles.append(h3_tag.text)
                    link = h3_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    # Skip cards that have no headline or link
                    continue
        elif url == "https://indianexpress.com/section/technology/":
            # Indian Express technology section: stories are <li> items in an article list
            ul_element = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'article-list')))
            li_elements = ul_element.find_elements(By.TAG_NAME, 'li')
            for li in li_elements:
                try:
                    h3_tag = li.find_element(By.TAG_NAME, 'h3')
                    titles.append(h3_tag.text)
                    link = h3_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        elif url == "https://indianexpress.com/section/entertainment/":
            # Indian Express entertainment section: the headline lives in a .title element
            divs = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'articles')))
            for div in divs:
                try:
                    h2_tag = div.find_element(By.CLASS_NAME, 'title')
                    titles.append(h2_tag.text)
                    link = h2_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        else:
            # Fallback layout for any other URL: article cards with an <h2> headline
            divs = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'articles')))
            for div in divs:
                try:
                    h2_tag = div.find_element(By.TAG_NAME, 'h2')
                    titles.append(h2_tag.text)
                    link = h2_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
    except TimeoutException:
        # Class-name selectors timed out; retry once with an XPath selector
        print("Elements not found using CLASS_NAME. Trying XPath.")
        try:
            divs = wait.until(EC.visibility_of_all_elements_located(
                (By.XPATH, '//*[@class="B1S3_B1__s3_widget__1S13T"]')))
            for div in divs:
                try:
                    h3_tag = div.find_element(By.TAG_NAME, 'h3')
                    titles.append(h3_tag.text)
                    link = h3_tag.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    links.append(link)
                except NoSuchElementException:
                    continue
        except TimeoutException:
            print("Elements not found using XPath.")
    finally:
        # Close the WebDriver even if scraping failed partway through
        driver.quit()
    return titles, links
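
# By default the scraper opens a visible Chrome window. A minimal headless
# variant, assuming Selenium's standard Options API (the "--headless=new"
# flag applies to recent Chrome releases; adjust for older versions):
#
# from selenium.webdriver.chrome.options import Options
# options = Options()
# options.add_argument("--headless=new")
# driver = webdriver.Chrome(
#     service=ChromeService(ChromeDriverManager().install()),
#     options=options,
# )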
# Usage example
# url = "https://indianexpress.com/section/technology/"
# titles, links = scrape_sports(url)
# print("Titles:", titles)
# print("Links:", links)