from typing_extensions import Self
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
import os
import re
import requests
import json
import csv
from urllib.parse import urljoin

# Set up Chrome WebDriver with options
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('log-level=3')

# Initialize the Chrome WebDriver
wd = webdriver.Chrome(options=options)


def download_image(img_url):
    # Placeholder: returns the image URL unchanged rather than downloading
    # the file to disk.
    return img_url


def sanitize_filename(filename):
    # Strip characters that are unsafe in file names and normalize to
    # lowercase snake_case.
    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')


def scroll_page(wd, max_scrolls=7, articles_per_load=6, max_attempts=5):
    # Scroll the category page and click the "load more" button until either
    # max_scrolls is reached or no new articles appear after max_attempts tries.
    scroll_pause_time = 5
    attempts = 0
    for _ in range(max_scrolls):
        current_articles = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
        wd.execute_script("window.scrollBy(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        try:
            load_more_button = WebDriverWait(wd, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//a[@class="ts-button load-button load-button-a ts-button-alt" and @href="#"]')
                )
            )
            wd.execute_script("arguments[0].scrollIntoView();", load_more_button)
            wd.execute_script("arguments[0].click();", load_more_button)
            attempts = 0  # Reset attempts after a successful button click
        except TimeoutException:
            attempts += 1
            if attempts >= max_attempts:
                print("Maximum attempts reached without new articles. Exiting.")
                return False  # Exit the function
        new_article_count = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
        if new_article_count > current_articles:
            attempts = 0  # Reset attempts after successfully loading new articles
        else:
            attempts += 1
            if attempts >= max_attempts:
                print("No new articles found after several attempts. Exiting.")
                return False  # Exit the function
    return True


def scrape_article_details(article_url, wd):
    # Open a single article page and extract its content, date, image, title,
    # and category. Returns empty values if the page fails to load.
    try:
        # Validate the URL
        if not article_url.startswith("http"):
            article_url = "https://" + article_url
        print("Navigating to:", article_url)
        wd.get(article_url)
        # Wait for a specific element to ensure the page has loaded
        WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'the-post-tags')))
        soup = BeautifulSoup(wd.page_source, 'html.parser')

        content_tag = soup.find('div', class_='post-content cf entry-content content-spacious')
        content = content_tag.get_text().strip() if content_tag else ""

        category_tag = soup.find('span', class_='meta-item cat-labels')
        category_from_article = category_tag.get_text().strip() if category_tag else "Uncategorized"

        title_tag = soup.find('h1', class_='is-title post-title')
        art_title = title_tag.get_text().strip() if title_tag else ""

        date_tag = soup.find('span', class_='meta-item has-next-icon date')
        date = date_tag.get_text().strip() if date_tag else ""

        image_tag = soup.find('a', class_='image-link')
        image_url = image_tag['href'] if image_tag else None
        img_url = urljoin(article_url, image_url) if image_url else None
        image_path = download_image(img_url) if image_url else None

        return content, date, image_path, art_title, category_from_article
    except TimeoutException:
        print("Timed out waiting for page elements to load for URL:", article_url)
        return "", "", None, "", ""
    except Exception as e:
        print(f"An error occurred while scraping article details at {article_url}: {str(e)}")
        return "", "", None, "", ""


def scrape_category(category_url, num_articles):
    # Set up Chrome WebDriver with options
    options = ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('log-level=3')

    # Initialize the Chrome WebDriver
    wd = webdriver.Chrome(options=options)

    print("Attempting to scrape:", category_url)
    articles_data = []
    articles_count = 0

    wd.get(category_url)
    # Use num_articles to decide how many times to scroll and load more articles
    scroll_page(wd, max_scrolls=int(num_articles / 6), articles_per_load=6)

    soup = BeautifulSoup(wd.page_source, 'html.parser')
    articles = soup.find_all('article', class_='l-post grid-base-post grid-post')

    for article in articles[:num_articles]:  # Limit to num_articles
        link_tag = article.find('a', class_='image-link media-ratio ratio-16-9')
        link = link_tag['href'] if link_tag else ""
        if link:
            # scrape_article_details() navigates to the article page itself
            article_data = scrape_article_details(link, wd)
            if article_data[0]:  # Check if content is non-empty
                articles_data.append({
                    "art_id": articles_count,
                    "Title": article_data[3],
                    "Date": article_data[1],
                    "Category": article_data[4],
                    "Content": article_data[0],
                    "Link": link,
                    "Image": article_data[2],
                })
                articles_count += 1
                print(f"Article #{articles_count} scraped: {article_data[3]}")

    category_name = sanitize_filename(category_url.split("/")[-1])
    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data.csv')
    try:
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            fieldnames = ["art_id", "Title", "Date", "Category", "Content", "Link", "Image"]
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            for article in articles_data:
                writer.writerow(article)
        print(f"Data written to {csv_file_path} successfully.")
    except Exception as e:
        print(f"Error writing data to file: {e}")

    wd.quit()  # Close the WebDriver
    print(f"Total articles scraped: {len(articles_data)}")
    return csv_file_path