import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

from google_drive_handle import authenticate_google_drive

# Authenticate with Google Drive once at module load
drive = authenticate_google_drive()

# Send a browser-like User-Agent so the site serves normal pages
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def scrape_article(article_url):
    """Fetch one article page and return its title and body text."""
    response = requests.get(article_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The title is taken from the page's first <h1>
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No Title'

    # The body text is the <p> tags inside the div with id="article_holder"
    content_div = soup.find('div', id='article_holder')
    if content_div:
        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))
    else:
        content = 'Content not found'

    return {
        'Title': title,
        'Content': content
    }
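
# Usage sketch for scrape_article (the URL below is a hypothetical
# placeholder, not an address taken from this project):
#
#   article = scrape_article('https://www.example.com/news/some-story.html')
#   print(article['Title'])
#   print(article['Content'][:200])
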
def scrape_category(category_url, num_articles):
    """Walk a category's paginated index and scrape up to num_articles articles."""
    articles_scraped = 0
    all_articles = []
    page_num = 1

    # Derive the output file name from the URL, e.g.
    # https://www.example.com/politics -> site 'example.com', category 'politics'
    site_name = category_url.split('/')[2]
    site_name = site_name.replace('www.', '')
    category_name = category_url.split('/')[-1]

    while articles_scraped < num_articles:
        # Category pages are assumed to paginate as index.1.html, index.2.html, ...
        paginated_url = f"{category_url}/index.{page_num}.html"

        response = requests.get(paginated_url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Each listed article appears under an <h2 class="article_title">
        article_links = soup.find_all('h2', class_='article_title')
        if not article_links:
            # No listings on this page: stop instead of paging forever
            break

        for article_link in article_links:
            a_tag = article_link.find('a')
            if a_tag and 'href' in a_tag.attrs:
                full_article_url = a_tag['href']
                # Resolve links given relative to the category page
                if not full_article_url.startswith('http'):
                    full_article_url = f"{category_url}/{full_article_url}"
                article_data = scrape_article(full_article_url)

                all_articles.append(article_data)
                articles_scraped += 1

                if articles_scraped >= num_articles:
                    break

        if articles_scraped >= num_articles:
            break

        print(f"Going to next page: {paginated_url}")
        page_num += 1

    # Write the scraped articles to a CSV in the current working directory
    df = pd.DataFrame(all_articles)
    csv_file_name = f"{site_name}_{category_name}_articles.csv"
    csv_file_path = os.path.join(os.getcwd(), csv_file_name)
    df.to_csv(csv_file_path, index=False)
    print(f"Articles saved to {csv_file_path}")

    return csv_file_path
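
# Minimal entry-point sketch: the category URL below is a hypothetical
# placeholder; substitute a real category page whose index pages follow the
# index.<n>.html pattern assumed by scrape_category above.
if __name__ == '__main__':
    csv_path = scrape_category('https://www.example.com/politics', num_articles=10)
    print(f"Done, CSV saved at {csv_path}")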