Upload 13 files
- .env +9 -0
- .gitignore +6 -0
- README.md +43 -11
- akhbarona_ar.py +79 -0
- al9anat_ar.py +169 -0
- app.py +123 -0
- config.json +92 -0
- file_id_mapping.json +1 -0
- google_drive_handle.py +64 -0
- hespress_ar.py +151 -0
- hespress_en.py +125 -0
- hespress_fr.py +153 -0
- liberation_fr.py +86 -0
.env
ADDED
@@ -0,0 +1,9 @@
+CLIENT_ID=960753990614-492j6pk974fjq94qvls3mcgee1sbi4dv.apps.googleusercontent.com
+PROJECT_ID=moroccan-news-aggregator
+AUTH_URI=https://accounts.google.com/o/oauth2/auth
+TOKEN_URI=https://oauth2.googleapis.com/token
+AUTH_PROVIDER_X509_CERT_URL=https://www.googleapis.com/oauth2/v1/certs
+CLIENT_SECRET=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
+REFRESH_TOKEN=1//04ayA66paFryZCgYIARAAGAQSNwF-L9IreeWNlmWv38CCaRqvv_W8VHEp7ysy1A36bTZk3ViCCE9pOabmcKPWNfyz6HJgYm0fkZs
+REDIRECT_URIS=https://developers.google.com/oauthplayground,http://localhost:8501,http://localhost:8080
+
.gitignore
ADDED
@@ -0,0 +1,6 @@
+file_id_mapping.json
+__pycache__/
+.env
+.gitignore
+.git/
+UI-design.pdf
README.md
CHANGED
@@ -1,13 +1,45 @@
-
-
-
-
-
-
-
-
-
-
+# Moroccan News Aggregator
+
+The Moroccan News Aggregator is a simple web scraping project designed to extract news articles from popular Moroccan news websites. The goal is to provide users with a convenient way to access and categorize news content from different sources.
+
+## Features
+
+- **Multi-Language Support:** Choose news articles in English, Arabic, or French from websites such as Hespress, Akhbarona, and Le360.
+
+- **Category Selection:** Select specific categories within each language to filter news articles based on your interests.
+
+- **Data Storage:** The scraped data is uploaded to Google Drive, ensuring easy access and sharing.
+
+## Setup Instructions
+
+1. **Clone the Repository:**
+
+   ```bash
+   git clone <repository-url>
+   ```
+
+2. **Install Dependencies:**
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Run the App:**
+
+   ```bash
+   streamlit run app.py
+   ```
+
+4. Follow the on-screen instructions to choose websites, languages, categories, and start scraping news articles.
+
+## Configuration
+
+Adjust settings in the `config.json` file to customize supported websites, languages, and categories.
+
+## License
+
+This project is licensed under the [MIT License](LICENSE).
+
 ---
 
-
+Feel free to explore and customize the project for your needs. If you encounter any issues or have suggestions for improvements, please let us know!
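As a quick illustration of the Configuration note in the README, here is a minimal sketch of how a category URL can be looked up in the nested site, language, category map of `config.json`. The keys below come from the `config.json` added in this commit; the snippet itself is a hypothetical helper, not part of the commit.

```python
import json

# config.json maps: site -> "languages" -> language code -> category -> URL
with open("config.json") as f:
    config = json.load(f)

# e.g. the English "politics" feed of Hespress
url = config["hespress"]["languages"]["en"]["politics"]
print(url)  # https://en.hespress.com/politics
```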
akhbarona_ar.py
ADDED
@@ -0,0 +1,79 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+}
+
+def scrape_article(article_url):
+    response = requests.get(article_url, headers=headers)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    title_tag = soup.find('h1')
+    title = title_tag.get_text(strip=True) if title_tag else 'No Title'
+
+    content_div = soup.find('div', id='article_holder')  # Ensure this is the correct ID
+    if content_div:
+        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))
+    else:
+        content = 'Content not found'
+
+    return {
+        'Title': title,
+        'Content': content
+    }
+
+def scrape_category(category_url, num_articles):
+    articles_scraped = 0
+    all_articles = []
+    page_num = 1
+
+    # Extract site and category from the URL
+    site_name = category_url.split('/')[2]  # This gets 'www.akhbarona.com' from the URL
+    site_name = site_name.replace('www.', '')
+    category_name = category_url.split('/')[-1]  # This gets the category name from the URL
+
+    while articles_scraped < num_articles:
+        paginated_url = f"{category_url}/index.{page_num}.html"
+
+        response = requests.get(paginated_url, headers=headers)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        article_links = soup.find_all('h2', class_='article_title')
+        for article_link in article_links:
+            a_tag = article_link.find('a')
+            if a_tag and 'href' in a_tag.attrs:
+                full_article_url = a_tag['href']
+                if not full_article_url.startswith('http'):
+                    full_article_url = f"{category_url}/{full_article_url}"
+                article_data = scrape_article(full_article_url)
+
+                all_articles.append(article_data)
+                articles_scraped += 1
+
+                if articles_scraped >= num_articles:
+                    break
+
+        if articles_scraped >= num_articles:
+            break
+
+        print(f"Going to next page: {paginated_url}")
+        page_num += 1  # Increment the page number
+
+    #csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
+    df = pd.DataFrame(all_articles)
+    csv_file_name = f"{site_name}_{category_name}_articles.csv"
+    csv_file_path = os.path.join(os.getcwd(), csv_file_name)  # Full file path
+    df.to_csv(csv_file_path, index=False)
+    print(f"Articles saved to {csv_file_path}")
+
+    return csv_file_path
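A minimal usage sketch for the module above, assuming the category URL comes from the `akhbarona` entry in `config.json`:

```python
from akhbarona_ar import scrape_category

# Scrapes up to 10 articles from the technology section and writes them to a CSV
csv_path = scrape_category("https://www.akhbarona.com/technology", 10)
print(csv_path)  # e.g. .../akhbarona.com_technology_articles.csv
```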
al9anat_ar.py
ADDED
@@ -0,0 +1,169 @@
+from typing_extensions import Self
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from selenium.common.exceptions import ElementClickInterceptedException
+from bs4 import BeautifulSoup
+import time
+import os
+import re
+import requests
+import json
+import csv
+from urllib.parse import urljoin
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+def download_image(img_url):
+    return img_url
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scroll_page(wd, max_scrolls=7, articles_per_load=6, max_attempts=5):
+    scroll_pause_time = 5
+    attempts = 0
+
+    for _ in range(max_scrolls):
+        current_articles = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
+        wd.execute_script("window.scrollBy(0, document.body.scrollHeight);")
+        time.sleep(scroll_pause_time)
+
+        try:
+            load_more_button = WebDriverWait(wd, 10).until(
+                EC.presence_of_element_located((By.XPATH, '//a[@class="ts-button load-button load-button-a ts-button-alt" and @href="#"]'))
+            )
+            wd.execute_script("arguments[0].scrollIntoView();", load_more_button)
+            wd.execute_script("arguments[0].click();", load_more_button)
+            attempts = 0  # Reset attempts after successful button click
+        except TimeoutException:
+            attempts += 1
+            if attempts >= max_attempts:
+                print("Maximum attempts reached without new articles. Exiting.")
+                return False  # Exit the function
+
+        new_article_count = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
+        if new_article_count > current_articles:
+            attempts = 0  # Reset attempts after successfully loading new articles
+        else:
+            attempts += 1
+            if attempts >= max_attempts:
+                print("No new articles found after several attempts. Exiting.")
+                return False  # Exit the function
+
+    return True
+
+
+def scrape_article_details(article_url, wd):
+    try:
+        # Validate the URL
+        if not article_url.startswith("http"):
+            article_url = "https://" + article_url
+        print("Navigating to:", article_url)
+
+        wd.get(article_url)
+        WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'the-post-tags')))  # Wait for a specific element to ensure the page has loaded
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='post-content cf entry-content content-spacious')
+        content = content_tag.get_text().strip() if content_tag else ""
+
+        category_tag = soup.find('span', class_='meta-item cat-labels')
+        category_from_article = category_tag.get_text().strip() if category_tag else "Uncategorized"
+
+        title_tag = soup.find('h1', class_='is-title post-title')
+        art_title = title_tag.get_text().strip() if title_tag else ""
+
+        date_tag = soup.find('span', class_='meta-item has-next-icon date')
+        date = date_tag.get_text().strip() if date_tag else ""
+
+        image_tag = soup.find('a', class_='image-link')
+        image_url = image_tag['href'] if image_tag else None
+        img_url = urljoin(article_url, image_url)
+        image_path = download_image(img_url) if image_url else None
+
+        return content, date, image_path, art_title, category_from_article
+    except TimeoutException:
+        print("Timed out waiting for page elements to load for URL:", article_url)
+        return "", "", None, "", ""
+    except Exception as e:
+        print(f"An error occurred while scraping article details at {article_url}: {str(e)}")
+        return "", "", None, "", ""
+
+
+def scrape_category(category_url, num_articles):
+    # Set up Chrome WebDriver with options
+    options = ChromeOptions()
+    options.add_argument('--headless')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+    options.add_argument('log-level=3')
+
+    # Initialize the Chrome WebDriver
+    wd = webdriver.Chrome(options=options)
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    articles_count = 0
+    wd.get(category_url)
+
+    # Adjusted to use num_articles for scrolling and loading articles
+    scroll_page(wd, max_scrolls=int(num_articles/6), articles_per_load=6)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('article', class_='l-post grid-base-post grid-post')
+
+    for article in articles[:num_articles]:  # Limit to num_articles
+        link_tag = article.find('a', class_='image-link media-ratio ratio-16-9')
+        link = link_tag['href'] if link_tag else ""
+        if link:
+            wd.get(link)
+            article_data = scrape_article_details(link, wd)
+            if article_data[0]:  # Check if content is non-empty
+                articles_data.append({
+                    "art_id": articles_count,
+                    "Title": article_data[3],
+                    "Date": article_data[1],
+                    "Category": article_data[4],
+                    "Content": article_data[0],
+                    "Link": link,
+                    "Image": article_data[2],
+                })
+                articles_count += 1
+                print(f"Article #{articles_count} scraped: {article_data[3]}")
+
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data.csv')
+    try:
+        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
+            fieldnames = ["art_id", "Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"Data written to {csv_file_path} successfully.")
+    except Exception as e:
+        print(f"Error writing data to file: {e}")
+
+    wd.quit()  # Close the WebDriver
+
+    print(f"Total articles scraped: {len(articles_data)}")
+    return csv_file_path
app.py
ADDED
@@ -0,0 +1,123 @@
+#web interface
+
+import streamlit as st
+import pandas as pd
+import json
+import importlib
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+import google_drive_handle as gdrive
+from dotenv import load_dotenv
+import os
+
+# Load config.json
+with open('config.json') as f:
+    config = json.load(f)
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+drive = gdrive.authenticate_google_drive()
+processed_files = set()
+st.markdown(
+    """
+    <style>
+    .centered {
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        text-align: center;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True
+)
+
+st.markdown("<h1 class='centered'>Moroccan News Aggregator</h1>", unsafe_allow_html=True)
+
+selected_websites = {}
+selected_categories = {}
+
+def save_file_id_mapping(file_id_mapping):
+    with open("file_id_mapping.json", "w") as file:
+        json.dump(file_id_mapping, file)
+
+def load_file_id_mapping():
+    try:
+        with open("file_id_mapping.json", "r") as file:
+            return json.load(file)
+    except FileNotFoundError:
+        return {}  # Return an empty dictionary if the file doesn't exist
+
+file_id_mapping = load_file_id_mapping()
+
+selected_websites = {}
+
+for website, details in config.items():
+    if st.checkbox(website, key=website):
+        # Language selection
+        languages = details.get("languages", {})
+        if languages and len(languages) > 1:
+            language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
+            selected_websites[website] = f"{website}_{language}"  # like: hespress_en
+        else:
+            selected_websites[website] = website  # like: akhbarona
+
+        # Category selection
+        categories = languages.get(language, {})
+        if categories:
+            categories = st.multiselect(f'Select categories for {website}', list(categories.keys()), key=f'{website}_categories')
+            selected_categories[website] = categories
+
+# Number of articles input
+num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)
+
+# Start scraping button
+if st.button('Start Scraping'):
+    with st.spinner('Scraping in progress...'):
+        progress_bar = st.progress(0)
+        total_tasks = sum(len(categories) for categories in selected_categories.values())
+        completed_tasks = 0
+        for website, module_name in selected_websites.items():
+            scraper_module = importlib.import_module(module_name)
+            for category in selected_categories.get(website, []):
+                category_url = config[website]['languages'][language][category]
+                if 'category_name' in config[website]:
+                    category_name = config[website]['category_name'].get(category, 'default_category_name')
+                file_path = scraper_module.scrape_category(category_url, num_articles)
+
+                if file_path:
+                    if file_path not in file_id_mapping:
+                        file_id = gdrive.upload_file_to_drive(drive, file_path)
+                        print(f"Uploading file: {file_path}, File ID: {file_id}")
+                        file_id_mapping[file_path] = file_id
+                        save_file_id_mapping(file_id_mapping)
+                    else:
+                        file_id = file_id_mapping[file_path]
+                        print(f"File already uploaded. Using existing File ID: {file_id}")
+
+                    if file_id:
+                        download_link = gdrive.get_drive_download_link(drive, file_id)
+                        if download_link:
+                            #st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)
+
+                            df = pd.read_csv(file_path)
+                            st.write(f"{website} - {category} Data:")
+                            st.dataframe(df)
+                        else:
+                            st.error(f"Failed to retrieve download link for file ID: {file_id}")
+                    else:
+                        st.error(f"Failed to upload file for {website} - {category}")
+                else:
+                    st.error(f"File not created for {website} - {category}")
+
+    st.success('Scraping Completed!')
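For reference, this is the contract `app.py` relies on: each selected website maps to a module name (with a language suffix when the site offers more than one language), and that module must expose `scrape_category(category_url, num_articles)` returning the path of the CSV it wrote. A minimal sketch, assuming the `hespress_en` module and a URL from `config.json`:

```python
import importlib

module_name = "hespress_en"  # "hespress" + "_" + chosen language code
scraper_module = importlib.import_module(module_name)

# Same call app.py makes for every selected category
csv_path = scraper_module.scrape_category("https://en.hespress.com/politics", 5)
print(csv_path)
```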
config.json
ADDED
@@ -0,0 +1,92 @@
+{
+    "hespress": {
+        "languages": {
+            "en": {
+                "politics": "https://en.hespress.com/politics",
+                "economy": "https://en.hespress.com/economy",
+                "society": "https://en.hespress.com/society",
+                "culture": "https://en.hespress.com/culture",
+                "sports": "https://en.hespress.com/sports",
+                "mena": "https://en.hespress.com/mena",
+                "international": "https://en.hespress.com/international"
+            },
+            "ar": {
+                "Politique": "https://www.hespress.com/politique",
+                "Economie": "https://www.hespress.com/economie",
+                "Tamazight": "https://www.hespress.com/tamazight",
+                "Sport": "https://www.hespress.com/sport",
+                "Societe": "https://www.hespress.com/societe",
+                "Culture": "https://www.hespress.com/art-et-culture",
+                "Medias": "https://www.hespress.com/medias",
+                "faits-divers": "https://www.hespress.com/faits-divers",
+                "Automoto": "https://www.hespress.com/automoto",
+                "Regions": "https://www.hespress.com/regions"
+            },
+            "fr": {
+                "Politique": "https://fr.hespress.com/politique",
+                "Economie": "https://fr.hespress.com/economie",
+                "Monde": "https://fr.hespress.com/monde",
+                "Sport": "https://fr.hespress.com/sport",
+                "Societe": "https://fr.hespress.com/societe",
+                "Culture": "https://fr.hespress.com/culture",
+                "Medias": "https://fr.hespress.com/media",
+                "High-tech": "https://fr.hespress.com/high-tech",
+                "Opinions": "https://fr.hespress.com/opinions",
+                "Regions": "https://fr.hespress.com/regions"
+            }
+        },
+        "module": "hespress"
+    },
+    "akhbarona": {
+        "languages": {
+            "ar": {
+                "economy": "https://www.akhbarona.com/economy",
+                "politic": "https://www.akhbarona.com/politic",
+                "national": "https://www.akhbarona.com/national",
+                "world": "https://www.akhbarona.com/world",
+                "health": "https://www.akhbarona.com/health",
+                "technology": "https://www.akhbarona.com/technology",
+                "culture": "https://www.akhbarona.com/culture",
+                "religion": "https://www.akhbarona.com/religion",
+                "last": "https://www.akhbarona.com/last"
+            },
+            "fr": {}
+        },
+        "module": "akhbarona"
+    },
+    "liberation": {
+        "languages": {
+            "fr": {
+                "Actualites": "https://www.libe.ma/Actualite_r5.html",
+                "Economie": "https://www.libe.ma/Economie_r10.html",
+                "Societe": "https://www.libe.ma/Societe_r7.html",
+                "culture": "https://www.libe.ma/Culture_r8.html",
+                "Sport": "https://www.libe.ma/Sport_r6.html",
+                "international": "https://www.libe.ma/Monde_r17.html",
+                "Entretien": "https://www.libe.ma/Entretien_r14.html",
+                "L'info": "https://www.libe.ma/L-info_r25.html",
+                "Portrait": "https://www.libe.ma/Portrait_r41.html",
+                "Horizon": "https://www.libe.ma/Horizons_r13.html",
+                "People": "https://www.libe.ma/People_r27.html"
+            },
+            "ar": {}
+        },
+        "module": "liberation"
+    },
+    "al9anat": {
+        "languages": {
+            "ar": {
+                "society": "https://www.al9anat.com/%d9%85%d8%ac%d8%aa%d9%85%d8%b9-2/",
+                "politic": "https://www.al9anat.com/%d8%b3%d9%8a%d8%a7%d8%b3%d8%a9/",
+                "economy": "https://www.al9anat.com/%d8%a5%d9%82%d8%aa%d8%b5%d8%a7%d8%af/",
+                "sport": "https://www.al9anat.com/%d8%b1%d9%8a%d8%a7%d8%b6%d8%a9/",
+                "art": "https://www.al9anat.com/%d8%a3%d8%af%d8%a8-%d9%88-%d9%81%d9%86%d9%88%d9%86/",
+                "international": "https://www.al9anat.com/%d8%af%d9%88%d9%84%d9%8a/",
+                "interviews": "https://www.al9anat.com/%d8%ad%d9%88%d8%a7%d8%b1%d8%a7%d8%aa/",
+                "Maroc_outWorld": "https://www.al9anat.com/%d9%85%d8%ba%d8%a7%d8%b1%d8%a8%d8%a9-%d8%a7%d9%84%d8%b9%d8%a7%d9%84%d9%85/"
+            },
+            "fr": {}
+        },
+        "module": "al9anat"
+    }
+}
file_id_mapping.json
ADDED
@@ -0,0 +1 @@
+{"C:\\Users\\bourh\\Desktop\\news-scraper\\medias_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\art-et-culture_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\societe_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\_data_ar.csv": "1-O9jOHOzUh51njw3s0hAyGJa4vLpmFo9", "C:\\Users\\bourh\\Desktop\\news-scraper\\economy_data_en.csv": "1W9ugILgVHwLjNNpG6Hg2fHMH92EiMqIt", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\economy_data_en.csv": "1W9ugILgVHwLjNNpG6Hg2fHMH92EiMqIt", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_technology_articles.csv": "1EYglfLNOrV99rSg93a7SRBzzClHEyrcX", "liberation_art.csv": "1DyOzZg7zfQZB_7nJK01y6SLI-vV16pyS", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\_data.csv": "1k_vpDJ7BjZ37_SUQuXp2XNTLXcPI3aMi", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_health_articles.csv": "16lE-sUbDspD-LlgMneHChHdLn_xgkvjh", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_religion_articles.csv": "1D-dArtoLqf6rT2e5CpcHigTcMy8Gssl7", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\monde_data_fr.csv": "1HPCxTaIQSFLRjToqRp6jzRBDES-C2Nqc", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\societe_data_fr.csv": "1KyGSL7Qb6X9Ru04D9qm5DXfgYSY2KHen", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_culture_articles.csv": "1fFOoItXzEbWxfn9maFAxT2VPgO3ZjD47", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_last_articles.csv": "1pjSbjCsraB1SA2vtchzsP17VbjW8z4rY", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_economy_articles.csv": "1rBJVKgEBZO__XuyVQos5pN49bneLwN36", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_national_articles.csv": "1dsIFUh_rDEQOD2X_B3tzBOQNzUcjRRNG", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\culture_data_en.csv": "1B12V7CW0UfTRyXn6fc4opOkz1ap88Gug"}
google_drive_handle.py
ADDED
@@ -0,0 +1,64 @@
+from dotenv import load_dotenv
+from pydrive.auth import GoogleAuth
+from pydrive.drive import GoogleDrive
+from oauth2client.client import OAuth2Credentials
+import os
+
+load_dotenv()
+
+CLIENT_ID = os.getenv('CLIENT_ID')
+CLIENT_SECRET = os.getenv('CLIENT_SECRET')
+REFRESH_TOKEN = os.getenv('REFRESH_TOKEN')
+REDIRECT_URI = os.getenv('REDIRECT_URIS').split(',')[0]  # Access the first URI
+
+def authenticate_google_drive():
+    gauth = GoogleAuth()
+    gauth.credentials = OAuth2Credentials(None, CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN, None,
+                                          "https://accounts.google.com/o/oauth2/token", None, "web")
+    drive = GoogleDrive(gauth)
+    return drive
+
+drive = authenticate_google_drive()
+
+def upload_file_to_drive(drive, file_path, folder_id=None):
+    if not os.path.exists(file_path):
+        print(f"Cannot upload, file does not exist at path: {file_path}")
+        return None
+
+    try:
+        file_metadata = {'title': os.path.basename(file_path)}
+        if folder_id:
+            file_metadata['parents'] = [{'id': folder_id}]
+
+        upload_file = drive.CreateFile(file_metadata)
+
+        # Check if the file already exists on Google Drive
+        existing_files = drive.ListFile({'q': f"title='{upload_file['title']}'"}).GetList()
+        if existing_files:
+            # File with the same name already exists, update the existing file
+            upload_file = existing_files[0]
+            print(f"File already exists on Drive. Updating file with ID: {upload_file['id']}")
+        else:
+            print("Uploading a new file to Drive.")
+
+        upload_file.SetContentFile(file_path)
+        upload_file.Upload()
+        print(f"File uploaded successfully. File ID: {upload_file['id']}")
+        return upload_file['id']
+    except Exception as e:
+        print(f"An error occurred during file upload: {e}")
+        return None
+
+
+def get_drive_download_link(drive, file_id):
+    try:
+        file = drive.CreateFile({'id': file_id})
+        file.Upload()  # Make sure the file exists on Drive
+        file.InsertPermission({
+            'type': 'anyone',
+            'value': 'anyone',
+            'role': 'reader'})
+        return "https://drive.google.com/uc?export=download&id=" + file_id
+    except Exception as e:
+        print(f"Error fetching download link: {e}")
+        return None
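A minimal usage sketch of the helper above, assuming the OAuth variables in `.env` are set and valid; the CSV name is the one written by `liberation_fr.py`:

```python
import google_drive_handle as gdrive

drive = gdrive.authenticate_google_drive()
file_id = gdrive.upload_file_to_drive(drive, "liberation_art.csv")
if file_id:
    # Makes the file publicly readable and returns a direct-download URL
    print(gdrive.get_drive_download_link(drive, file_id))
```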
hespress_ar.py
ADDED
@@ -0,0 +1,151 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+import time
+import re
+import os
+import requests
+import csv
+from urllib.parse import urljoin
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+def download_image(img_url):
+    return img_url
+
+def scroll_page(expected_article_count):
+    scroll_pause_time = 2
+    screen_height = wd.execute_script("return window.innerHeight;")
+    scrolled_height = 0
+
+    while True:
+        scrolled_height += screen_height
+        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+        time.sleep(scroll_pause_time)
+        new_height = wd.execute_script("return document.body.scrollHeight")
+        if scrolled_height >= new_height:
+            break
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        articles = soup.find_all('div', class_='overlay card')
+        if len(articles) >= expected_article_count:
+            break
+
+def scrape_article_details(article_url):
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+        return content, date, img_url
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def scrape_article_details(article_url):
+
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))  # Adjusted to wait for article content
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+
+        return content, date, img_url
+
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scrape_category(category_url, num_articles):
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    wd.get(category_url)
+    scroll_page(num_articles)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('div', class_='overlay card')
+    for article in articles[:num_articles]:
+        link_tag = article.find('a', class_='stretched-link')
+        link = link_tag['href'] if link_tag else ""
+        title_tag = article.find('h3', class_='card-title')
+        title = title_tag.get_text().strip() if title_tag else ""
+        content, date, img_url = scrape_article_details(link)
+        article_data = {
+            "Title": title,
+            "Date": date,
+            "Category": category_url.split('/')[-1],
+            "Content": content,
+            "Link": link,
+            "Image": img_url
+        }
+        print(f"Scraping article: {title}, Link: {link}")
+        articles_data.append(article_data)
+
+    # Save scraped data to a CSV file
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_ar.csv')
+    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+    try:
+        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file_mode == 'w':
+                writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"CSV file saved successfully at {csv_file_path}")
+    except IOError as e:
+        print(f"Failed to save file at {csv_file_path}: {e}")
+        return None  # Return None to indicate failure
+
+    # Check if the file exists before uploading
+    if os.path.exists(csv_file_path):
+        print(f"File successfully created at {csv_file_path}")
+        return csv_file_path
+    else:
+        print(f"Failed to create file for {category_url}")
+        return None
hespress_en.py
ADDED
@@ -0,0 +1,125 @@
+from selenium import webdriver
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+import time
+import re
+import os
+import requests
+import csv
+from urllib.parse import urljoin
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+
+def download_image(img_url):
+    return img_url
+
+def scroll_page(expected_article_count):
+    scroll_pause_time = 2
+    screen_height = wd.execute_script("return window.innerHeight;")
+    scrolled_height = 0
+
+    while True:
+        scrolled_height += screen_height
+        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+        time.sleep(scroll_pause_time)
+        new_height = wd.execute_script("return document.body.scrollHeight")
+        if scrolled_height >= new_height:
+            break
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        articles = soup.find_all('div', class_='overlay card')
+        if len(articles) >= expected_article_count:
+            break
+
+def scrape_article_details(article_url):
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+        return content, date, img_url
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scrape_category(category_url, num_articles):
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    wd.get(category_url)
+    scroll_page(num_articles)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('div', class_='overlay card')
+    for article in articles[:num_articles]:
+        link_tag = article.find('a', class_='stretched-link')
+        link = link_tag['href'] if link_tag else ""
+        title_tag = article.find('h3', class_='card-title')
+        title = title_tag.get_text().strip() if title_tag else ""
+        content, date, img_url = scrape_article_details(link)
+        article_data = {
+            "Title": title,
+            "Date": date,
+            "Category": category_url.split('/')[-1],
+            "Content": content,
+            "Link": link,
+            "Image": img_url
+        }
+        print(f"Scraping article: {title}, Link: {link}")
+        articles_data.append(article_data)
+
+    # Save scraped data to a CSV file
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
+    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+    try:
+        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file_mode == 'w':
+                writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"CSV file saved successfully at {csv_file_path}")
+    except IOError as e:
+        print(f"Failed to save file at {csv_file_path}: {e}")
+        return None  # Return None to indicate failure
+
+    # Check if the file exists before uploading
+    if os.path.exists(csv_file_path):
+        print(f"File successfully created at {csv_file_path}")
+        return csv_file_path
+    else:
+        print(f"Failed to create file for {category_url}")
+        return None
hespress_fr.py
ADDED
@@ -0,0 +1,153 @@
+from selenium import webdriver
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+import time
+import re
+import os
+import requests
+import csv
+from urllib.parse import urljoin
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+def download_image(img_url):
+    return img_url
+
+def scroll_page(expected_article_count):
+    scroll_pause_time = 2
+    screen_height = wd.execute_script("return window.innerHeight;")
+    scrolled_height = 0
+
+    while True:
+        scrolled_height += screen_height
+        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+        time.sleep(scroll_pause_time)
+        new_height = wd.execute_script("return document.body.scrollHeight")
+        if scrolled_height >= new_height:
+            break
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        articles = soup.find_all('div', class_='overlay card')
+        if len(articles) >= expected_article_count:
+            break
+
+def scrape_article_details(article_url):
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+        return content, date, img_url
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def scrape_article_details(article_url):
+
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))  # Adjusted to wait for article content
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+
+        return content, date, img_url
+
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scrape_category(category_url, num_articles):
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    wd.get(category_url)
+    scroll_page(num_articles)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('div', class_='overlay card')
+    for article in articles[:num_articles]:
+        link_tag = article.find('a', class_='stretched-link')
+        link = link_tag['href'] if link_tag else ""
+        title_tag = article.find('h3', class_='card-title')
+        title = title_tag.get_text().strip() if title_tag else ""
+        content, date, img_url = scrape_article_details(link)
+        article_data = {
+            "Title": title,
+            "Date": date,
+            "Category": category_url.split('/')[-1],
+            "Content": content,
+            "Link": link,
+            "Image": img_url
+        }
+        print(f"Scraping article: {title}, Link: {link}")
+        articles_data.append(article_data)
+
+    # Save scraped data to a CSV file
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_fr.csv')
+    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+    try:
+        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file_mode == 'w':
+                writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"CSV file saved successfully at {csv_file_path}")
+    except IOError as e:
+        print(f"Failed to save file at {csv_file_path}: {e}")
+        return None  # Return None to indicate failure
+
+    # Check if the file exists before uploading
+    if os.path.exists(csv_file_path):
+        print(f"File successfully created at {csv_file_path}")
+        return csv_file_path
+    else:
+        print(f"Failed to create file for {category_url}")
+        return None
liberation_fr.py
ADDED
@@ -0,0 +1,86 @@
+# Import required libraries
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+import time
+import timeit
+
+# Headers for simulating a browser request
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+}
+
+def faire_requete(url):
+    """
+    Make an HTTP request with error handling
+
+    Args:
+        url (str): the URL of the HTTP request
+
+    Returns:
+        bytes or None: the response content if the request succeeds, otherwise None.
+    """
+    try:
+        with requests.get(url, headers=headers) as reponse:
+            reponse.raise_for_status()
+            return reponse.content
+    except requests.RequestException as e:
+        print(f"HTTP request error: {e}")
+        return None
+
+def extract_articles(category_url, num_articles):
+    temps_debut = timeit.default_timer()
+    liens_articles = []
+    current_count = 0
+
+    while current_count < num_articles:
+        time.sleep(2)
+        contenu = faire_requete(category_url + f"?start={current_count}&order=")
+
+        if contenu:
+            soup = BeautifulSoup(contenu, "html.parser")
+            liens = soup.find_all("h3", {"class": "titre_article"})
+            for lien in liens:
+                if current_count >= num_articles:
+                    break
+                liens_articles.append("https://www.libe.ma" + lien.a["href"])
+                current_count += 1
+
+    lignes = []
+    for lien in liens_articles:
+        time.sleep(2)
+        contenu = faire_requete(lien)
+        if contenu:
+            soup = BeautifulSoup(contenu, "html.parser")
+            try:
+                titre = soup.find("h1", {"class": "access"}).text.replace("\n", "").strip()
+            except:
+                titre = None
+            try:
+                description = soup.find("div", {"class": "access firstletter"}).text.replace("\n", "").strip()
+            except:
+                description = None
+            try:
+                date = soup.find("div", {"class": "date"}).text.replace("\n", "").strip()
+            except:
+                date = None
+            lignes.append([titre, description, date])
+
+    return lignes
+
+def scrape_category(category_url, num_articles):
+    article_data = extract_articles(category_url, num_articles)
+
+    colonnes = ["titre", "content", "date"]
+    articles_df = pd.DataFrame(article_data, columns=colonnes)
+
+    csv_file_path = "liberation_art.csv"
+    articles_df.to_csv(csv_file_path, index=False)
+
+    return csv_file_path
+'''
+if __name__ == "__main__":
+    category_url = "https://www.libe.ma/Economie_r10.html"
+    num_articles = 10  # Number of articles to scrape
+    csv_file_path = scrape_category(category_url, num_articles)
+    # Now, csv_file_path can be used in Streamlit for uploading
+'''