S-11 committed on
Commit
e45d093
1 Parent(s): 2b961fb

Upload 13 files

Files changed (13)
  1. .env +9 -0
  2. .gitignore +6 -0
  3. README.md +43 -11
  4. akhbarona_ar.py +79 -0
  5. al9anat_ar.py +169 -0
  6. app.py +123 -0
  7. config.json +92 -0
  8. file_id_mapping.json +1 -0
  9. google_drive_handle.py +64 -0
  10. hespress_ar.py +151 -0
  11. hespress_en.py +125 -0
  12. hespress_fr.py +153 -0
  13. liberation_fr.py +86 -0
.env ADDED
@@ -0,0 +1,9 @@
+ CLIENT_ID=960753990614-492j6pk974fjq94qvls3mcgee1sbi4dv.apps.googleusercontent.com
+ PROJECT_ID=moroccan-news-aggregator
+ AUTH_URI=https://accounts.google.com/o/oauth2/auth
+ TOKEN_URI=https://oauth2.googleapis.com/token
+ AUTH_PROVIDER_X509_CERT_URL=https://www.googleapis.com/oauth2/v1/certs
+ CLIENT_SECRET=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
+ REFRESH_TOKEN=1//04ayA66paFryZCgYIARAAGAQSNwF-L9IreeWNlmWv38CCaRqvv_W8VHEp7ysy1A36bTZk3ViCCE9pOabmcKPWNfyz6HJgYm0fkZs
+ REDIRECT_URIS=https://developers.google.com/oauthplayground,http://localhost:8501,http://localhost:8080
+
.gitignore ADDED
@@ -0,0 +1,6 @@
+ file_id_mapping.json
+ __pycache__/
+ .env
+ .gitignore
+ .git/
+ UI-design.pdf
README.md CHANGED
@@ -1,13 +1,45 @@
- ---
- title: A8 Moroccan News Aggregator
- emoji: 📊
- colorFrom: pink
- colorTo: red
- sdk: streamlit
- sdk_version: 1.31.0
- app_file: app.py
- pinned: false
- license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Moroccan News Aggregator
+
+ The Moroccan News Aggregator is a simple web-scraping project that extracts news articles from popular Moroccan news websites. The goal is to give users a convenient way to access and categorize news content from different sources.
+
+ ## Features
+
+ - **Multi-Language Support:** Choose news articles in English, Arabic, or French from websites such as Hespress, Akhbarona, and Le360.
+
+ - **Category Selection:** Select specific categories within each language to filter news articles based on your interests.
+
+ - **Data Storage:** The scraped data is uploaded to Google Drive for easy access and sharing.
+
+ ## Setup Instructions
+
+ 1. **Clone the repository:**
+
+ ```bash
+ git clone <repository-url>
+ ```
+
+ 2. **Install dependencies:**
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. **Run the app:**
+
+ ```bash
+ streamlit run app.py
+ ```
+
+ 4. Follow the on-screen instructions to choose websites, languages, and categories, then start scraping news articles.
+
+ ## Configuration
+
+ Adjust settings in the `config.json` file to customize the supported websites, languages, and categories.
+
+ ## License
+
+ This project is licensed under the [MIT License](LICENSE).
+
  ---

+ Feel free to explore and customize the project for your needs. If you encounter any issues or have suggestions for improvements, please let us know!
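To illustrate the README's Configuration note, here is a minimal sketch of how a category URL is resolved from the `config.json` shipped in this commit, mirroring the `config[website]['languages'][language][category]` lookup performed in `app.py` (the chosen site, language, and category are just example keys from that file):

```python
import json

# Load the committed configuration file: site -> languages -> category -> URL.
with open("config.json") as f:
    config = json.load(f)

# Example lookup, mirroring how app.py builds category_url.
website, language, category = "hespress", "en", "politics"
category_url = config[website]["languages"][language][category]
print(category_url)  # https://en.hespress.com/politics
```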
akhbarona_ar.py ADDED
@@ -0,0 +1,79 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import os
+ from google_drive_handle import authenticate_google_drive
+ drive = authenticate_google_drive()
+
+ headers = {
+     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+ }
+
+ def scrape_article(article_url):
+     response = requests.get(article_url, headers=headers)
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     title_tag = soup.find('h1')
+     title = title_tag.get_text(strip=True) if title_tag else 'No Title'
+
+     content_div = soup.find('div', id='article_holder')  # Ensure this is the correct ID
+     if content_div:
+         content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))
+     else:
+         content = 'Content not found'
+
+     return {
+         'Title': title,
+         'Content': content
+     }
+
+ def scrape_category(category_url, num_articles):
+     articles_scraped = 0
+     all_articles = []
+     page_num = 1
+
+     # Extract site and category from the URL
+     site_name = category_url.split('/')[2]  # This gets 'www.akhbarona.com' from the URL
+     site_name = site_name.replace('www.', '')
+     category_name = category_url.split('/')[-1]  # This gets the category name from the URL
+
+     while articles_scraped < num_articles:
+         paginated_url = f"{category_url}/index.{page_num}.html"
+
+         response = requests.get(paginated_url, headers=headers)
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         article_links = soup.find_all('h2', class_='article_title')
+         for article_link in article_links:
+             a_tag = article_link.find('a')
+             if a_tag and 'href' in a_tag.attrs:
+                 full_article_url = a_tag['href']
+                 if not full_article_url.startswith('http'):
+                     full_article_url = f"{category_url}/{full_article_url}"
+                 article_data = scrape_article(full_article_url)
+
+                 all_articles.append(article_data)
+                 articles_scraped += 1
+
+                 if articles_scraped >= num_articles:
+                     break
+
+         if articles_scraped >= num_articles:
+             break
+
+         print(f"Going to next page: {paginated_url}")
+         page_num += 1  # Increment the page number
+
+
+     #csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
+     df = pd.DataFrame(all_articles)
+     csv_file_name = f"{site_name}_{category_name}_articles.csv"
+     csv_file_path = os.path.join(os.getcwd(), csv_file_name)  # Full file path
+     df.to_csv(csv_file_path, index=False)
+     print(f"Articles saved to {csv_file_path}")
+
+     return csv_file_path
+
+
+
+
al9anat_ar.py ADDED
@@ -0,0 +1,169 @@
+ from typing_extensions import Self
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.by import By
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.common.action_chains import ActionChains
+ from selenium.common.exceptions import NoSuchElementException, TimeoutException
+ from selenium.common.exceptions import ElementClickInterceptedException
+ from bs4 import BeautifulSoup
+ import time
+ import os
+ import re
+ import requests
+ import json
+ import csv
+ from urllib.parse import urljoin
+
+
+
+
+ # Set up Chrome WebDriver with options
+ options = ChromeOptions()
+ options.add_argument('--headless')
+ options.add_argument('--no-sandbox')
+ options.add_argument('--disable-dev-shm-usage')
+ options.add_argument('log-level=3')
+
+
+ # Initialize the Chrome WebDriver
+ wd = webdriver.Chrome(options=options)
+
+ def download_image(img_url):
+     return img_url
+
+ def sanitize_filename(filename):
+     return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+ def scroll_page(wd, max_scrolls=7, articles_per_load=6, max_attempts=5):
+     scroll_pause_time = 5
+     attempts = 0
+
+     for _ in range(max_scrolls):
+         current_articles = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
+         wd.execute_script("window.scrollBy(0, document.body.scrollHeight);")
+         time.sleep(scroll_pause_time)
+
+         try:
+             load_more_button = WebDriverWait(wd, 10).until(
+                 EC.presence_of_element_located((By.XPATH, '//a[@class="ts-button load-button load-button-a ts-button-alt" and @href="#"]'))
+             )
+             wd.execute_script("arguments[0].scrollIntoView();", load_more_button)
+             wd.execute_script("arguments[0].click();", load_more_button)
+             attempts = 0  # Reset attempts after successful button click
+         except TimeoutException:
+             attempts += 1
+             if attempts >= max_attempts:
+                 print("Maximum attempts reached without new articles. Exiting.")
+                 return False  # Exit the function
+
+         new_article_count = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
+         if new_article_count > current_articles:
+             attempts = 0  # Reset attempts after successfully loading new articles
+         else:
+             attempts += 1
+             if attempts >= max_attempts:
+                 print("No new articles found after several attempts. Exiting.")
+                 return False  # Exit the function
+
+     return True
+
+
+
+ def scrape_article_details(article_url, wd):
+     try:
+         # Validate the URL
+         if not article_url.startswith("http"):
+             article_url = "https://" + article_url
+         print("Navigating to:", article_url)
+
+         wd.get(article_url)
+         WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'the-post-tags')))  # Wait for a specific element to ensure the page has loaded
+
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+         content_tag = soup.find('div', class_='post-content cf entry-content content-spacious')
+         content = content_tag.get_text().strip() if content_tag else ""
+
+         category_tag = soup.find('span', class_='meta-item cat-labels')
+         category_from_article = category_tag.get_text().strip() if category_tag else "Uncategorized"
+
+         title_tag = soup.find('h1', class_='is-title post-title')
+         art_title = title_tag.get_text().strip() if title_tag else ""
+
+         date_tag = soup.find('span', class_='meta-item has-next-icon date')
+         date = date_tag.get_text().strip() if date_tag else ""
+
+         image_tag = soup.find('a', class_='image-link')
+         image_url = image_tag['href'] if image_tag else None
+         img_url = urljoin(article_url, image_url)
+         image_path = download_image(img_url) if image_url else None
+
+         return content, date, image_path, art_title, category_from_article
+     except TimeoutException:
+         print("Timed out waiting for page elements to load for URL:", article_url)
+         return "", "", None, "", ""
+     except Exception as e:
+         print(f"An error occurred while scraping article details at {article_url}: {str(e)}")
+         return "", "", None, "", ""
+
+
+ def scrape_category(category_url, num_articles):
+     # Set up Chrome WebDriver with options
+     options = ChromeOptions()
+     options.add_argument('--headless')
+     options.add_argument('--no-sandbox')
+     options.add_argument('--disable-dev-shm-usage')
+     options.add_argument('log-level=3')
+
+     # Initialize the Chrome WebDriver
+     wd = webdriver.Chrome(options=options)
+     print("Attempting to scrape:", category_url)
+     articles_data = []
+     articles_count = 0
+     wd.get(category_url)
+
+     # Adjusted to use num_articles for scrolling and loading articles
+     scroll_page(wd, max_scrolls=int(num_articles/6), articles_per_load=6)
+
+     soup = BeautifulSoup(wd.page_source, 'html.parser')
+     articles = soup.find_all('article', class_='l-post grid-base-post grid-post')
+
+     for article in articles[:num_articles]:  # Limit to num_articles
+         link_tag = article.find('a', class_='image-link media-ratio ratio-16-9')
+         link = link_tag['href'] if link_tag else ""
+         if link:
+             wd.get(link)
+             article_data = scrape_article_details(link, wd)
+             if article_data[0]:  # Check if content is non-empty
+                 articles_data.append({
+                     "art_id": articles_count,
+                     "Title": article_data[3],
+                     "Date": article_data[1],
+                     "Category": article_data[4],
+                     "Content": article_data[0],
+                     "Link": link,
+                     "Image": article_data[2],
+                 })
+                 articles_count += 1
+                 print(f"Article #{articles_count} scraped: {article_data[3]}")
+
+     category_name = sanitize_filename(category_url.split("/")[-1])
+     csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data.csv')
+     try:
+         with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
+             fieldnames = ["art_id", "Title", "Date", "Category", "Content", "Link", "Image"]
+             writer = csv.DictWriter(file, fieldnames=fieldnames)
+             writer.writeheader()
+             for article in articles_data:
+                 writer.writerow(article)
+         print(f"Data written to {csv_file_path} successfully.")
+     except Exception as e:
+         print(f"Error writing data to file: {e}")
+
+     wd.quit()  # Close the WebDriver
+
+     print(f"Total articles scraped: {len(articles_data)}")
+     return csv_file_path
app.py ADDED
@@ -0,0 +1,123 @@
+ #web interface
+
+ import streamlit as st
+ import pandas as pd
+ import json
+ import importlib
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
+ import google_drive_handle as gdrive
+ from dotenv import load_dotenv
+ import os
+
+ # Load config.json
+ with open('config.json') as f:
+     config = json.load(f)
+
+ # Set up Chrome WebDriver with options
+ options = ChromeOptions()
+ options.add_argument('--headless')
+ options.add_argument('--no-sandbox')
+ options.add_argument('--disable-dev-shm-usage')
+ options.add_argument('log-level=3')
+
+
+ # Initialize the Chrome WebDriver
+ wd = webdriver.Chrome(options=options)
+
+
+ drive = gdrive.authenticate_google_drive()
+ processed_files = set()
+ st.markdown(
+     """
+     <style>
+     .centered {
+         display: flex;
+         align-items: center;
+         justify-content: center;
+         text-align: center;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ st.markdown("<h1 class='centered'>Moroccan News Aggregator</h1>", unsafe_allow_html=True)
+
+ selected_websites = {}
+ selected_categories = {}
+
+ def save_file_id_mapping(file_id_mapping):
+     with open("file_id_mapping.json", "w") as file:
+         json.dump(file_id_mapping, file)
+
+ def load_file_id_mapping():
+     try:
+         with open("file_id_mapping.json", "r") as file:
+             return json.load(file)
+     except FileNotFoundError:
+         return {}  # Return an empty dictionary if the file doesn't exist
+
+ file_id_mapping = load_file_id_mapping()
+
+ selected_websites = {}
+
+ for website, details in config.items():
+     if st.checkbox(website, key=website):
+         # Language selection
+         languages = details.get("languages", {})
+         if languages and len(languages) > 1:
+             language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
+             selected_websites[website] = f"{website}_{language}"  # like: hespress_en
+         else:
+             selected_websites[website] = website  # like: akhbarona
+
+         # Category selection
+         categories = languages.get(language, {})
+         if categories:
+             categories = st.multiselect(f'Select categories for {website}', list(categories.keys()), key=f'{website}_categories')
+             selected_categories[website] = categories
+
+ # Number of articles input
+ num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)
+
+ # Start scraping button
+ if st.button('Start Scraping'):
+     with st.spinner('Scraping in progress...'):
+         progress_bar = st.progress(0)
+         total_tasks = sum(len(categories) for categories in selected_categories.values())
+         completed_tasks = 0
+         for website, module_name in selected_websites.items():
+             scraper_module = importlib.import_module(module_name)
+             for category in selected_categories.get(website, []):
+                 category_url = config[website]['languages'][language][category]
+                 if 'category_name' in config[website]:
+                     category_name = config[website]['category_name'].get(category, 'default_category_name')
+                 file_path = scraper_module.scrape_category(category_url, num_articles)
+
+                 if file_path:
+                     if file_path not in file_id_mapping:
+                         file_id = gdrive.upload_file_to_drive(drive, file_path)
+                         print(f"Uploading file: {file_path}, File ID: {file_id}")
+                         file_id_mapping[file_path] = file_id
+                         save_file_id_mapping(file_id_mapping)
+                     else:
+                         file_id = file_id_mapping[file_path]
+                         print(f"File already uploaded. Using existing File ID: {file_id}")
+
+                     if file_id:
+                         download_link = gdrive.get_drive_download_link(drive, file_id)
+                         if download_link:
+                             #st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)
+
+                             df = pd.read_csv(file_path)
+                             st.write(f"{website} - {category} Data:")
+                             st.dataframe(df)
+                         else:
+                             st.error(f"Failed to retrieve download link for file ID: {file_id}")
+                     else:
+                         st.error(f"Failed to upload file for {website} - {category}")
+                 else:
+                     st.error(f"File not created for {website} - {category}")
+
+     st.success('Scraping Completed!')
config.json ADDED
@@ -0,0 +1,92 @@
+ {
+     "hespress": {
+         "languages": {
+             "en": {
+                 "politics": "https://en.hespress.com/politics",
+                 "economy": "https://en.hespress.com/economy",
+                 "society": "https://en.hespress.com/society",
+                 "culture": "https://en.hespress.com/culture",
+                 "sports": "https://en.hespress.com/sports",
+                 "mena": "https://en.hespress.com/mena",
+                 "international": "https://en.hespress.com/international"
+             },
+             "ar": {
+                 "Politique": "https://www.hespress.com/politique",
+                 "Economie": "https://www.hespress.com/economie",
+                 "Tamazight": "https://www.hespress.com/tamazight",
+                 "Sport": "https://www.hespress.com/sport",
+                 "Societe": "https://www.hespress.com/societe",
+                 "Culture": "https://www.hespress.com/art-et-culture",
+                 "Medias": "https://www.hespress.com/medias",
+                 "faits-divers": "https://www.hespress.com/faits-divers",
+                 "Automoto": "https://www.hespress.com/automoto",
+                 "Regions": "https://www.hespress.com/regions"
+             },
+             "fr": {
+                 "Politique": "https://fr.hespress.com/politique",
+                 "Economie": "https://fr.hespress.com/economie",
+                 "Monde": "https://fr.hespress.com/monde",
+                 "Sport": "https://fr.hespress.com/sport",
+                 "Societe": "https://fr.hespress.com/societe",
+                 "Culture": "https://fr.hespress.com/culture",
+                 "Medias": "https://fr.hespress.com/media",
+                 "High-tech": "https://fr.hespress.com/high-tech",
+                 "Opinions": "https://fr.hespress.com/opinions",
+                 "Regions": "https://fr.hespress.com/regions"
+             }
+         },
+         "module": "hespress"
+     },
+     "akhbarona": {
+         "languages": {
+             "ar": {
+                 "economy": "https://www.akhbarona.com/economy",
+                 "politic": "https://www.akhbarona.com/politic",
+                 "national": "https://www.akhbarona.com/national",
+                 "world": "https://www.akhbarona.com/world",
+                 "health": "https://www.akhbarona.com/health",
+                 "technology": "https://www.akhbarona.com/technology",
+                 "culture": "https://www.akhbarona.com/culture",
+                 "religion": "https://www.akhbarona.com/religion",
+                 "last": "https://www.akhbarona.com/last"
+             },
+             "fr": {}
+         },
+         "module": "akhbarona"
+     },
+     "liberation": {
+         "languages": {
+             "fr": {
+                 "Actualites": "https://www.libe.ma/Actualite_r5.html",
+                 "Economie": "https://www.libe.ma/Economie_r10.html",
+                 "Societe": "https://www.libe.ma/Societe_r7.html",
+                 "culture": "https://www.libe.ma/Culture_r8.html",
+                 "Sport": "https://www.libe.ma/Sport_r6.html",
+                 "international": "https://www.libe.ma/Monde_r17.html",
+                 "Entretien": "https://www.libe.ma/Entretien_r14.html",
+                 "L'info": "https://www.libe.ma/L-info_r25.html",
+                 "Portrait": "https://www.libe.ma/Portrait_r41.html",
+                 "Horizon": "https://www.libe.ma/Horizons_r13.html",
+                 "People": "https://www.libe.ma/People_r27.html"
+             },
+             "ar": {}
+         },
+         "module": "liberation"
+     },
+     "al9anat": {
+         "languages": {
+             "ar": {
+                 "society": "https://www.al9anat.com/%d9%85%d8%ac%d8%aa%d9%85%d8%b9-2/",
+                 "politic": "https://www.al9anat.com/%d8%b3%d9%8a%d8%a7%d8%b3%d8%a9/",
+                 "economy": "https://www.al9anat.com/%d8%a5%d9%82%d8%aa%d8%b5%d8%a7%d8%af/",
+                 "sport": "https://www.al9anat.com/%d8%b1%d9%8a%d8%a7%d8%b6%d8%a9/",
+                 "art": "https://www.al9anat.com/%d8%a3%d8%af%d8%a8-%d9%88-%d9%81%d9%86%d9%88%d9%86/",
+                 "international": "https://www.al9anat.com/%d8%af%d9%88%d9%84%d9%8a/",
+                 "interviews": "https://www.al9anat.com/%d8%ad%d9%88%d8%a7%d8%b1%d8%a7%d8%aa/",
+                 "Maroc_outWorld": "https://www.al9anat.com/%d9%85%d8%ba%d8%a7%d8%b1%d8%a8%d8%a9-%d8%a7%d9%84%d8%b9%d8%a7%d9%84%d9%85/"
+             },
+             "fr": {}
+         },
+         "module": "al9anat"
+     }
+ }
file_id_mapping.json ADDED
@@ -0,0 +1 @@
+ {"C:\\Users\\bourh\\Desktop\\news-scraper\\medias_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\art-et-culture_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\societe_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\_data_ar.csv": "1-O9jOHOzUh51njw3s0hAyGJa4vLpmFo9", "C:\\Users\\bourh\\Desktop\\news-scraper\\economy_data_en.csv": "1W9ugILgVHwLjNNpG6Hg2fHMH92EiMqIt", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\economy_data_en.csv": "1W9ugILgVHwLjNNpG6Hg2fHMH92EiMqIt", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_technology_articles.csv": "1EYglfLNOrV99rSg93a7SRBzzClHEyrcX", "liberation_art.csv": "1DyOzZg7zfQZB_7nJK01y6SLI-vV16pyS", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\_data.csv": "1k_vpDJ7BjZ37_SUQuXp2XNTLXcPI3aMi", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_health_articles.csv": "16lE-sUbDspD-LlgMneHChHdLn_xgkvjh", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_religion_articles.csv": "1D-dArtoLqf6rT2e5CpcHigTcMy8Gssl7", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\monde_data_fr.csv": "1HPCxTaIQSFLRjToqRp6jzRBDES-C2Nqc", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\societe_data_fr.csv": "1KyGSL7Qb6X9Ru04D9qm5DXfgYSY2KHen", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_culture_articles.csv": "1fFOoItXzEbWxfn9maFAxT2VPgO3ZjD47", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_last_articles.csv": "1pjSbjCsraB1SA2vtchzsP17VbjW8z4rY", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_economy_articles.csv": "1rBJVKgEBZO__XuyVQos5pN49bneLwN36", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_national_articles.csv": "1dsIFUh_rDEQOD2X_B3tzBOQNzUcjRRNG", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\culture_data_en.csv": "1B12V7CW0UfTRyXn6fc4opOkz1ap88Gug"}
google_drive_handle.py ADDED
@@ -0,0 +1,64 @@
+ from dotenv import load_dotenv
+ from pydrive.auth import GoogleAuth
+ from pydrive.drive import GoogleDrive
+ from oauth2client.client import OAuth2Credentials
+ import os
+
+ load_dotenv()
+
+ CLIENT_ID = os.getenv('CLIENT_ID')
+ CLIENT_SECRET = os.getenv('CLIENT_SECRET')
+ REFRESH_TOKEN = os.getenv('REFRESH_TOKEN')
+ REDIRECT_URI = os.getenv('REDIRECT_URIS').split(',')[0]  # Access the first URI
+
+ def authenticate_google_drive():
+     gauth = GoogleAuth()
+     gauth.credentials = OAuth2Credentials(None, CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN, None,
+                                           "https://accounts.google.com/o/oauth2/token", None, "web")
+     drive = GoogleDrive(gauth)
+     return drive
+
+ drive = authenticate_google_drive()
+
+ def upload_file_to_drive(drive, file_path, folder_id=None):
+     if not os.path.exists(file_path):
+         print(f"Cannot upload, file does not exist at path: {file_path}")
+         return None
+
+     try:
+         file_metadata = {'title': os.path.basename(file_path)}
+         if folder_id:
+             file_metadata['parents'] = [{'id': folder_id}]
+
+         upload_file = drive.CreateFile(file_metadata)
+
+         # Check if the file already exists on Google Drive
+         existing_files = drive.ListFile({'q': f"title='{upload_file['title']}'"}).GetList()
+         if existing_files:
+             # File with the same name already exists, update the existing file
+             upload_file = existing_files[0]
+             print(f"File already exists on Drive. Updating file with ID: {upload_file['id']}")
+         else:
+             print("Uploading a new file to Drive.")
+
+         upload_file.SetContentFile(file_path)
+         upload_file.Upload()
+         print(f"File uploaded successfully. File ID: {upload_file['id']}")
+         return upload_file['id']
+     except Exception as e:
+         print(f"An error occurred during file upload: {e}")
+         return None
+
+
+ def get_drive_download_link(drive, file_id):
+     try:
+         file = drive.CreateFile({'id': file_id})
+         file.Upload()  # Make sure the file exists on Drive
+         file.InsertPermission({
+             'type': 'anyone',
+             'value': 'anyone',
+             'role': 'reader'})
+         return "https://drive.google.com/uc?export=download&id=" + file_id
+     except Exception as e:
+         print(f"Error fetching download link: {e}")
+         return None
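As `app.py` uses it, this helper authenticates once, uploads a scraped CSV, and fetches a shareable link. A minimal usage sketch (assuming the OAuth values in `.env` are valid; the CSV name is just an example of a file produced by the scrapers):

```python
# Usage sketch for google_drive_handle (assumes valid .env credentials and an existing CSV).
import google_drive_handle as gdrive

drive = gdrive.authenticate_google_drive()
file_id = gdrive.upload_file_to_drive(drive, "liberation_art.csv")
if file_id:
    print(gdrive.get_drive_download_link(drive, file_id))
```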
hespress_ar.py ADDED
@@ -0,0 +1,151 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.by import By
+ from selenium.common.exceptions import TimeoutException
+ from bs4 import BeautifulSoup
+ import time
+ import re
+ import os
+ import requests
+ import csv
+ from urllib.parse import urljoin
+ from google_drive_handle import authenticate_google_drive
+ drive = authenticate_google_drive()
+
+
+ # Set up Chrome WebDriver with options
+ options = ChromeOptions()
+ options.add_argument('--headless')
+ options.add_argument('--no-sandbox')
+ options.add_argument('--disable-dev-shm-usage')
+ options.add_argument('log-level=3')
+
+ # Initialize the Chrome WebDriver
+ wd = webdriver.Chrome(options=options)
+
+ def download_image(img_url):
+     return img_url
+
+ def scroll_page(expected_article_count):
+     scroll_pause_time = 2
+     screen_height = wd.execute_script("return window.innerHeight;")
+     scrolled_height = 0
+
+     while True:
+         scrolled_height += screen_height
+         wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+         time.sleep(scroll_pause_time)
+         new_height = wd.execute_script("return document.body.scrollHeight")
+         if scrolled_height >= new_height:
+             break
+
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+         articles = soup.find_all('div', class_='overlay card')
+         if len(articles) >= expected_article_count:
+             break
+
+ def scrape_article_details(article_url):
+     try:
+         wd.get(article_url)
+         WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+         content_tag = soup.find('div', class_='article-content')
+         content = content_tag.get_text().strip() if content_tag else ""
+         date_tag = soup.find('small', class_='text-muted time')
+         date = date_tag.get_text().strip() if date_tag else ""
+         image_tag = soup.find('img', class_='wp-post-image')
+         image_url = image_tag['src'] if image_tag else None
+         img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+         return content, date, img_url
+     except TimeoutException:
+         print("Timed out waiting for page elements to load")
+         return "", "", None
+     except Exception as e:
+         print(f"An error occurred while scraping article details: {str(e)}")
+         return "", "", None
+
+ def scrape_article_details(article_url):
+
+     try:
+         wd.get(article_url)
+         WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))  # Adjusted to wait for article content
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+
+         content_tag = soup.find('div', class_='article-content')
+         content = content_tag.get_text().strip() if content_tag else ""
+
+         date_tag = soup.find('small', class_='text-muted time')
+         date = date_tag.get_text().strip() if date_tag else ""
+
+         image_tag = soup.find('img', class_='wp-post-image')
+         image_url = image_tag['src'] if image_tag else None
+
+         img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+
+         return content, date, img_url
+
+     except TimeoutException:
+         print("Timed out waiting for page elements to load")
+         return "", "", None
+     except Exception as e:
+         print(f"An error occurred while scraping article details: {str(e)}")
+         return "", "", None
+
+ def sanitize_filename(filename):
+     return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+ def scrape_category(category_url, num_articles):
+     print("Attempting to scrape:", category_url)
+     articles_data = []
+     wd.get(category_url)
+     scroll_page(num_articles)
+
+     soup = BeautifulSoup(wd.page_source, 'html.parser')
+     articles = soup.find_all('div', class_='overlay card')
+     for article in articles[:num_articles]:
+         link_tag = article.find('a', class_='stretched-link')
+         link = link_tag['href'] if link_tag else ""
+         title_tag = article.find('h3', class_='card-title')
+         title = title_tag.get_text().strip() if title_tag else ""
+         content, date, img_url = scrape_article_details(link)
+         article_data = {
+             "Title": title,
+             "Date": date,
+             "Category": category_url.split('/')[-1],
+             "Content": content,
+             "Link": link,
+             "Image": img_url
+         }
+         print(f"Scraping article: {title}, Link: {link}")
+         articles_data.append(article_data)
+
+
+     # Save scraped data to a CSV file
+     category_name = sanitize_filename(category_url.split("/")[-1])
+     csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_ar.csv')
+     file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+     try:
+         with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+             fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+             writer = csv.DictWriter(file, fieldnames=fieldnames)
+             if file_mode == 'w':
+                 writer.writeheader()
+             for article in articles_data:
+                 writer.writerow(article)
+         print(f"CSV file saved successfully at {csv_file_path}")
+     except IOError as e:
+         print(f"Failed to save file at {csv_file_path}: {e}")
+         return None  # Return None to indicate failure
+
+     # Check if the file exists before uploading
+
+     if os.path.exists(csv_file_path):
+         print(f"File successfully created at {csv_file_path}")
+         return csv_file_path
+
+     else:
+         print(f"Failed to create file for {category_url}")
+         return None
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from webdriver_manager.chrome import ChromeDriverManager
3
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.webdriver.common.by import By
7
+ from selenium.common.exceptions import TimeoutException
8
+ from bs4 import BeautifulSoup
9
+ import time
10
+ import re
11
+ import os
12
+ import requests
13
+ import csv
14
+ from urllib.parse import urljoin
15
+ from google_drive_handle import authenticate_google_drive
16
+ drive = authenticate_google_drive()
17
+
18
+
19
+ # Set up Chrome WebDriver with options
20
+ options = ChromeOptions()
21
+ options.add_argument('--headless')
22
+ options.add_argument('--no-sandbox')
23
+ options.add_argument('--disable-dev-shm-usage')
24
+ options.add_argument('log-level=3')
25
+
26
+ # Initialize the Chrome WebDriver
27
+ wd = webdriver.Chrome(options=options)
28
+
29
+
30
+ def download_image(img_url):
31
+ return img_url
32
+
33
+ def scroll_page(expected_article_count):
34
+ scroll_pause_time = 2
35
+ screen_height = wd.execute_script("return window.innerHeight;")
36
+ scrolled_height = 0
37
+
38
+ while True:
39
+ scrolled_height += screen_height
40
+ wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
41
+ time.sleep(scroll_pause_time)
42
+ new_height = wd.execute_script("return document.body.scrollHeight")
43
+ if scrolled_height >= new_height:
44
+ break
45
+
46
+ soup = BeautifulSoup(wd.page_source, 'html.parser')
47
+ articles = soup.find_all('div', class_='overlay card')
48
+ if len(articles) >= expected_article_count:
49
+ break
50
+
51
+ def scrape_article_details(article_url):
52
+ try:
53
+ wd.get(article_url)
54
+ WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
55
+ soup = BeautifulSoup(wd.page_source, 'html.parser')
56
+ content_tag = soup.find('div', class_='article-content')
57
+ content = content_tag.get_text().strip() if content_tag else ""
58
+ date_tag = soup.find('small', class_='text-muted time')
59
+ date = date_tag.get_text().strip() if date_tag else ""
60
+ image_tag = soup.find('img', class_='wp-post-image')
61
+ image_url = image_tag['src'] if image_tag else None
62
+ img_url = download_image(urljoin(article_url, image_url)) if image_url else None
63
+ return content, date, img_url
64
+ except TimeoutException:
65
+ print("Timed out waiting for page elements to load")
66
+ return "", "", None
67
+ except Exception as e:
68
+ print(f"An error occurred while scraping article details: {str(e)}")
69
+ return "", "", None
70
+
71
+ def sanitize_filename(filename):
72
+ return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
73
+
74
+ def scrape_category(category_url, num_articles):
75
+ print("Attempting to scrape:", category_url)
76
+ articles_data = []
77
+ wd.get(category_url)
78
+ scroll_page(num_articles)
79
+
80
+ soup = BeautifulSoup(wd.page_source, 'html.parser')
81
+ articles = soup.find_all('div', class_='overlay card')
82
+ for article in articles[:num_articles]:
83
+ link_tag = article.find('a', class_='stretched-link')
84
+ link = link_tag['href'] if link_tag else ""
85
+ title_tag = article.find('h3', class_='card-title')
86
+ title = title_tag.get_text().strip() if title_tag else ""
87
+ content, date, img_url = scrape_article_details(link)
88
+ article_data = {
89
+ "Title": title,
90
+ "Date": date,
91
+ "Category": category_url.split('/')[-1],
92
+ "Content": content,
93
+ "Link": link,
94
+ "Image": img_url
95
+ }
96
+ print(f"Scraping article: {title}, Link: {link}")
97
+ articles_data.append(article_data)
98
+
99
+ # Save scraped data to a CSV file
100
+ category_name = sanitize_filename(category_url.split("/")[-1])
101
+ csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
102
+ file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
103
+
104
+ try:
105
+ with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
106
+ fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
107
+ writer = csv.DictWriter(file, fieldnames=fieldnames)
108
+ if file_mode == 'w':
109
+ writer.writeheader()
110
+ for article in articles_data:
111
+ writer.writerow(article)
112
+ print(f"CSV file saved successfully at {csv_file_path}")
113
+ except IOError as e:
114
+ print(f"Failed to save file at {csv_file_path}: {e}")
115
+ return None # Return None to indicate failure
116
+
117
+ # Check if the file exists before uploading
118
+
119
+ if os.path.exists(csv_file_path):
120
+ print(f"File successfully created at {csv_file_path}")
121
+ return csv_file_path
122
+
123
+ else:
124
+ print(f"Failed to create file for {category_url}")
125
+ return None
hespress_fr.py ADDED
@@ -0,0 +1,153 @@
+ from selenium import webdriver
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.common.by import By
+ from selenium.common.exceptions import TimeoutException
+ from bs4 import BeautifulSoup
+ import time
+ import re
+ import os
+ import requests
+ import csv
+ from urllib.parse import urljoin
+ from google_drive_handle import authenticate_google_drive
+ drive = authenticate_google_drive()
+
+
+ # Set up Chrome WebDriver with options
+ options = ChromeOptions()
+ options.add_argument('--headless')
+ options.add_argument('--no-sandbox')
+ options.add_argument('--disable-dev-shm-usage')
+ options.add_argument('log-level=3')
+
+
+ # Initialize the Chrome WebDriver
+ wd = webdriver.Chrome(options=options)
+
+ def download_image(img_url):
+     return img_url
+
+ def scroll_page(expected_article_count):
+     scroll_pause_time = 2
+     screen_height = wd.execute_script("return window.innerHeight;")
+     scrolled_height = 0
+
+     while True:
+         scrolled_height += screen_height
+         wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+         time.sleep(scroll_pause_time)
+         new_height = wd.execute_script("return document.body.scrollHeight")
+         if scrolled_height >= new_height:
+             break
+
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+         articles = soup.find_all('div', class_='overlay card')
+         if len(articles) >= expected_article_count:
+             break
+
+ def scrape_article_details(article_url):
+     try:
+         wd.get(article_url)
+         WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+         content_tag = soup.find('div', class_='article-content')
+         content = content_tag.get_text().strip() if content_tag else ""
+         date_tag = soup.find('small', class_='text-muted time')
+         date = date_tag.get_text().strip() if date_tag else ""
+         image_tag = soup.find('img', class_='wp-post-image')
+         image_url = image_tag['src'] if image_tag else None
+         img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+         return content, date, img_url
+     except TimeoutException:
+         print("Timed out waiting for page elements to load")
+         return "", "", None
+     except Exception as e:
+         print(f"An error occurred while scraping article details: {str(e)}")
+         return "", "", None
+
+ def scrape_article_details(article_url):
+
+     try:
+         wd.get(article_url)
+         WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))  # Adjusted to wait for article content
+         soup = BeautifulSoup(wd.page_source, 'html.parser')
+
+         content_tag = soup.find('div', class_='article-content')
+         content = content_tag.get_text().strip() if content_tag else ""
+
+         date_tag = soup.find('small', class_='text-muted time')
+         date = date_tag.get_text().strip() if date_tag else ""
+
+         image_tag = soup.find('img', class_='wp-post-image')
+         image_url = image_tag['src'] if image_tag else None
+
+         img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+
+         return content, date, img_url
+
+     except TimeoutException:
+         print("Timed out waiting for page elements to load")
+         return "", "", None
+     except Exception as e:
+         print(f"An error occurred while scraping article details: {str(e)}")
+         return "", "", None
+
+ def sanitize_filename(filename):
+     return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+ def scrape_category(category_url, num_articles):
+     print("Attempting to scrape:", category_url)
+     articles_data = []
+     wd.get(category_url)
+     scroll_page(num_articles)
+
+     soup = BeautifulSoup(wd.page_source, 'html.parser')
+     articles = soup.find_all('div', class_='overlay card')
+     for article in articles[:num_articles]:
+         link_tag = article.find('a', class_='stretched-link')
+         link = link_tag['href'] if link_tag else ""
+         title_tag = article.find('h3', class_='card-title')
+         title = title_tag.get_text().strip() if title_tag else ""
+         content, date, img_url = scrape_article_details(link)
+         article_data = {
+             "Title": title,
+             "Date": date,
+             "Category": category_url.split('/')[-1],
+             "Content": content,
+             "Link": link,
+             "Image": img_url
+         }
+         print(f"Scraping article: {title}, Link: {link}")
+         articles_data.append(article_data)
+
+
+     # Save scraped data to a CSV file
+     category_name = sanitize_filename(category_url.split("/")[-1])
+     csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_fr.csv')
+     file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+     try:
+         with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+             fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+             writer = csv.DictWriter(file, fieldnames=fieldnames)
+             if file_mode == 'w':
+                 writer.writeheader()
+             for article in articles_data:
+                 writer.writerow(article)
+         print(f"CSV file saved successfully at {csv_file_path}")
+     except IOError as e:
+         print(f"Failed to save file at {csv_file_path}: {e}")
+         return None  # Return None to indicate failure
+
+     # Check if the file exists before uploading
+
+     if os.path.exists(csv_file_path):
+         print(f"File successfully created at {csv_file_path}")
+         return csv_file_path
+
+     else:
+         print(f"Failed to create file for {category_url}")
+         return None
liberation_fr.py ADDED
@@ -0,0 +1,86 @@
+ # Import required libraries
+ from bs4 import BeautifulSoup
+ import requests
+ import pandas as pd
+ import time
+ import timeit
+
+ # Headers for simulating a browser request
+ headers = {
+     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+ }
+
+ def faire_requete(url):
+     """
+     Perform an HTTP request with error handling.
+     Args:
+         url (str): the URL to request
+
+     Returns:
+         bytes or None: the response content if the request succeeds, otherwise None.
+     """
+     try:
+         with requests.get(url, headers=headers) as reponse:
+             reponse.raise_for_status()
+             return reponse.content
+     except requests.RequestException as e:
+         print(f"HTTP request error: {e}")
+         return None
+
+ def extract_articles(category_url, num_articles):
+     temps_debut = timeit.default_timer()
+     liens_articles = []
+     current_count = 0
+
+     while current_count < num_articles:
+         time.sleep(2)
+         contenu = faire_requete(category_url + f"?start={current_count}&order=")
+
+         if contenu:
+             soup = BeautifulSoup(contenu, "html.parser")
+             liens = soup.find_all("h3", {"class": "titre_article"})
+             for lien in liens:
+                 if current_count >= num_articles:
+                     break
+                 liens_articles.append("https://www.libe.ma" + lien.a["href"])
+                 current_count += 1
+
+     lignes = []
+     for lien in liens_articles:
+         time.sleep(2)
+         contenu = faire_requete(lien)
+         if contenu:
+             soup = BeautifulSoup(contenu, "html.parser")
+             try:
+                 titre = soup.find("h1", {"class": "access"}).text.replace("\n", "").strip()
+             except:
+                 titre = None
+             try:
+                 description = soup.find("div", {"class": "access firstletter"}).text.replace("\n", "").strip()
+             except:
+                 description = None
+             try:
+                 date = soup.find("div", {"class": "date"}).text.replace("\n", "").strip()
+             except:
+                 date = None
+             lignes.append([titre, description, date])
+
+     return lignes
+
+ def scrape_category(category_url, num_articles):
+     article_data = extract_articles(category_url, num_articles)
+
+     colonnes = ["titre", "content", "date"]
+     articles_df = pd.DataFrame(article_data, columns=colonnes)
+
+     csv_file_path = "liberation_art.csv"
+     articles_df.to_csv(csv_file_path, index=False)
+
+     return csv_file_path
+ '''
+ if __name__ == "__main__":
+     category_url = "https://www.libe.ma/Economie_r10.html"
+     num_articles = 10  # Number of articles to scrape
+     csv_file_path = scrape_category(category_url, num_articles)
+     # Now, csv_file_path can be used in Streamlit for uploading
+ '''