Upload 13 files
- .env +9 -0
- .gitignore +6 -0
- README.md +43 -11
- akhbarona_ar.py +79 -0
- al9anat_ar.py +169 -0
- app.py +123 -0
- config.json +92 -0
- file_id_mapping.json +1 -0
- google_drive_handle.py +64 -0
- hespress_ar.py +151 -0
- hespress_en.py +125 -0
- hespress_fr.py +153 -0
- liberation_fr.py +86 -0
.env
ADDED
@@ -0,0 +1,9 @@
+CLIENT_ID=960753990614-492j6pk974fjq94qvls3mcgee1sbi4dv.apps.googleusercontent.com
+PROJECT_ID=moroccan-news-aggregator
+AUTH_URI=https://accounts.google.com/o/oauth2/auth
+TOKEN_URI=https://oauth2.googleapis.com/token
+AUTH_PROVIDER_X509_CERT_URL=https://www.googleapis.com/oauth2/v1/certs
+CLIENT_SECRET=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
+REFRESH_TOKEN=1//04ayA66paFryZCgYIARAAGAQSNwF-L9IreeWNlmWv38CCaRqvv_W8VHEp7ysy1A36bTZk3ViCCE9pOabmcKPWNfyz6HJgYm0fkZs
+REDIRECT_URIS=https://developers.google.com/oauthplayground,http://localhost:8501,http://localhost:8080
+
.gitignore
ADDED
@@ -0,0 +1,6 @@
+file_id_mapping.json
+__pycache__/
+.env
+.gitignore
+.git/
+UI-design.pdf
README.md
CHANGED
@@ -1,13 +1,45 @@
-
-
-
-
-
-
-
-
-
-
+# Moroccan News Aggregator
+
+The Moroccan News Aggregator is a simple web scraping project designed to extract news articles from popular Moroccan news websites. The goal is to provide users with a convenient way to access and categorize news content from different sources.
+
+## Features
+
+- **Multi-Language Support:** Choose news articles in English, Arabic, or French from websites such as Hespress, Akhbarona, and Le360.
+
+- **Category Selection:** Select specific categories within each language to filter news articles based on your interests.
+
+- **Data Storage:** The scraped data is uploaded to Google Drive, ensuring easy access and sharing.
+
+## Setup Instructions
+
+1. **Clone the Repository:**
+
+   ```bash
+   git clone <repository-url>
+   ```
+
+2. **Install Dependencies:**
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. **Run the App:**
+
+   ```bash
+   streamlit run app.py
+   ```
+
+4. Follow the on-screen instructions to choose websites, languages, categories, and start scraping news articles.
+
+## Configuration
+
+Adjust settings in the `config.json` file to customize supported websites, languages, and categories.
+
+## License
+
+This project is licensed under the [MIT License](LICENSE).
+
 ---
 
-
+Feel free to explore and customize the project for your needs. If you encounter any issues or have suggestions for improvements, please let us know!
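As a quick illustration of the Configuration note in the README, here is a minimal sketch of how a category URL can be looked up in the nested site, language, category map of `config.json`. The keys below come from the `config.json` added in this commit; the snippet itself is a hypothetical helper, not part of the commit.

```python
import json

# config.json maps: site -> "languages" -> language code -> category -> URL
with open("config.json") as f:
    config = json.load(f)

# e.g. the English "politics" feed of Hespress
url = config["hespress"]["languages"]["en"]["politics"]
print(url)  # https://en.hespress.com/politics
```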
akhbarona_ar.py
ADDED
@@ -0,0 +1,79 @@
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import os
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+}
+
+def scrape_article(article_url):
+    response = requests.get(article_url, headers=headers)
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    title_tag = soup.find('h1')
+    title = title_tag.get_text(strip=True) if title_tag else 'No Title'
+
+    content_div = soup.find('div', id='article_holder')  # Ensure this is the correct ID
+    if content_div:
+        content = ' '.join(p.get_text(strip=True) for p in content_div.find_all('p'))
+    else:
+        content = 'Content not found'
+
+    return {
+        'Title': title,
+        'Content': content
+    }
+
+def scrape_category(category_url, num_articles):
+    articles_scraped = 0
+    all_articles = []
+    page_num = 1
+
+    # Extract site and category from the URL
+    site_name = category_url.split('/')[2]  # This gets 'www.akhbarona.com' from the URL
+    site_name = site_name.replace('www.', '')
+    category_name = category_url.split('/')[-1]  # This gets the category name from the URL
+
+    while articles_scraped < num_articles:
+        paginated_url = f"{category_url}/index.{page_num}.html"
+
+        response = requests.get(paginated_url, headers=headers)
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        article_links = soup.find_all('h2', class_='article_title')
+        for article_link in article_links:
+            a_tag = article_link.find('a')
+            if a_tag and 'href' in a_tag.attrs:
+                full_article_url = a_tag['href']
+                if not full_article_url.startswith('http'):
+                    full_article_url = f"{category_url}/{full_article_url}"
+                article_data = scrape_article(full_article_url)
+
+                all_articles.append(article_data)
+                articles_scraped += 1
+
+                if articles_scraped >= num_articles:
+                    break
+
+        if articles_scraped >= num_articles:
+            break
+
+        print(f"Going to next page: {paginated_url}")
+        page_num += 1  # Increment the page number
+
+    #csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
+    df = pd.DataFrame(all_articles)
+    csv_file_name = f"{site_name}_{category_name}_articles.csv"
+    csv_file_path = os.path.join(os.getcwd(), csv_file_name)  # Full file path
+    df.to_csv(csv_file_path, index=False)
+    print(f"Articles saved to {csv_file_path}")
+
+    return csv_file_path
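A minimal usage sketch for the module above, assuming the category URL comes from the `akhbarona` entry in `config.json`:

```python
from akhbarona_ar import scrape_category

# Scrapes up to 10 articles from the technology section and writes them to a CSV
csv_path = scrape_category("https://www.akhbarona.com/technology", 10)
print(csv_path)  # e.g. .../akhbarona.com_technology_articles.csv
```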
al9anat_ar.py
ADDED
@@ -0,0 +1,169 @@
+from typing_extensions import Self
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from selenium.common.exceptions import ElementClickInterceptedException
+from bs4 import BeautifulSoup
+import time
+import os
+import re
+import requests
+import json
+import csv
+from urllib.parse import urljoin
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+def download_image(img_url):
+    return img_url
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scroll_page(wd, max_scrolls=7, articles_per_load=6, max_attempts=5):
+    scroll_pause_time = 5
+    attempts = 0
+
+    for _ in range(max_scrolls):
+        current_articles = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
+        wd.execute_script("window.scrollBy(0, document.body.scrollHeight);")
+        time.sleep(scroll_pause_time)
+
+        try:
+            load_more_button = WebDriverWait(wd, 10).until(
+                EC.presence_of_element_located((By.XPATH, '//a[@class="ts-button load-button load-button-a ts-button-alt" and @href="#"]'))
+            )
+            wd.execute_script("arguments[0].scrollIntoView();", load_more_button)
+            wd.execute_script("arguments[0].click();", load_more_button)
+            attempts = 0  # Reset attempts after successful button click
+        except TimeoutException:
+            attempts += 1
+            if attempts >= max_attempts:
+                print("Maximum attempts reached without new articles. Exiting.")
+                return False  # Exit the function
+
+        new_article_count = len(wd.find_elements(By.CSS_SELECTOR, "article.l-post"))
+        if new_article_count > current_articles:
+            attempts = 0  # Reset attempts after successfully loading new articles
+        else:
+            attempts += 1
+            if attempts >= max_attempts:
+                print("No new articles found after several attempts. Exiting.")
+                return False  # Exit the function
+
+    return True
+
+
+def scrape_article_details(article_url, wd):
+    try:
+        # Validate the URL
+        if not article_url.startswith("http"):
+            article_url = "https://" + article_url
+        print("Navigating to:", article_url)
+
+        wd.get(article_url)
+        WebDriverWait(wd, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'the-post-tags')))  # Wait for a specific element to ensure the page has loaded
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='post-content cf entry-content content-spacious')
+        content = content_tag.get_text().strip() if content_tag else ""
+
+        category_tag = soup.find('span', class_='meta-item cat-labels')
+        category_from_article = category_tag.get_text().strip() if category_tag else "Uncategorized"
+
+        title_tag = soup.find('h1', class_='is-title post-title')
+        art_title = title_tag.get_text().strip() if title_tag else ""
+
+        date_tag = soup.find('span', class_='meta-item has-next-icon date')
+        date = date_tag.get_text().strip() if date_tag else ""
+
+        image_tag = soup.find('a', class_='image-link')
+        image_url = image_tag['href'] if image_tag else None
+        img_url = urljoin(article_url, image_url)
+        image_path = download_image(img_url) if image_url else None
+
+        return content, date, image_path, art_title, category_from_article
+    except TimeoutException:
+        print("Timed out waiting for page elements to load for URL:", article_url)
+        return "", "", None, "", ""
+    except Exception as e:
+        print(f"An error occurred while scraping article details at {article_url}: {str(e)}")
+        return "", "", None, "", ""
+
+
+def scrape_category(category_url, num_articles):
+    # Set up Chrome WebDriver with options
+    options = ChromeOptions()
+    options.add_argument('--headless')
+    options.add_argument('--no-sandbox')
+    options.add_argument('--disable-dev-shm-usage')
+    options.add_argument('log-level=3')
+
+    # Initialize the Chrome WebDriver
+    wd = webdriver.Chrome(options=options)
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    articles_count = 0
+    wd.get(category_url)
+
+    # Adjusted to use num_articles for scrolling and loading articles
+    scroll_page(wd, max_scrolls=int(num_articles/6), articles_per_load=6)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('article', class_='l-post grid-base-post grid-post')
+
+    for article in articles[:num_articles]:  # Limit to num_articles
+        link_tag = article.find('a', class_='image-link media-ratio ratio-16-9')
+        link = link_tag['href'] if link_tag else ""
+        if link:
+            wd.get(link)
+            article_data = scrape_article_details(link, wd)
+            if article_data[0]:  # Check if content is non-empty
+                articles_data.append({
+                    "art_id": articles_count,
+                    "Title": article_data[3],
+                    "Date": article_data[1],
+                    "Category": article_data[4],
+                    "Content": article_data[0],
+                    "Link": link,
+                    "Image": article_data[2],
+                })
+                articles_count += 1
+                print(f"Article #{articles_count} scraped: {article_data[3]}")
+
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data.csv')
+    try:
+        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
+            fieldnames = ["art_id", "Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"Data written to {csv_file_path} successfully.")
+    except Exception as e:
+        print(f"Error writing data to file: {e}")
+
+    wd.quit()  # Close the WebDriver
+
+    print(f"Total articles scraped: {len(articles_data)}")
+    return csv_file_path
app.py
ADDED
@@ -0,0 +1,123 @@
+#web interface
+
+import streamlit as st
+import pandas as pd
+import json
+import importlib
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+import google_drive_handle as gdrive
+from dotenv import load_dotenv
+import os
+
+# Load config.json
+with open('config.json') as f:
+    config = json.load(f)
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+drive = gdrive.authenticate_google_drive()
+processed_files = set()
+st.markdown(
+    """
+    <style>
+    .centered {
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        text-align: center;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True
+)
+
+st.markdown("<h1 class='centered'>Moroccan News Aggregator</h1>", unsafe_allow_html=True)
+
+selected_websites = {}
+selected_categories = {}
+
+def save_file_id_mapping(file_id_mapping):
+    with open("file_id_mapping.json", "w") as file:
+        json.dump(file_id_mapping, file)
+
+def load_file_id_mapping():
+    try:
+        with open("file_id_mapping.json", "r") as file:
+            return json.load(file)
+    except FileNotFoundError:
+        return {}  # Return an empty dictionary if the file doesn't exist
+
+file_id_mapping = load_file_id_mapping()
+
+selected_websites = {}
+
+for website, details in config.items():
+    if st.checkbox(website, key=website):
+        # Language selection
+        languages = details.get("languages", {})
+        if languages and len(languages) > 1:
+            language = st.selectbox(f'Choose language for {website}', list(languages.keys()), key=f'lang_{website}')
+            selected_websites[website] = f"{website}_{language}"  # like: hespress_en
+        else:
+            selected_websites[website] = website  # like: akhbarona
+
+        # Category selection
+        categories = languages.get(language, {})
+        if categories:
+            categories = st.multiselect(f'Select categories for {website}', list(categories.keys()), key=f'{website}_categories')
+            selected_categories[website] = categories
+
+# Number of articles input
+num_articles = st.number_input('Number of Articles', min_value=1, max_value=10000, step=1)
+
+# Start scraping button
+if st.button('Start Scraping'):
+    with st.spinner('Scraping in progress...'):
+        progress_bar = st.progress(0)
+        total_tasks = sum(len(categories) for categories in selected_categories.values())
+        completed_tasks = 0
+        for website, module_name in selected_websites.items():
+            scraper_module = importlib.import_module(module_name)
+            for category in selected_categories.get(website, []):
+                category_url = config[website]['languages'][language][category]
+                if 'category_name' in config[website]:
+                    category_name = config[website]['category_name'].get(category, 'default_category_name')
+                file_path = scraper_module.scrape_category(category_url, num_articles)
+
+                if file_path:
+                    if file_path not in file_id_mapping:
+                        file_id = gdrive.upload_file_to_drive(drive, file_path)
+                        print(f"Uploading file: {file_path}, File ID: {file_id}")
+                        file_id_mapping[file_path] = file_id
+                        save_file_id_mapping(file_id_mapping)
+                    else:
+                        file_id = file_id_mapping[file_path]
+                        print(f"File already uploaded. Using existing File ID: {file_id}")
+
+                    if file_id:
+                        download_link = gdrive.get_drive_download_link(drive, file_id)
+                        if download_link:
+                            #st.markdown(f"[Download {website} - {category} data]({download_link})", unsafe_allow_html=True)
+
+                            df = pd.read_csv(file_path)
+                            st.write(f"{website} - {category} Data:")
+                            st.dataframe(df)
+                        else:
+                            st.error(f"Failed to retrieve download link for file ID: {file_id}")
+                    else:
+                        st.error(f"Failed to upload file for {website} - {category}")
+                else:
+                    st.error(f"File not created for {website} - {category}")
+
+    st.success('Scraping Completed!')
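For reference, this is the contract `app.py` relies on: each selected website maps to a module name (with a language suffix when the site offers more than one language), and that module must expose `scrape_category(category_url, num_articles)` returning the path of the CSV it wrote. A minimal sketch, assuming the `hespress_en` module and a URL from `config.json`:

```python
import importlib

module_name = "hespress_en"  # "hespress" + "_" + chosen language code
scraper_module = importlib.import_module(module_name)

# Same call app.py makes for every selected category
csv_path = scraper_module.scrape_category("https://en.hespress.com/politics", 5)
print(csv_path)
```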
config.json
ADDED
@@ -0,0 +1,92 @@
+{
+    "hespress": {
+        "languages": {
+            "en": {
+                "politics": "https://en.hespress.com/politics",
+                "economy": "https://en.hespress.com/economy",
+                "society": "https://en.hespress.com/society",
+                "culture": "https://en.hespress.com/culture",
+                "sports": "https://en.hespress.com/sports",
+                "mena": "https://en.hespress.com/mena",
+                "international": "https://en.hespress.com/international"
+            },
+            "ar": {
+                "Politique": "https://www.hespress.com/politique",
+                "Economie": "https://www.hespress.com/economie",
+                "Tamazight": "https://www.hespress.com/tamazight",
+                "Sport": "https://www.hespress.com/sport",
+                "Societe": "https://www.hespress.com/societe",
+                "Culture": "https://www.hespress.com/art-et-culture",
+                "Medias": "https://www.hespress.com/medias",
+                "faits-divers": "https://www.hespress.com/faits-divers",
+                "Automoto": "https://www.hespress.com/automoto",
+                "Regions": "https://www.hespress.com/regions"
+            },
+            "fr": {
+                "Politique": "https://fr.hespress.com/politique",
+                "Economie": "https://fr.hespress.com/economie",
+                "Monde": "https://fr.hespress.com/monde",
+                "Sport": "https://fr.hespress.com/sport",
+                "Societe": "https://fr.hespress.com/societe",
+                "Culture": "https://fr.hespress.com/culture",
+                "Medias": "https://fr.hespress.com/media",
+                "High-tech": "https://fr.hespress.com/high-tech",
+                "Opinions": "https://fr.hespress.com/opinions",
+                "Regions": "https://fr.hespress.com/regions"
+            }
+        },
+        "module": "hespress"
+    },
+    "akhbarona": {
+        "languages": {
+            "ar": {
+                "economy": "https://www.akhbarona.com/economy",
+                "politic": "https://www.akhbarona.com/politic",
+                "national": "https://www.akhbarona.com/national",
+                "world": "https://www.akhbarona.com/world",
+                "health": "https://www.akhbarona.com/health",
+                "technology": "https://www.akhbarona.com/technology",
+                "culture": "https://www.akhbarona.com/culture",
+                "religion": "https://www.akhbarona.com/religion",
+                "last": "https://www.akhbarona.com/last"
+            },
+            "fr": {}
+        },
+        "module": "akhbarona"
+    },
+    "liberation": {
+        "languages": {
+            "fr": {
+                "Actualites": "https://www.libe.ma/Actualite_r5.html",
+                "Economie": "https://www.libe.ma/Economie_r10.html",
+                "Societe": "https://www.libe.ma/Societe_r7.html",
+                "culture": "https://www.libe.ma/Culture_r8.html",
+                "Sport": "https://www.libe.ma/Sport_r6.html",
+                "international": "https://www.libe.ma/Monde_r17.html",
+                "Entretien": "https://www.libe.ma/Entretien_r14.html",
+                "L'info": "https://www.libe.ma/L-info_r25.html",
+                "Portrait": "https://www.libe.ma/Portrait_r41.html",
+                "Horizon": "https://www.libe.ma/Horizons_r13.html",
+                "People": "https://www.libe.ma/People_r27.html"
+            },
+            "ar": {}
+        },
+        "module": "liberation"
+    },
+    "al9anat": {
+        "languages": {
+            "ar": {
+                "society": "https://www.al9anat.com/%d9%85%d8%ac%d8%aa%d9%85%d8%b9-2/",
+                "politic": "https://www.al9anat.com/%d8%b3%d9%8a%d8%a7%d8%b3%d8%a9/",
+                "economy": "https://www.al9anat.com/%d8%a5%d9%82%d8%aa%d8%b5%d8%a7%d8%af/",
+                "sport": "https://www.al9anat.com/%d8%b1%d9%8a%d8%a7%d8%b6%d8%a9/",
+                "art": "https://www.al9anat.com/%d8%a3%d8%af%d8%a8-%d9%88-%d9%81%d9%86%d9%88%d9%86/",
+                "international": "https://www.al9anat.com/%d8%af%d9%88%d9%84%d9%8a/",
+                "interviews": "https://www.al9anat.com/%d8%ad%d9%88%d8%a7%d8%b1%d8%a7%d8%aa/",
+                "Maroc_outWorld": "https://www.al9anat.com/%d9%85%d8%ba%d8%a7%d8%b1%d8%a8%d8%a9-%d8%a7%d9%84%d8%b9%d8%a7%d9%84%d9%85/"
+            },
+            "fr": {}
+        },
+        "module": "al9anat"
+    }
+}
file_id_mapping.json
ADDED
@@ -0,0 +1 @@
+{"C:\\Users\\bourh\\Desktop\\news-scraper\\medias_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\art-et-culture_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\societe_data_ar.csv": null, "C:\\Users\\bourh\\Desktop\\news-scraper\\_data_ar.csv": "1-O9jOHOzUh51njw3s0hAyGJa4vLpmFo9", "C:\\Users\\bourh\\Desktop\\news-scraper\\economy_data_en.csv": "1W9ugILgVHwLjNNpG6Hg2fHMH92EiMqIt", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\economy_data_en.csv": "1W9ugILgVHwLjNNpG6Hg2fHMH92EiMqIt", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_technology_articles.csv": "1EYglfLNOrV99rSg93a7SRBzzClHEyrcX", "liberation_art.csv": "1DyOzZg7zfQZB_7nJK01y6SLI-vV16pyS", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\_data.csv": "1k_vpDJ7BjZ37_SUQuXp2XNTLXcPI3aMi", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_health_articles.csv": "16lE-sUbDspD-LlgMneHChHdLn_xgkvjh", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_religion_articles.csv": "1D-dArtoLqf6rT2e5CpcHigTcMy8Gssl7", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\monde_data_fr.csv": "1HPCxTaIQSFLRjToqRp6jzRBDES-C2Nqc", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\societe_data_fr.csv": "1KyGSL7Qb6X9Ru04D9qm5DXfgYSY2KHen", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_culture_articles.csv": "1fFOoItXzEbWxfn9maFAxT2VPgO3ZjD47", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_last_articles.csv": "1pjSbjCsraB1SA2vtchzsP17VbjW8z4rY", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_economy_articles.csv": "1rBJVKgEBZO__XuyVQos5pN49bneLwN36", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\akhbarona.com_national_articles.csv": "1dsIFUh_rDEQOD2X_B3tzBOQNzUcjRRNG", "C:\\Users\\Lenovo\\Documents\\MDS projects\\v5\\news-scraper\\culture_data_en.csv": "1B12V7CW0UfTRyXn6fc4opOkz1ap88Gug"}
google_drive_handle.py
ADDED
@@ -0,0 +1,64 @@
+from dotenv import load_dotenv
+from pydrive.auth import GoogleAuth
+from pydrive.drive import GoogleDrive
+from oauth2client.client import OAuth2Credentials
+import os
+
+load_dotenv()
+
+CLIENT_ID = os.getenv('CLIENT_ID')
+CLIENT_SECRET = os.getenv('CLIENT_SECRET')
+REFRESH_TOKEN = os.getenv('REFRESH_TOKEN')
+REDIRECT_URI = os.getenv('REDIRECT_URIS').split(',')[0]  # Access the first URI
+
+def authenticate_google_drive():
+    gauth = GoogleAuth()
+    gauth.credentials = OAuth2Credentials(None, CLIENT_ID, CLIENT_SECRET, REFRESH_TOKEN, None,
+                                          "https://accounts.google.com/o/oauth2/token", None, "web")
+    drive = GoogleDrive(gauth)
+    return drive
+
+drive = authenticate_google_drive()
+
+def upload_file_to_drive(drive, file_path, folder_id=None):
+    if not os.path.exists(file_path):
+        print(f"Cannot upload, file does not exist at path: {file_path}")
+        return None
+
+    try:
+        file_metadata = {'title': os.path.basename(file_path)}
+        if folder_id:
+            file_metadata['parents'] = [{'id': folder_id}]
+
+        upload_file = drive.CreateFile(file_metadata)
+
+        # Check if the file already exists on Google Drive
+        existing_files = drive.ListFile({'q': f"title='{upload_file['title']}'"}).GetList()
+        if existing_files:
+            # File with the same name already exists, update the existing file
+            upload_file = existing_files[0]
+            print(f"File already exists on Drive. Updating file with ID: {upload_file['id']}")
+        else:
+            print("Uploading a new file to Drive.")
+
+        upload_file.SetContentFile(file_path)
+        upload_file.Upload()
+        print(f"File uploaded successfully. File ID: {upload_file['id']}")
+        return upload_file['id']
+    except Exception as e:
+        print(f"An error occurred during file upload: {e}")
+        return None
+
+
+def get_drive_download_link(drive, file_id):
+    try:
+        file = drive.CreateFile({'id': file_id})
+        file.Upload()  # Make sure the file exists on Drive
+        file.InsertPermission({
+            'type': 'anyone',
+            'value': 'anyone',
+            'role': 'reader'})
+        return "https://drive.google.com/uc?export=download&id=" + file_id
+    except Exception as e:
+        print(f"Error fetching download link: {e}")
+        return None
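A minimal usage sketch of the helper above, assuming the OAuth variables in `.env` are set and valid; the CSV name is the one written by `liberation_fr.py`:

```python
import google_drive_handle as gdrive

drive = gdrive.authenticate_google_drive()
file_id = gdrive.upload_file_to_drive(drive, "liberation_art.csv")
if file_id:
    # Makes the file publicly readable and returns a direct-download URL
    print(gdrive.get_drive_download_link(drive, file_id))
```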
hespress_ar.py
ADDED
@@ -0,0 +1,151 @@
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+import time
+import re
+import os
+import requests
+import csv
+from urllib.parse import urljoin
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+def download_image(img_url):
+    return img_url
+
+def scroll_page(expected_article_count):
+    scroll_pause_time = 2
+    screen_height = wd.execute_script("return window.innerHeight;")
+    scrolled_height = 0
+
+    while True:
+        scrolled_height += screen_height
+        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+        time.sleep(scroll_pause_time)
+        new_height = wd.execute_script("return document.body.scrollHeight")
+        if scrolled_height >= new_height:
+            break
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        articles = soup.find_all('div', class_='overlay card')
+        if len(articles) >= expected_article_count:
+            break
+
+def scrape_article_details(article_url):
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+        return content, date, img_url
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def scrape_article_details(article_url):
+
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))  # Adjusted to wait for article content
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+
+        return content, date, img_url
+
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scrape_category(category_url, num_articles):
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    wd.get(category_url)
+    scroll_page(num_articles)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('div', class_='overlay card')
+    for article in articles[:num_articles]:
+        link_tag = article.find('a', class_='stretched-link')
+        link = link_tag['href'] if link_tag else ""
+        title_tag = article.find('h3', class_='card-title')
+        title = title_tag.get_text().strip() if title_tag else ""
+        content, date, img_url = scrape_article_details(link)
+        article_data = {
+            "Title": title,
+            "Date": date,
+            "Category": category_url.split('/')[-1],
+            "Content": content,
+            "Link": link,
+            "Image": img_url
+        }
+        print(f"Scraping article: {title}, Link: {link}")
+        articles_data.append(article_data)
+
+    # Save scraped data to a CSV file
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_ar.csv')
+    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+    try:
+        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file_mode == 'w':
+                writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"CSV file saved successfully at {csv_file_path}")
+    except IOError as e:
+        print(f"Failed to save file at {csv_file_path}: {e}")
+        return None  # Return None to indicate failure
+
+    # Check if the file exists before uploading
+    if os.path.exists(csv_file_path):
+        print(f"File successfully created at {csv_file_path}")
+        return csv_file_path
+    else:
+        print(f"Failed to create file for {category_url}")
+        return None
hespress_en.py
ADDED
@@ -0,0 +1,125 @@
+from selenium import webdriver
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+import time
+import re
+import os
+import requests
+import csv
+from urllib.parse import urljoin
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+
+def download_image(img_url):
+    return img_url
+
+def scroll_page(expected_article_count):
+    scroll_pause_time = 2
+    screen_height = wd.execute_script("return window.innerHeight;")
+    scrolled_height = 0
+
+    while True:
+        scrolled_height += screen_height
+        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+        time.sleep(scroll_pause_time)
+        new_height = wd.execute_script("return document.body.scrollHeight")
+        if scrolled_height >= new_height:
+            break
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        articles = soup.find_all('div', class_='overlay card')
+        if len(articles) >= expected_article_count:
+            break
+
+def scrape_article_details(article_url):
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+        return content, date, img_url
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scrape_category(category_url, num_articles):
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    wd.get(category_url)
+    scroll_page(num_articles)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('div', class_='overlay card')
+    for article in articles[:num_articles]:
+        link_tag = article.find('a', class_='stretched-link')
+        link = link_tag['href'] if link_tag else ""
+        title_tag = article.find('h3', class_='card-title')
+        title = title_tag.get_text().strip() if title_tag else ""
+        content, date, img_url = scrape_article_details(link)
+        article_data = {
+            "Title": title,
+            "Date": date,
+            "Category": category_url.split('/')[-1],
+            "Content": content,
+            "Link": link,
+            "Image": img_url
+        }
+        print(f"Scraping article: {title}, Link: {link}")
+        articles_data.append(article_data)
+
+    # Save scraped data to a CSV file
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_en.csv')
+    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+    try:
+        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file_mode == 'w':
+                writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"CSV file saved successfully at {csv_file_path}")
+    except IOError as e:
+        print(f"Failed to save file at {csv_file_path}: {e}")
+        return None  # Return None to indicate failure
+
+    # Check if the file exists before uploading
+    if os.path.exists(csv_file_path):
+        print(f"File successfully created at {csv_file_path}")
+        return csv_file_path
+    else:
+        print(f"Failed to create file for {category_url}")
+        return None
hespress_fr.py
ADDED
@@ -0,0 +1,153 @@
+from selenium import webdriver
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+from bs4 import BeautifulSoup
+import time
+import re
+import os
+import requests
+import csv
+from urllib.parse import urljoin
+from google_drive_handle import authenticate_google_drive
+drive = authenticate_google_drive()
+
+
+# Set up Chrome WebDriver with options
+options = ChromeOptions()
+options.add_argument('--headless')
+options.add_argument('--no-sandbox')
+options.add_argument('--disable-dev-shm-usage')
+options.add_argument('log-level=3')
+
+
+# Initialize the Chrome WebDriver
+wd = webdriver.Chrome(options=options)
+
+def download_image(img_url):
+    return img_url
+
+def scroll_page(expected_article_count):
+    scroll_pause_time = 2
+    screen_height = wd.execute_script("return window.innerHeight;")
+    scrolled_height = 0
+
+    while True:
+        scrolled_height += screen_height
+        wd.execute_script(f"window.scrollTo(0, {scrolled_height});")
+        time.sleep(scroll_pause_time)
+        new_height = wd.execute_script("return document.body.scrollHeight")
+        if scrolled_height >= new_height:
+            break
+
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        articles = soup.find_all('div', class_='overlay card')
+        if len(articles) >= expected_article_count:
+            break
+
+def scrape_article_details(article_url):
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+        return content, date, img_url
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def scrape_article_details(article_url):
+
+    try:
+        wd.get(article_url)
+        WebDriverWait(wd, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'article-content')))  # Adjusted to wait for article content
+        soup = BeautifulSoup(wd.page_source, 'html.parser')
+
+        content_tag = soup.find('div', class_='article-content')
+        content = content_tag.get_text().strip() if content_tag else ""
+
+        date_tag = soup.find('small', class_='text-muted time')
+        date = date_tag.get_text().strip() if date_tag else ""
+
+        image_tag = soup.find('img', class_='wp-post-image')
+        image_url = image_tag['src'] if image_tag else None
+
+        img_url = download_image(urljoin(article_url, image_url)) if image_url else None
+
+        return content, date, img_url
+
+    except TimeoutException:
+        print("Timed out waiting for page elements to load")
+        return "", "", None
+    except Exception as e:
+        print(f"An error occurred while scraping article details: {str(e)}")
+        return "", "", None
+
+def sanitize_filename(filename):
+    return re.sub(r'[^\w\s-]', '', filename).strip().lower().replace(' ', '_')
+
+def scrape_category(category_url, num_articles):
+    print("Attempting to scrape:", category_url)
+    articles_data = []
+    wd.get(category_url)
+    scroll_page(num_articles)
+
+    soup = BeautifulSoup(wd.page_source, 'html.parser')
+    articles = soup.find_all('div', class_='overlay card')
+    for article in articles[:num_articles]:
+        link_tag = article.find('a', class_='stretched-link')
+        link = link_tag['href'] if link_tag else ""
+        title_tag = article.find('h3', class_='card-title')
+        title = title_tag.get_text().strip() if title_tag else ""
+        content, date, img_url = scrape_article_details(link)
+        article_data = {
+            "Title": title,
+            "Date": date,
+            "Category": category_url.split('/')[-1],
+            "Content": content,
+            "Link": link,
+            "Image": img_url
+        }
+        print(f"Scraping article: {title}, Link: {link}")
+        articles_data.append(article_data)
+
+    # Save scraped data to a CSV file
+    category_name = sanitize_filename(category_url.split("/")[-1])
+    csv_file_path = os.path.join(os.getcwd(), f'{category_name}_data_fr.csv')
+    file_mode = 'a' if os.path.exists(csv_file_path) else 'w'
+
+    try:
+        with open(csv_file_path, file_mode, newline='', encoding='utf-8') as file:
+            fieldnames = ["Title", "Date", "Category", "Content", "Link", "Image"]
+            writer = csv.DictWriter(file, fieldnames=fieldnames)
+            if file_mode == 'w':
+                writer.writeheader()
+            for article in articles_data:
+                writer.writerow(article)
+        print(f"CSV file saved successfully at {csv_file_path}")
+    except IOError as e:
+        print(f"Failed to save file at {csv_file_path}: {e}")
+        return None  # Return None to indicate failure
+
+    # Check if the file exists before uploading
+    if os.path.exists(csv_file_path):
+        print(f"File successfully created at {csv_file_path}")
+        return csv_file_path
+    else:
+        print(f"Failed to create file for {category_url}")
+        return None
liberation_fr.py
ADDED
@@ -0,0 +1,86 @@
+# Import required libraries
+from bs4 import BeautifulSoup
+import requests
+import pandas as pd
+import time
+import timeit
+
+# Headers for simulating a browser request
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+}
+
+def faire_requete(url):
+    """
+    Make an HTTP request with error handling
+
+    Args:
+        url (str): the URL of the HTTP request
+
+    Returns:
+        bytes or None: the response content if the request succeeds, otherwise None.
+    """
+    try:
+        with requests.get(url, headers=headers) as reponse:
+            reponse.raise_for_status()
+            return reponse.content
+    except requests.RequestException as e:
+        print(f"HTTP request error: {e}")
+        return None
+
+def extract_articles(category_url, num_articles):
+    temps_debut = timeit.default_timer()
+    liens_articles = []
+    current_count = 0
+
+    while current_count < num_articles:
+        time.sleep(2)
+        contenu = faire_requete(category_url + f"?start={current_count}&order=")
+
+        if contenu:
+            soup = BeautifulSoup(contenu, "html.parser")
+            liens = soup.find_all("h3", {"class": "titre_article"})
+            for lien in liens:
+                if current_count >= num_articles:
+                    break
+                liens_articles.append("https://www.libe.ma" + lien.a["href"])
+                current_count += 1
+
+    lignes = []
+    for lien in liens_articles:
+        time.sleep(2)
+        contenu = faire_requete(lien)
+        if contenu:
+            soup = BeautifulSoup(contenu, "html.parser")
+            try:
+                titre = soup.find("h1", {"class": "access"}).text.replace("\n", "").strip()
+            except:
+                titre = None
+            try:
+                description = soup.find("div", {"class": "access firstletter"}).text.replace("\n", "").strip()
+            except:
+                description = None
+            try:
+                date = soup.find("div", {"class": "date"}).text.replace("\n", "").strip()
+            except:
+                date = None
+            lignes.append([titre, description, date])
+
+    return lignes
+
+def scrape_category(category_url, num_articles):
+    article_data = extract_articles(category_url, num_articles)
+
+    colonnes = ["titre", "content", "date"]
+    articles_df = pd.DataFrame(article_data, columns=colonnes)
+
+    csv_file_path = "liberation_art.csv"
+    articles_df.to_csv(csv_file_path, index=False)
+
+    return csv_file_path
+'''
+if __name__ == "__main__":
+    category_url = "https://www.libe.ma/Economie_r10.html"
+    num_articles = 10  # Number of articles to scrape
+    csv_file_path = scrape_category(category_url, num_articles)
+    # Now, csv_file_path can be used in Streamlit for uploading
+'''