# News article retrieval and parsing
from newspaper import Article, Config
from gnews import GNews
# Data Analysis and Profiling
import pandas as pd
from ydata_profiling import ProfileReport
from st_aggrid import AgGrid
# Streamlit for Building the Dashboard
import streamlit as st
from streamlit_pandas_profiling import st_profile_report
# Language Detection
from langdetect import detect
# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
# URL Parsing
from urllib.parse import urlparse
# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt
# Word Cloud Generation
from wordcloud import WordCloud
# Other Libraries
import torch
import requests
import subprocess
import logging
import json
import re
import os
# NLTK Data Download
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
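# Note: nltk.download() is a no-op once the corpora are already cached locally;
# passing quiet=True (e.g. nltk.download('stopwords', quiet=True)) keeps reruns silent.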
## ............................................... ##
# Set the page configuration (must be the first Streamlit call, made only once)
st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':newspaper:')
with st.container():
# Initialize Streamlit app
    st.title('News Article Scraping')
st.write("Created by Bayhaqy")
## ............................................... ##
# Set up logging
logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
## ............................................... ##
# Function to load the model and tokenizer
@st.cache_resource
def get_models_and_tokenizers():
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
#model.eval()
return model, tokenizer
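# st.cache_resource keeps a single model/tokenizer pair in memory across reruns,
# so the DistilBERT checkpoint is downloaded and loaded only on the first call:
#   model, tokenizer = get_models_and_tokenizers()  # later calls hit the cache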
# Function for DistilBERT sentiment analysis with a rough trust check
@st.cache_resource
def analyze_sentiment_distilbert(text, _model, _tokenizer):
try:
tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
with torch.no_grad():
raw_predictions = _model(**tokens_info).logits
predicted_class_id = raw_predictions.argmax().item()
predict = _model.config.id2label[predicted_class_id]
softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
if (softmaxed > 70):
status = 'Not trust'
elif (softmaxed > 40):
status = 'Not sure'
else:
status = 'Trust'
return status, predict
except Exception as e:
logging.error(f"Sentiment analysis error: {str(e)}")
return 'N/A', 'N/A'
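# The leading underscore in _model and _tokenizer tells Streamlit's cache not to
# hash those arguments (models aren't reliably hashable); only `text` keys the
# cache. `softmaxed` above is the POSITIVE-class probability in percent, which
# the thresholds reinterpret as a rough trust score.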
# Function for sentiment analysis using VADER
@st.cache_data
def analyze_sentiment_vader(text):
analyzer = SentimentIntensityAnalyzer()
sentiment = analyzer.polarity_scores(text)
compound_score = sentiment['compound']
if compound_score >= 0.05:
return 'Positive'
elif compound_score <= -0.05:
return 'Negative'
else:
return 'Neutral'
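# Illustrative behaviour (actual scores depend on the VADER lexicon):
#   analyze_sentiment_vader('What a great article!')    # -> 'Positive'
#   analyze_sentiment_vader('The report was terrible.') # -> 'Negative'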
# Function for sentiment analysis using TextBlob
@st.cache_data
def analyze_sentiment_textblob(text):
analysis = TextBlob(text)
polarity = analysis.sentiment.polarity
if polarity > 0:
return 'Positive'
elif polarity < 0:
return 'Negative'
else:
return 'Neutral'
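# TextBlob polarity ranges from -1.0 (most negative) to 1.0 (most positive);
# an exact 0.0, common for short factual headlines, maps to 'Neutral' here.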
## ............................................... ##
# Function to process an article
@st.cache_data
def process_article(url, _config):
try:
article = Article(url=url, config=_config)
article.download()
article.parse()
# Check if publish_date is not None before further processing
if article.publish_date is None:
return None # Skip processing and return None
        # Skip articles with little or no extracted text
if len(article.text) <= 5:
return None # Skip processing and return None
        # Both checks passed, so extract the article data
text = article.text
url = article.canonical_link
source_url = urlparse(url).netloc
title = article.title
authors = article.authors
#publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')
article.nlp()
keywords = article.meta_keywords
summary = article.summary
language = detect(title)
return publish_date, language, url, source_url, title, authors, keywords, text, summary
except Exception as e:
logging.error(f"Article processing error: {str(e)}")
return None # Skip processing and return None
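# The returned tuple mirrors the DataFrame columns built further below
# (Publish_Date, Language, URL, Source_Url, Title, Authors, Keywords, Text, Summary);
# returning None tells the caller to skip the article.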
# Function for translation
@st.cache_data
def translate_text(text, source='auto', target='en'):
try:
if source != target:
text = GoogleTranslator(source=source, target=target).translate(text)
return text
except Exception as e:
logging.error(f"Translation error: {str(e)}")
return text
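# Illustrative usage (hypothetical strings; deep-translator accepts the ISO
# language codes that langdetect produces):
#   translate_text('Selamat pagi', source='id', target='en')  # -> 'Good morning'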
## ............................................... ##
# Function to preprocess the data
@st.cache_data
def preprocessing_data(df):
# Remove duplicates
df = df.drop_duplicates(subset='Translation')
    # Reset the index after dropping duplicates
df.reset_index(inplace=True,drop=True)
# Function to clean and preprocess text
def clean_text(text):
# Remove URLs
text = re.sub(r'http\S+', '', text)
# Convert to lowercase
text = text.lower()
# Remove non-alphanumeric characters
text = re.sub(r'[^a-zA-Z\s]', '', text)
# Tokenize text
words = nltk.word_tokenize(text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
words = [word for word in words if word not in stop_words]
# Lemmatize words
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]
return ' '.join(words)
# Apply the clean_text function to the "Translation" column
df['Cleaned Translation'] = df['Translation'].apply(clean_text)
return df
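# Sketch of what clean_text does to one value (assuming English stopwords):
#   'Breaking: the markets rally https://example.com' -> 'breaking market rally'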
## ............................................... ##
# Function to create a Word Cloud
@st.cache_data
def create_wordcloud(df):
# Combine all text
text = ' '.join(df['Cleaned Translation'])
# Create a Word Cloud
wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text)
# Convert the word cloud to an image
wordcloud_image = wordcloud.to_image()
# Display the Word Cloud using st.image
st.image(wordcloud_image, use_column_width=True)
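# use_column_width=True scales the rendered image to the Streamlit column;
# the width/height passed to WordCloud only set the canvas resolution.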
## ............................................... ##
with st.container():
# Input search parameters
    search_term = st.text_input('Enter a search term:', 'Indonesia')
col1, col2, col3 = st.columns(3)
with col1:
        period = st.text_input('Enter a news period:', '7d')
        max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
with col2:
        country = st.text_input('Country:', 'Indonesia')
        language = st.text_input('Language:', 'indonesian')
with col3:
        start_date = st.date_input('Start Date:', pd.to_datetime('2023-01-01'))
        end_date = st.date_input('End Date:', pd.to_datetime('2023-12-01'))
## ............................................... ##
with st.container():
col1, col2 = st.columns(2)
with col1:
# Checkbox options for different processing steps
include_translation = st.checkbox("Include Translation", value=True)
include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=True)
with col2:
include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=True)
include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=True)
## ............................................... ##
# Create a variable to track whether the data has been processed
data_processed = False
## ............................................... ##
# Create a custom configuration
config = Config()
config.number_threads = 500
config.request_timeout = 10
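# request_timeout (seconds) bounds each article.download() call below;
# number_threads only matters for newspaper's thread-pool helpers (news_pool),
# which this script does not use.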
## ............................................... ##
# Initialize the DataFrame
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title', 'Authors', 'Keywords', 'Text', 'Summary'])
# Initialize your model and tokenizer
model, tokenizer = get_models_and_tokenizers()
## ............................................... ##
with st.container():
# Fetch news and process articles
if st.button('Fetch and Process News'):
        # Configure the GNews client from the user inputs above
        google_news = GNews()
        google_news.period = period  # news period, e.g. '7d' for the last seven days
        google_news.max_results = max_results  # maximum number of results for the keyword
        google_news.country = country  # restrict results to a specific country
        google_news.language = language  # restrict results to a specific language
        #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # exclude specific websites, e.g. yahoo.com and cnn.com
        google_news.start_date = (start_date.year, start_date.month, start_date.day)  # search window start
        google_news.end_date = (end_date.year, end_date.month, end_date.day)  # search window end
        news = google_news.get_news(search_term)
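        # get_news() returns a list of dicts with keys such as 'title', 'url',
        # 'published date' and 'publisher'; only 'url' is used below.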
        ## ............................................... ##
# Progress bar for fetching and processing news
progress_bar = st.progress(0)
total_news = len(news)
        # Iterate over the fetched results; each item is a dict with a 'url' key
        for idx, x in enumerate(news):
result = process_article(x['url'], _config=config)
if result is not None:
publish_date, language, url, source_url, title, authors, keywords, text, summary = result
# Insert to dataframe
temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
'Text': [text], 'Summary': [summary]})
df = pd.concat([df, temp_df], ignore_index=True)
# Update the progress bar
progress = (idx + 1) / total_news
progress_bar.progress(progress)
        # Convert 'Publish_Date' to a datetime dtype once, after the loop
        df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])
        # Conditionally apply the translation function, then preprocess the result
        # (preprocessing_data reads the 'Translation' column, so it only runs when translation is enabled)
        if include_translation:
            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
            df = preprocessing_data(df)
# Conditionally apply sentiment analysis function to the 'Translation' column
if include_sentiment_analysis:
df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))
# Conditionally apply VADER sentiment analysis to the 'Translation' column
if include_sentiment_vader:
df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)
# Conditionally apply TextBlob sentiment analysis to the 'Translation' column
if include_sentiment_textblob:
df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
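        # Note: all three sentiment steps read the 'Translation' column, so they
        # assume the "Include Translation" checkbox above stays enabled.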
# Set data_processed to True when the data has been successfully processed
data_processed = True
## ............................................... ##
# Add a button to download the data as a CSV file
if data_processed:
st.markdown("### Download Processed Data as CSV")
st.write("Click the button below to download the processed data as a CSV file.")
# Create a downloadable link
csv_data = df.to_csv(index=False).encode()
st.download_button(
label="Download CSV",
data=csv_data,
file_name="processed_data.csv",
)
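    # df.to_csv(index=False).encode() yields UTF-8 bytes, which st.download_button
    # accepts directly; an optional mime='text/csv' argument would label the file
    # type explicitly for the browser.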
## ............................................... ##
with st.expander("See for Table"):
# Display processed data
if data_processed:
AgGrid(df, height=400)
## ............................................... ##
# Display processed data
with st.expander("See for Exploratory Data Analysis"):
if data_processed:
col1, col2 = st.columns(2)
with col1:
## ............................................... ##
            # Create a DataFrame to count the number of news articles by Fake Check label
FakeCheck_counts = df['Fake Check'].value_counts().reset_index()
FakeCheck_counts.columns = ['Fake Check', 'News Count']
fig = px.bar(FakeCheck_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)
## ............................................... ##
# Create wordcloud
try:
st.write('WordCloud for News')
create_wordcloud(df)
except Exception as e:
logging.error(f" Column Translation Not Available : {str(e)}")
## ............................................... ##
with col2:
## ............................................... ##
# Create a DataFrame to count the number of News by language
language_counts = df['Language'].value_counts().reset_index()
language_counts.columns = ['Language', 'News Count']
fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language')
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)
## ............................................... ##
# Group by Sentiment columns and get the count
try:
sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
sentiment_counts = sentiment_counts.reset_index()
sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
                fig.update_layout(height=400)
                st.plotly_chart(fig, use_container_width=True)
except Exception as e:
logging.error(f" Column Sentiment Not Available : {str(e)}")
## ............................................... ##
with st.expander("See for Analysis with ydata-profiling"):
## ............................................... ##
# Display processed data
if data_processed:
pr = ProfileReport(df)
st_profile_report(pr)
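        # For large result sets, ProfileReport(df, minimal=True) skips the most
        # expensive computations (e.g. correlations) and renders much faster;
        # minimal mode is a standard ydata-profiling option, not used here.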