|
|
|
# Standard library
import json
import logging
import os
import re
import subprocess
from urllib.parse import urlparse

# Data handling and profiling
import pandas as pd
from ydata_profiling import ProfileReport

# Streamlit UI
import streamlit as st
from st_aggrid import AgGrid
from streamlit_pandas_profiling import st_profile_report

# News fetching and parsing
import requests
from bs4 import BeautifulSoup
from gnews import GNews
from newspaper import Article, Config

# Language detection, translation, and text processing
import nltk
from deep_translator import GoogleTranslator
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Sentiment analysis
import torch
from textblob import TextBlob
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Visualization
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud

# Download the NLTK resources used for tokenization, stopword removal, and lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
|
|
|
|
|
|
|
st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':newspaper:')
|
|
|
with st.container(): |
|
|
|
    st.title('News Article Scraping')
|
st.write("Created by Bayhaqy") |
|
|
|
|
|
|
|
# Log processing errors to a local file instead of surfacing them in the UI.
logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
@st.cache_resource |
|
def get_models_and_tokenizers(): |
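    """Load the DistilBERT SST-2 classifier and its tokenizer once and cache them for the session."""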
|
model_name = 'distilbert-base-uncased-finetuned-sst-2-english' |
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) |
|
|
|
|
|
return model, tokenizer |
|
|
|
|
|
@st.cache_data
def analyze_sentiment_distilbert(text, _model, _tokenizer):
    """Classify sentiment with DistilBERT and derive a rough trust status from the positive-class probability."""
    try:
        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = _model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = _model.config.id2label[predicted_class_id]

        # Positive-class probability, expressed as a percentage.
        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if softmaxed > 70:
            status = 'Not trusted'
        elif softmaxed > 40:
            status = 'Not sure'
        else:
            status = 'Trusted'
        return status, predict

    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'
|
|
|
|
|
@st.cache_data |
|
def analyze_sentiment_vader(text): |
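    """Label text Positive/Negative/Neutral from VADER's compound score (±0.05 thresholds)."""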
|
analyzer = SentimentIntensityAnalyzer() |
|
sentiment = analyzer.polarity_scores(text) |
|
compound_score = sentiment['compound'] |
|
if compound_score >= 0.05: |
|
return 'Positive' |
|
elif compound_score <= -0.05: |
|
return 'Negative' |
|
else: |
|
return 'Neutral' |
|
|
|
|
|
@st.cache_data |
|
def analyze_sentiment_textblob(text): |
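    """Label text Positive/Negative/Neutral from the sign of TextBlob's polarity score."""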
|
analysis = TextBlob(text) |
|
polarity = analysis.sentiment.polarity |
|
if polarity > 0: |
|
return 'Positive' |
|
elif polarity < 0: |
|
return 'Negative' |
|
else: |
|
return 'Neutral' |
|
|
|
|
|
|
|
@st.cache_data |
|
def process_article(url, _config): |
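    """Download and parse one article; return its extracted fields, or None if it is unusable."""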
|
try: |
|
article = Article(url=url, config=_config) |
|
article.download() |
|
article.parse() |
|
|
|
|
|
if article.publish_date is None: |
|
return None |
|
|
|
|
|
if len(article.text) <= 5: |
|
return None |
|
|
|
|
|
text = article.text |
|
url = article.canonical_link |
|
source_url = urlparse(url).netloc |
|
|
|
title = article.title |
|
authors = article.authors |
|
|
|
publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M') |
|
|
|
article.nlp() |
|
keywords = article.meta_keywords |
|
summary = article.summary |
|
|
|
language = detect(title) |
|
|
|
return publish_date, language, url, source_url, title, authors, keywords, text, summary |
|
|
|
except Exception as e: |
|
logging.error(f"Article processing error: {str(e)}") |
|
return None |
|
|
|
|
|
@st.cache_data |
|
def translate_text(text, source='auto', target='en'): |
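    """Translate text to the target language with GoogleTranslator; return the original text on failure."""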
|
try: |
|
if source != target: |
|
text = GoogleTranslator(source=source, target=target).translate(text) |
|
return text |
|
|
|
except Exception as e: |
|
logging.error(f"Translation error: {str(e)}") |
|
return text |
|
|
|
|
|
|
|
@st.cache_data |
|
def preprocessing_data(df): |
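    """Deduplicate on the 'Translation' column and add a cleaned, lemmatized version of it."""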
|
|
|
df = df.drop_duplicates(subset='Translation') |
|
|
|
|
|
df.reset_index(inplace=True,drop=True) |
|
|
|
|
|
def clean_text(text): |
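        """Strip URLs and non-letters, lowercase, remove English stopwords, and lemmatize."""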
|
|
|
text = re.sub(r'http\S+', '', text) |
|
|
|
|
|
text = text.lower() |
|
|
|
|
|
text = re.sub(r'[^a-zA-Z\s]', '', text) |
|
|
|
|
|
words = nltk.word_tokenize(text) |
|
|
|
|
|
stop_words = set(stopwords.words('english')) |
|
words = [word for word in words if word not in stop_words] |
|
|
|
|
|
lemmatizer = WordNetLemmatizer() |
|
words = [lemmatizer.lemmatize(word) for word in words] |
|
|
|
return ' '.join(words) |
|
|
|
|
|
df['Cleaned Translation'] = df['Translation'].apply(clean_text) |
|
|
|
return df |
|
|
|
|
|
|
|
@st.cache_data |
|
def create_wordcloud(df): |
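    """Render a word cloud of the cleaned translations directly into the Streamlit page."""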
|
|
|
text = ' '.join(df['Cleaned Translation']) |
|
|
|
|
|
wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text) |
|
|
|
|
|
wordcloud_image = wordcloud.to_image() |
|
|
|
|
|
st.image(wordcloud_image, use_column_width=True) |
|
|
|
|
|
with st.container(): |
|
|
|
    search_term = st.text_input('Enter a search term:', 'Indonesia')

    col1, col2, col3 = st.columns(3)

    with col1:
        period = st.text_input('Enter a news period:', '7d')
        max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
    with col2:
        country = st.text_input('Country:', 'Indonesia')
        language = st.text_input('Language:', 'indonesian')
    with col3:
        start_date = st.date_input('Start Date:', pd.to_datetime('2023-01-01'))
        end_date = st.date_input('End Date:', pd.to_datetime('2023-12-01'))
|
|
|
|
|
with st.container(): |
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
|
|
include_translation = st.checkbox("Include Translation", value=True) |
|
include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=True) |
|
with col2: |
|
include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=True) |
|
include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=True) |
|
|
|
|
|
|
|
data_processed = False  # Becomes True after a successful fetch, enabling the download/table/EDA sections.
|
|
|
|
|
|
|
# Configure newspaper3k downloads: a large thread pool and a 10-second request timeout.
config = Config()
config.number_threads = 500
config.request_timeout = 10
|
|
|
|
|
|
|
# Master DataFrame that accumulates one row per successfully processed article.
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title', 'Authors', 'Keywords', 'Text', 'Summary'])
|
|
|
|
|
model, tokenizer = get_models_and_tokenizers()  # Loaded once, then served from Streamlit's cache.
|
|
|
|
|
with st.container(): |
|
|
|
if st.button('Fetch and Process News'): |
|
|
|
        # Build a GNews client from the user's inputs.
        google_news = GNews()
|
google_news.period = period |
|
google_news.max_results = max_results |
|
google_news.country = country |
|
google_news.language = language |
|
|
|
google_news.start_date = (start_date.year, start_date.month, start_date.day) |
|
google_news.end_date = (end_date.year, end_date.month, end_date.day) |
|
|
|
news = google_news.get_news(search_term) |
|
|
|
|
|
|
|
progress_bar = st.progress(0) |
|
total_news = len(news) |
|
|
|
|
|
|
|
        # Process each article and append its extracted fields to the DataFrame.
        for idx, x in enumerate(news):
            result = process_article(x['url'], _config=config)
            if result is not None:
                publish_date, language, url, source_url, title, authors, keywords, text, summary = result

                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
                                        'Text': [text], 'Summary': [summary]})
                df = pd.concat([df, temp_df], ignore_index=True)

            progress_bar.progress((idx + 1) / total_news)

        # Convert the publish dates once, after the loop, instead of on every iteration.
        df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])
|
|
|
|
|
        if include_translation:
            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
        else:
            # Downstream cleaning and sentiment steps expect a 'Translation' column,
            # so fall back to the untranslated title and summary.
            df['Translation'] = df['Title'] + ' | ' + df['Summary']

        df = preprocessing_data(df)
|
|
|
|
|
if include_sentiment_analysis: |
|
df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer))) |
|
|
|
|
|
|
|
if include_sentiment_vader: |
|
df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader) |
|
|
|
|
|
if include_sentiment_textblob: |
|
df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob) |
|
|
|
|
|
data_processed = True |
|
|
|
|
|
|
|
if data_processed: |
|
st.markdown("### Download Processed Data as CSV") |
|
st.write("Click the button below to download the processed data as a CSV file.") |
|
|
|
|
|
csv_data = df.to_csv(index=False).encode() |
|
st.download_button( |
|
label="Download CSV", |
|
data=csv_data, |
|
file_name="processed_data.csv", |
|
) |
|
|
|
|
|
with st.expander("See the Table"):
|
|
|
if data_processed: |
|
AgGrid(df, height=400) |
|
|
|
|
|
|
|
with st.expander("See the Exploratory Data Analysis"):
|
if data_processed: |
|
col1, col2 = st.columns(2) |
|
with col1: |
|
|
|
|
|
            # Plot fake-check counts only when the DistilBERT analysis produced the column.
            if 'Fake Check' in df.columns:
                FakeCheck_counts = df['Fake Check'].value_counts().reset_index()
                FakeCheck_counts.columns = ['Fake Check', 'News Count']
                fig = px.bar(FakeCheck_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
                st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
try: |
|
st.write('WordCloud for News') |
|
create_wordcloud(df) |
|
except Exception as e: |
|
                logging.error(f"WordCloud generation failed, Translation column not available: {str(e)}")
|
|
|
|
|
|
|
with col2: |
|
|
|
|
|
language_counts = df['Language'].value_counts().reset_index() |
|
language_counts.columns = ['Language', 'News Count'] |
|
fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language') |
|
            st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
try: |
|
sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T |
|
sentiment_counts = sentiment_counts.reset_index() |
|
sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count') |
|
fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment') |
|
                st.plotly_chart(fig, use_container_width=True)
|
|
|
except Exception as e: |
|
                logging.error(f"Sentiment columns not available: {str(e)}")
|
|
|
|
|
|
|
with st.expander("See the Analysis with ydata-profiling"):
|
|
|
|
|
if data_processed: |
|
pr = ProfileReport(df) |
|
st_profile_report(pr) |