Spaces:

Santarabantoosoo
/

Sentiments_topic_modeling_ITALIAN

Runtime error

Sentiments_topic_modeling_ITALIAN / app.py

santarabantoosoo

asd

9943afd over 2 years ago

8.34 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import plotly.express as px
	from stop_words import get_stop_words
	from wordcloud import WordCloud
	from datasets import load_dataset
	import re

	## Load the tweets and the Italian stop words from the Hugging Face Hub

	# Tweets: the "train" split is turned into a pandas DataFrame.
	dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
	data = pd.DataFrame.from_dict(dataset["train"])

	# Stop words: the "text" column of the "train" split, as a plain list.
	it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
	it_stop = pd.DataFrame.from_dict(it_stop_words["train"])["text"].to_list()

	## Optimize stop words according to Luca's repo

	def format_input(user_key, stopwords):
	    '''
	    format user input request to lookup in the database of frequencies

	    The text is lower-cased, every character that is neither a word
	    character nor whitespace is replaced by a space, and the tokens that
	    appear in stopwords are dropped.

	    input:
	    user_key is a string
	    stopwords is a collection of strings (list, tuple, set, ...)
	    output:
	    key is a string: the remaining tokens joined by single spaces
	    '''
	    # Hash-based membership: a list would be scanned once per token.
	    # A set passed by the caller is used as is, without copying.
	    if not isinstance(stopwords, (set, frozenset)):
	        stopwords = frozenset(stopwords)

	    key = re.sub(r'[^\w\s]', ' ', user_key.lower())

	    return ' '.join(el for el in key.split() if el not in stopwords)


	### Loading the word-frequency tables
	#
	# Two families of tables are loaded, one table per period:
	#   * TFIDF_*      -> "Long_Covid_word_frequency_TFIDF_*" datasets
	#   * whole_text_* -> "whole_text_TF_*" datasets
	# The "train" split of each dataset is converted to a pandas DataFrame.

	TFIDF_21_Jul_Oct = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")["train"])

	TFIDF_21_Nov_22_Jan = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")["train"])

	TFIDF_22_Feb_Apr = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")["train"])

	TFIDF_22_May_Jul = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")["train"])


	## Loading whole_text

	whole_text_21_Jul_Oct = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/whole_text_TF_21_Jul_Oct")["train"])

	whole_text_21_Nov_22_Jan = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")["train"])

	whole_text_22_Feb_Apr = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/whole_text_TF_22_Feb_Apr")["train"])

	whole_text_22_May_Jul = pd.DataFrame.from_dict(
	    load_dataset("Santarabantoosoo/whole_text_TF_22_May_Jul")["train"])


	## One Series per period, in CHRONOLOGICAL order
	#
	# transpose()[0] takes the row labelled 0 of each table as a Series indexed
	# by the table's column names (presumably the words -- confirm against the
	# datasets). The lists are plotted position by position as Q1..Q4 of
	# "July 2021 - July 2022", so the periods must be ordered
	# Jul-Oct 2021, Nov 2021-Jan 2022, Feb-Apr 2022, May-Jul 2022.

	ser_TFIDF = [
	    TFIDF_21_Jul_Oct.transpose()[0],
	    TFIDF_21_Nov_22_Jan.transpose()[0],
	    TFIDF_22_Feb_Apr.transpose()[0],
	    TFIDF_22_May_Jul.transpose()[0],
	]

	ser_whole_text = [
	    whole_text_21_Jul_Oct.transpose()[0],
	    whole_text_21_Nov_22_Jan.transpose()[0],
	    whole_text_22_Feb_Apr.transpose()[0],
	    whole_text_22_May_Jul.transpose()[0],
	]


	def plot_time_series(choice, keyword, user_keys, user_choice="selected method"):
	    '''
	    plot the frequency of each keyword across the time periods

	    input:
	    choice is a sequence of word -> frequency lookups (pandas Series
	    indexed by word, or dicts), one per period, in plotting order
	    keyword is a list of cleaned keywords used for the lookup
	    user_keys is a list of the raw user strings, aligned with keyword;
	    their lower-cased form is used as legend labels
	    user_choice is the name of the frequency source shown in the y-axis
	    label (optional)
	    output:
	    fig is a matplotlib Figure
	    '''
	    # One x position and one "Qn" tick label per period: 2, 4, 6, ...
	    n_periods = len(choice)
	    x = np.arange(2, 2 * n_periods + 2, 2)
	    x_ticks_labels = [f'Q{i}' for i in range(1, n_periods + 1)]

	    fig, ax = plt.subplots(1, 1)

	    for word, label in zip(keyword, user_keys):
	        # A word that is missing from a period's table counts as 0.0.
	        y = np.array([period.get(word, 0.0) for period in choice])
	        ax.plot(x, y, label=label.lower())

	    # Set number of ticks for x-axis
	    ax.set_xticks(x)
	    ax.set_xticklabels(x_ticks_labels, fontsize=12)

	    # Everything is drawn through this figure's own Axes instead of the
	    # pyplot "current axes", which is process-wide state.
	    ax.legend(loc='best')
	    ax.set_xlabel('Time')
	    ax.set_title("keywords quartely analysis (July 2021 - July 2022)")
	    ax.set_ylabel(f'Freq. from {user_choice}')
	    return fig


	# Word clouds, one per emotion (anger, sadness, joy, fear)

	# Tokens ignored by every cloud: URL fragments, the retweet marker and the
	# Italian stop words. Built once, shared by the four clouds.
	stop_words = ["https", 'http', "co", "RT"] + list(it_stop)


	def _clean_tweets(emotion):
	    '''Return the tweets labelled with emotion, cleaned by format_input.'''
	    selected = data['tweet'][data["emotion"] == emotion]
	    return selected.apply(format_input, args=[it_stop])


	def _make_wordcloud(tweets):
	    '''Build a WordCloud from the full text of every tweet in tweets.'''
	    # The tweets are joined explicitly: str(tweets) would give the printed
	    # form of the Series (index labels, "..." truncation, dtype footer)
	    # instead of the tweet texts.
	    cloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words)
	    return cloud.generate(" ".join(tweets))


	# Wordcloud with anger tweets
	angry_tweets = _clean_tweets('anger')
	anger_wordcloud = _make_wordcloud(angry_tweets)

	# Wordcloud with sad tweets
	sad_tweets = _clean_tweets('sadness')
	sad_wordcloud = _make_wordcloud(sad_tweets)

	# Wordcloud with joy tweets
	joy_tweets = _clean_tweets('joy')
	joy_wordcloud = _make_wordcloud(joy_tweets)

	# Wordcloud with fear tweets
	fear_tweets = _clean_tweets('fear')
	fear_wordcloud = _make_wordcloud(fear_tweets)

	## Combine the four word clouds in a single 2x2 figure

	wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

	# Top row: sadness, joy. Bottom row: fear, anger.
	for _ax, _cloud, _title in (
	    (ax1, sad_wordcloud, 'Sadness'),
	    (ax2, joy_wordcloud, 'Joy'),
	    (ax3, fear_wordcloud, 'Fear'),
	    (ax4, anger_wordcloud, 'Anger'),
	):
	    _ax.imshow(_cloud, interpolation="bilinear")
	    _ax.axis("off")
	    _ax.set_title(_title, {'fontsize': 30})

	# Called last on purpose: tight_layout only makes room for what is already
	# on the figure, so it has to run after the images and titles are added.
	wc_fig.tight_layout()


	# Line plot: number of tweets per date, one line per emotion.
	# The 'id' column is counted within each (date, emotion) group.
	number_tweets_per_day = (
	    data.groupby(['date', 'emotion'])
	    .agg({'id': 'count'})
	    .reset_index()
	)
	number_tweets_per_day["tweet_date"] = pd.to_datetime(number_tweets_per_day["date"])

	time_fig = px.line(
	    number_tweets_per_day,
	    x='tweet_date',
	    y='id',
	    labels={'id': 'count'},
	    color='emotion',
	    color_discrete_sequence=px.colors.qualitative.G10,
	)

	# Pie chart: share of tweets within each emotion.
	# The group size is stored under 'id', then the column is renamed 'count'.
	sentiment_counts = (
	    data.groupby('emotion')
	    .agg({'id': 'size'})
	    .reset_index()
	    .rename(columns={'id': 'count'})
	)
	sent_fig = px.pie(
	    sentiment_counts,
	    values='count',
	    names='emotion',
	    title='Tweets within each emotion',
	    labels={'id': 'count'},
	    color_discrete_sequence=px.colors.qualitative.G10,
	)

	def display_plot(image_choice):
	    '''
	    return the pre-built figure that matches the selected plot name

	    input:
	    image_choice is a string: 'Sentiment distribution', 'Time series'
	    or 'Word clouds'
	    output:
	    the matching figure, or None when image_choice is none of the above
	    '''
	    available = (
	        ('Sentiment distribution', sent_fig),
	        ('Time series', time_fig),
	        ('Word clouds', wc_fig),
	    )

	    for plot_name, figure in available:
	        if image_choice == plot_name:
	            return figure

	    return None

	def display_freq_plot(choice, *args):
	    '''
	    plot the frequency over time of the words typed by the user

	    input:
	    choice is a string: "TFIDF" or "Whole_text"
	    args are the user strings, one per text box; empty, blank or None
	    entries are ignored
	    output:
	    the figure returned by plot_time_series, or None when choice is
	    neither "TFIDF" nor "Whole_text"
	    '''
	    # Skip the boxes left empty: they would otherwise be drawn as flat
	    # zero lines, and a None value cannot be cleaned by format_input.
	    user_keys = [arg for arg in args if arg and arg.strip()]

	    # clean input strings to match keywords in the database
	    keyword = [format_input(key, it_stop) for key in user_keys]

	    if choice == "TFIDF":
	        return plot_time_series(ser_TFIDF, keyword, user_keys)

	    if choice == "Whole_text":
	        return plot_time_series(ser_whole_text, keyword, user_keys)

	    return None


	# User interface: three tabs inside one Blocks app.
	with gr.Blocks() as demo:
	    gr.Markdown("## Choose your adventure")

	    with gr.Tabs():

	        # Placeholder tab.
	        with gr.TabItem("Topic modeling"):
	            gr.Markdown("Nothing here yet")

	        # Frequency method selector followed by four keyword boxes;
	        # the button sends all five values to display_freq_plot.
	        with gr.TabItem("Word frequency"):
	            inputs = [gr.Radio(choices=['TFIDF', 'Whole_text'], label='Choose ur method')]
	            inputs += [
	                gr.Textbox(label=box_label)
	                for box_label in ('word 1', 'word 2', 'word 3', 'word 4')
	            ]
	            plot_output = gr.Plot(elem_id=1)
	            freq_button = gr.Button("Submit")

	            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

	        # Plot selector; the button sends the selection to display_plot.
	        with gr.TabItem("Sentiment analysis"):
	            text_input = gr.Radio(
	                choices=['Sentiment distribution', 'Word clouds', 'Time series'],
	                label='Choose ur plot',
	            )
	            sent_plot = gr.Plot()
	            sent_button = gr.Button("Submit")

	            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)


	demo.launch(debug=True, show_error=True)