"""Gradio dashboard for exploring Italian long-COVID tweets.

The script downloads a tweet dataset, an Italian stop-word list and
pre-computed per-period word-frequency tables from the Hugging Face Hub,
builds the static sentiment figures once at start-up, and serves an
interactive UI with a word-frequency tab and a sentiment-analysis tab.
"""
import re

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from stop_words import get_stop_words  # noqa: F401  (kept: was imported by the original script)
from wordcloud import WordCloud


def _load_train_frame(repo_id):
    """Download dataset *repo_id* from the Hub and return its "train" split as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo_id)["train"])


## import data
data = _load_train_frame("Santarabantoosoo/italian_long_covid_tweets")

# load stop words
it_stop = _load_train_frame("Santarabantoosoo/italian-stopwords").text.to_list()
# Set copy of the stop words: membership is tested once per token of every tweet.
it_stop_set = frozenset(it_stop)

# Anything that is neither a word character nor whitespace is treated as punctuation.
_PUNCTUATION = re.compile(r'[^\w\s]')


## Optimize stop words according to Luca's repo
def format_input(user_key, stopwords):
    """Normalise a piece of text so it can be looked up in the frequency tables.

    The text is lower-cased, every punctuation character is replaced by a
    space, and tokens found in *stopwords* are dropped.

    Args:
        user_key: The raw string (a user keyword or a tweet).
        stopwords: Container of lower-case tokens to remove; any object
            supporting ``in`` works (a set is fastest).

    Returns:
        The remaining tokens joined by single spaces (possibly ``""``).
    """
    key = _PUNCTUATION.sub(' ', user_key.lower())
    return ' '.join(token for token in key.split() if token not in stopwords)


### Loading TFIDF
TFIDF_21_Jul_Oct = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_22_Feb_Apr = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")
TFIDF_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")

## Loading whole_text
whole_text_21_Jul_Oct = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_22_Feb_Apr = _load_train_frame("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_train_frame("Santarabantoosoo/whole_text_TF_22_May_Jul")
whole_text_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")

# One Series per period (first row of each table, indexed by the table's
# column names). The periods are listed in chronological order so that they
# line up with the Q1..Q4 tick labels of the plot; the original script
# appended Nov 21 - Jan 22 last, which drew the quarters out of order.
ser_TFIDF = [
    frame.transpose()[0]
    for frame in (
        TFIDF_21_Jul_Oct,
        TFIDF_21_Nov_22_Jan,
        TFIDF_22_Feb_Apr,
        TFIDF_22_May_Jul,
    )
]
ser_whole_text = [
    frame.transpose()[0]
    for frame in (
        whole_text_21_Jul_Oct,
        whole_text_21_Nov_22_Jan,
        whole_text_22_Feb_Apr,
        whole_text_22_May_Jul,
    )
]


def plot_time_series(choice, keyword, user_keys, user_choice=None):
    """Plot the per-period frequency of each keyword as one line per keyword.

    Args:
        choice: Sequence of four pandas Series (one per period, in plotting
            order), each indexed by word.
        keyword: Cleaned keywords used for the lookup in *choice*.
        user_keys: The raw user strings, parallel to *keyword*; used
            (lower-cased) as legend labels.
        user_choice: Optional name of the frequency method, shown in the
            y-axis label. The original code read an undefined global of this
            name and raised ``NameError`` on every call.

    Returns:
        The matplotlib Figure holding the plot.
    """
    x = np.arange(2, 10, 2)
    x_ticks_labels = ['Q1', 'Q2', 'Q3', 'Q4']

    fig, ax = plt.subplots(1, 1)
    for word, label in zip(keyword, user_keys):
        # A word missing from a period's table counts as frequency 0.
        y = np.array([series.get(word, 0.0) for series in choice])
        ax.plot(x, y, label=label.lower())

    # Set number of ticks for x-axis
    ax.set_xticks(x)
    ax.set_xticklabels(x_ticks_labels, fontsize=12)

    # Axes methods are used instead of the pyplot state machine so the
    # labels always land on this figure, whatever the "current" figure is.
    if ax.lines:
        ax.legend(loc='best')
    ax.set_xlabel('Time')
    ax.set_title("keywords quartely analysis (July 2021 - July 2022)")
    ax.set_ylabel(f'Freq. from {user_choice}' if user_choice else 'Freq.')
    return fig


# Extra tokens left over from URLs / retweet markers, plus the Italian stop words.
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)


def _emotion_wordcloud(emotion):
    """Return a WordCloud built from all cleaned tweets labelled *emotion*."""
    tweets = data['tweet'][data["emotion"] == emotion]
    # Join the actual tweets: str(series) would only give pandas' truncated
    # repr (a few rows, "..." and the "Name: tweet, dtype: object" footer).
    text = ' '.join(format_input(tweet, it_stop_set) for tweet in tweets)
    return WordCloud(
        max_font_size=50,
        max_words=50,
        background_color="white",
        stopwords=stop_words,
    ).generate(text)


# Wordclouds, one per emotion
anger_wordcloud = _emotion_wordcloud('anger')
sad_wordcloud = _emotion_wordcloud('sadness')
joy_wordcloud = _emotion_wordcloud('joy')
fear_wordcloud = _emotion_wordcloud('fear')

## Combine all plots in a single plot
wc_fig, wc_axes = plt.subplots(2, 2)
wc_panels = [
    ('Sadness', sad_wordcloud),
    ('Joy', joy_wordcloud),
    ('Fear', fear_wordcloud),
    ('Anger', anger_wordcloud),
]
for wc_ax, (wc_title, wc_cloud) in zip(wc_axes.flat, wc_panels):
    wc_ax.imshow(wc_cloud, interpolation="bilinear")
    wc_ax.axis("off")
    wc_ax.set_title(wc_title, {'fontsize': 30})
# Lay out after the titles exist so they are taken into account.
wc_fig.tight_layout()

# create a line plot of the number of tweets per day for each emotion
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
number_tweets_per_day["tweet_date"] = pd.to_datetime(number_tweets_per_day["date"])

time_fig = px.line(
    number_tweets_per_day,
    x='tweet_date',
    y='id',
    labels={'id': 'count'},
    color='emotion',
    color_discrete_sequence=px.colors.qualitative.G10,
)

# plot a pie plot for emotions' distribution
sentiment_counts = data.groupby('emotion').agg({'id': 'size'}).reset_index()
sentiment_counts.rename(columns={'id': 'count'}, inplace=True)
sent_fig = px.pie(
    sentiment_counts,
    values='count',
    names='emotion',
    title='Tweets within each emotion',
    labels={'id': 'count'},
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Radio label -> pre-built figure for the "Sentiment analysis" tab.
_SENTIMENT_FIGURES = {
    'Sentiment distribution': sent_fig,
    'Time series': time_fig,
    'Word clouds': wc_fig,
}

# Radio label -> per-period series for the "Word frequency" tab.
_FREQUENCY_SERIES = {
    'TFIDF': ser_TFIDF,
    'Whole_text': ser_whole_text,
}


def display_plot(image_choice):
    """Return the pre-built figure matching *image_choice*, or None if unknown."""
    return _SENTIMENT_FIGURES.get(image_choice)


def display_freq_plot(choice, *args):
    """Build the keyword frequency plot for the selected method.

    Args:
        choice: ``"TFIDF"`` or ``"Whole_text"``.
        *args: Raw keyword strings from the textboxes; blank ones are skipped
            so they do not add an unlabeled flat line to the plot.

    Returns:
        The matplotlib Figure, or None when *choice* is not a known method
        (for example when no radio option is selected).
    """
    series = _FREQUENCY_SERIES.get(choice)
    if series is None:
        return None

    user_keys = [arg for arg in args if arg and arg.strip()]
    # clean input strings to match keywords in the database
    keyword = [format_input(key, it_stop_set) for key in user_keys]
    return plot_time_series(series, keyword, user_keys, user_choice=choice)


with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")
    with gr.Tabs():
        with gr.TabItem("Topic modeling"):
            gr.Markdown("Nothing here yet")

        with gr.TabItem("Word frequency"):
            inputs = [
                gr.Radio(choices=['TFIDF', 'Whole_text'], label='Choose ur method'),
                gr.Textbox(label='word 1'),
                gr.Textbox(label='word 2'),
                gr.Textbox(label='word 3'),
                gr.Textbox(label='word 4'),
            ]
            plot_output = gr.Plot(elem_id=1)
            freq_button = gr.Button("Submit")
            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(
                choices=['Sentiment distribution', 'Word clouds', 'Time series'],
                label='Choose ur plot',
            )
            sent_plot = gr.Plot()
            sent_button = gr.Button("Submit")
            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)

demo.launch(debug=True)