# Hugging Face Spaces page status captured with the file (not part of the program):
# Spaces:
# Runtime error
# Runtime error
import re

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from matplotlib.figure import Figure
from stop_words import get_stop_words
from wordcloud import WordCloud
## import data
# Tweets are pulled from the Hugging Face Hub; only the "train" split is used.
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
data = pd.DataFrame.from_dict(dataset["train"])

# load stop words
# The stop-word dataset is reduced to a plain Python list taken from its
# "text" column.
it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
it_stop = pd.DataFrame.from_dict(it_stop_words["train"])["text"].to_list()
## Optimize stop words according to Luca's repo
def format_input(user_key, stopwords):
    """Normalize a user string so it can be looked up in the frequency tables.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and tokens found in
    ``stopwords`` are dropped. The remaining tokens are joined with single
    spaces.

    Args:
        user_key: the raw string typed by the user (or a tweet).
        stopwords: iterable of strings to remove; a ``set``/``frozenset`` is
            used as-is, anything else is converted to a set once per call.

    Returns:
        The cleaned key as a string (possibly empty).
    """
    # Set membership is O(1); the original tested every token against a list.
    if isinstance(stopwords, (set, frozenset)):
        stop_set = stopwords
    else:
        stop_set = set(stopwords)

    key = re.sub(r'[^\w\s]', ' ', user_key.lower())
    return ' '.join(token for token in key.split() if token not in stop_set)
def _load_train_frame(repo_id):
    """Download a Hub dataset and return its "train" split as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo_id)["train"])


### Loading TFIDF
TFIDF_21_Jul_Oct = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_22_Feb_Apr = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")
TFIDF_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")

## Loading whole_text
whole_text_21_Jul_Oct = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_22_Feb_Apr = _load_train_frame("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_train_frame("Santarabantoosoo/whole_text_TF_22_May_Jul")
whole_text_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")

# One Series per period (row 0 of each table, indexed by the table's column
# names). The lists are consumed positionally as Q1..Q4 on a time axis, so
# they must be in chronological order: the original appended Nov-Jan last,
# which drew the second quarter in the fourth slot.
ser_TFIDF = [
    frame.transpose()[0]
    for frame in (
        TFIDF_21_Jul_Oct,
        TFIDF_21_Nov_22_Jan,
        TFIDF_22_Feb_Apr,
        TFIDF_22_May_Jul,
    )
]
ser_whole_text = [
    frame.transpose()[0]
    for frame in (
        whole_text_21_Jul_Oct,
        whole_text_21_Nov_22_Jan,
        whole_text_22_Feb_Apr,
        whole_text_22_May_Jul,
    )
]
def _frequency_or_zero(series, word):
    """Return ``series[word]``, or 0.0 when the word is not present."""
    try:
        return series[word]
    except (KeyError, IndexError):
        return 0.0


def plot_time_series(choice, keyword, user_keys):
    """Plot one line per keyword across the periods in ``choice``.

    Args:
        choice: sequence of per-period lookup tables (in this file, pandas
            Series), one per period and in plotting order.
        keyword: cleaned keywords used for the lookup in each table.
        user_keys: the raw user strings, parallel to ``keyword``; their
            lower-cased form is used as the legend label.

    Returns:
        The matplotlib figure holding the plot. A keyword missing from a
        period is drawn as 0.0 for that period.
    """
    n_periods = len(choice)
    # Ticks at 2, 4, 6, ... labelled Q1, Q2, ... (four periods give the
    # original 2/4/6/8 and Q1..Q4).
    x = np.arange(2, 2 * n_periods + 2, 2)
    x_ticks_labels = [f'Q{i}' for i in range(1, n_periods + 1)]

    # Build the figure without pyplot so it is not kept alive in pyplot's
    # global registry after every request.
    fig = Figure()
    ax = fig.subplots(1, 1)

    for word, label in zip(keyword, user_keys):
        y = np.array([_frequency_or_zero(series, word) for series in choice])
        ax.plot(x, y, label=label.lower())

    # Set number of ticks for x-axis
    ax.set_xticks(x)
    ax.set_xticklabels(x_ticks_labels, fontsize=12)
    if keyword:
        # A legend with no labelled lines only triggers a warning.
        ax.legend(loc='best')
    ax.set_xlabel('Time')
    ax.set_title("keywords quarterly analysis (July 2021 - July 2022)")
    ax.set_ylabel(f'Freq. from {user_keys}')
    return fig
# Words never shown in the clouds: URL fragments, the retweet marker and the
# Italian stop words.
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)


def _emotion_wordcloud(emotion):
    """Return ``(cleaned_tweets, wordcloud)`` for the tweets tagged ``emotion``.

    The cloud is generated from the full cleaned text of every matching
    tweet. The original passed ``str(series)``, which is the truncated pandas
    display (index numbers, "...", "Name: tweet, dtype: object") rather than
    the tweets themselves.
    """
    tweets = data['tweet'][data["emotion"] == emotion]
    tweets = tweets.apply(format_input, args=(it_stop,))
    cloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words)
    # NOTE(review): generate() is expected to fail on empty text, so every
    # emotion below must have at least one tweet - confirm against the data.
    return tweets, cloud.generate(" ".join(tweets))


# Wordcloud with anger tweets
angry_tweets, anger_wordcloud = _emotion_wordcloud('anger')
# Wordcloud with sad tweets
sad_tweets, sad_wordcloud = _emotion_wordcloud('sadness')
# Wordcloud with joy tweets
joy_tweets, joy_wordcloud = _emotion_wordcloud('joy')
# Wordcloud with fear tweets
fear_tweets, fear_wordcloud = _emotion_wordcloud('fear')
## Combine all word clouds in a single 2x2 figure
wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
for _axis, _cloud, _title in (
    (ax1, sad_wordcloud, 'Sadness'),
    (ax2, joy_wordcloud, 'Joy'),
    (ax3, fear_wordcloud, 'Fear'),
    (ax4, anger_wordcloud, 'Anger'),
):
    _axis.imshow(_cloud, interpolation="bilinear")
    _axis.axis("off")
    _axis.set_title(_title, {'fontsize': 30})
# Lay out only after images and titles exist, so the spacing accounts for
# them (the original called this on the still-empty axes).
wc_fig.tight_layout()
# Line plot: number of tweets per date, one line per emotion.
number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()
number_tweets_per_day["tweet_date"] = pd.to_datetime(number_tweets_per_day["date"])
# Order rows by the parsed date so each line is drawn left to right even if
# the raw 'date' values do not sort chronologically; the stable sort keeps
# the emotion order within a date.
number_tweets_per_day = number_tweets_per_day.sort_values("tweet_date", kind="stable")
time_fig = px.line(number_tweets_per_day, x='tweet_date', y='id', labels={'id': 'count'}, color='emotion',
                   color_discrete_sequence=px.colors.qualitative.G10)

# Pie chart: share of tweets within each emotion.
sentiment_counts = data.groupby('emotion').agg({'id': 'size'}).reset_index()
sentiment_counts = sentiment_counts.rename(columns={'id': 'count'})
sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Tweets within each emotion',
                  color_discrete_sequence=px.colors.qualitative.G10)
def display_plot(image_choice):
    """Return the pre-built figure matching the selected radio option.

    Recognized options are 'Sentiment distribution', 'Time series' and
    'Word clouds'; any other value yields ``None``.
    """
    if image_choice == 'Sentiment distribution':
        return sent_fig
    if image_choice == 'Time series':
        return time_fig
    if image_choice == 'Word clouds':
        return wc_fig
    return None
def display_freq_plot(choice, *args):
    """Plot the per-period frequency of the words typed by the user.

    Args:
        choice: "TFIDF" or "Whole_text", selecting which frequency tables
            are used.
        *args: the raw words/phrases from the text boxes.

    Returns:
        The figure produced by ``plot_time_series``, or ``None`` when
        ``choice`` is not one of the two known methods.
    """
    if choice == "TFIDF":
        series = ser_TFIDF
    elif choice == "Whole_text":
        series = ser_whole_text
    else:
        return None

    # Ignore empty text boxes: they would only add a flat, unlabeled line.
    user_keys = [key for key in args if key and key.strip()]
    # clean input strings to match keywords in the database
    keyword = [format_input(key, it_stop) for key in user_keys]
    return plot_time_series(series, keyword, user_keys)
def _format_topic_distribution(item):
    """Render ``(topic_index, weight)`` pairs as the HTML shown under the dropdown.

    Topic indices are shown 1-based, e.g. ``(1, 0.21), (2, 0.13), ...``.
    """
    pairs = ', '.join(f'({topic + 1}, {weight})' for topic, weight in item)
    return f'<html><body><h3>Topics Distribution</h3>{pairs}</body></html>'


def _topic_distribution_update(distributions, tweet_index):
    """Build the update for the HTML output from a dropdown index.

    ``gr.update`` is used instead of ``gr.HTML.update``, which newer Gradio
    releases no longer provide.
    """
    if tweet_index is None:
        # Dropdown cleared: nothing to show.
        return gr.update(value="", visible=False)
    if not 0 <= tweet_index < len(distributions):
        # A tweet without a stored distribution must not crash the callback.
        return gr.update(
            value='<html><body><h3>Topics Distribution</h3>'
                  'No topic distribution is available for this tweet.</body></html>',
            visible=True,
        )
    return gr.update(value=_format_topic_distribution(distributions[tweet_index]), visible=True)


def display_output(tweet_index):
    """Show the topic distribution of the selected example tweet (first period tab)."""
    return _topic_distribution_update(topic_dist_list, tweet_index)


def display_output_Q2_Q4(tweet_index):
    """Show the topic distribution of the selected example tweet (second period tab)."""
    return _topic_distribution_update(topic_dist_list_Q2_Q4, tweet_index)
# with gr.Blocks() as demo: | |
# gr.Markdown("## Choose your adventure") | |
# with gr.Tabs(): | |
# with gr.TabItem("Topic modeling"): | |
# gr.Markdown("Nothing here yet") | |
# with gr.TabItem("Word frequency"): | |
# inputs = [gr.Radio(choices = ['TFIDF', 'Whole_text'], label = 'Choose ur method'), | |
# gr.Textbox(label = 'word 1'), | |
# gr.Textbox(label = 'word 2'), | |
# gr.Textbox(label = 'word 3'), | |
# gr.Textbox(label = 'word 4')] | |
# plot_output = gr.Plot(elem_id = 1) | |
# freq_button = gr.Button("Submit") | |
# with gr.TabItem("Sentiment analysis"): | |
# text_input = gr.Radio(choices = ['Sentiment distribution', 'Word clouds', 'Time series'], label = 'Choose ur plot') | |
# sent_plot = gr.Plot() | |
# sent_button = gr.Button("Submit") | |
# sent_button.click(display_plot, inputs=text_input, outputs= sent_plot) | |
# freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output) | |
# Gradio UI: three top-level tabs (topic modeling, word frequency, sentiment).
# NOTE(review): the original indentation was lost, so the nesting below is
# reconstructed from the statement order - in particular which components
# sit inside each gr.Row(); confirm against the deployed layout.
with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")
    with gr.Tabs():
        with gr.TabItem("Topic modeling"):
            gr.Markdown(
                '\n## <div style="text-align: center;">Topic modeling analysis on Twitter</div>\n'
            )
            with gr.Tabs():
                with gr.TabItem("July-September 2021"):
                    # Example tweets; topic_dist_list holds their topic
                    # distributions in the same order.
                    tweets_list = [
                        'C\'è uno studio a riguardo condotto proprio sui più giovani che identifica il long covid alla stregua di ogni strascico di malattie infettive polmonari. Il long covid è dannoso come una polmonite in quanto a effetti a lungo termine. Se lo ritrovo te lo passo, ora sono fuori...',
                        'Mio cugino è guarito dal covid dopo 4 mesi di ospedale, di cui più di 2 intubato, grazie alla testardaggine dei medici che hanno fatto di tutto per salvargli la vita a 57 anni. Ora è nella fase long covid per recuperare i danni fisici riportati',
                        'È importante parlare di #LongCovid e sensibilizzare tutti, giovani compresi, che non è un gioco ma una malattia debilitante/invalidante che può stravolgere la vita. Io 39 anni e #LongCovid da 18 mesi (con 4 figli piccoli). #countlongcovid',
                        'Il Long Covid è una diretta conseguenza di quelli che nei primi tempi sono stati abbandonati a se stessi giorni e giorni e curati solo quando molto aggravati, in ospedale. Se ti curi tempestivamente non hai nessuna conseguenza.',
                        'Non sai di cosa parli sono stato un mese attaccato ad un respiratore e sono salvo per miracolo. Ma questo è niente in confronto con il #LongCovid che mi porto dietro da mesi e mesi. Siete dei criminali a pensare ch\'è meglio curare che prevenire. Dei pazzi da rinchiudere',
                        'A chi dice ""Il COVID è innocuo per i bambini"". Oltre ad alcuni decessi 500+ bambini sono morti di COVID negli USA 2020) c\'è #LongCOVID. Se ne parla in questo studio: ""Studio inglese rileva che il COVID a lungo colpisce fino a 1 bambino su 7 mesi dopo l\'infezione',
                    ]
                    q1_data_topic_list = [
                        '0. Discussion about scientific studies',
                        '1. Anxiety about pandemic and the information about it OR Specific people in the context of LC',
                        '2. Discussion about LC impact in terms of time periods',
                        '3. Discussion about LC impact on patient life (impact on life so far or scope for lifelong impact)',
                        '4. Treatment scenario',
                        '5. Impact/Consequences of LC on children',
                    ]
                    topic_dist_list = [
                        [(0, 0.2181524), (1, 0.13380228), (2, 0.021277282), (3, 0.48123622), (4, 0.01883339), (5, 0.12669843)],
                        [(0, 0.0145399235), (1, 0.01287178), (2, 0.43158862), (3, 0.24750596), (4, 0.264914), (5, 0.028579665)],
                        [(0, 0.016303344), (1, 0.014450405), (2, 0.36162496), (3, 0.48426068), (4, 0.023487965), (5, 0.09987263)],
                        [(0, 0.018612841), (1, 0.016472807), (2, 0.44922927), (3, 0.033633586), (4, 0.026889767), (5, 0.45516175)],
                        [(0, 0.016305258), (1, 0.014453228), (2, 0.7628153), (3, 0.029092493), (4, 0.14613572), (5, 0.031198042)],
                        [(0, 0.016303508), (1, 0.014449066), (2, 0.15605325), (3, 0.029179793), (4, 0.023376595), (5, 0.7606378)],
                    ]
                    topics = (
                        '<html><body>'
                        '<h3><b>Topics July to Sept, 2021</b></h3>'
                        '<ol type="1">'
                        '<li>1. Discussion about scientific studies</li>'
                        '<li>2. Anxiety about pandemic and the information about it OR Specific people in the context of LC</li>'
                        '<li>3. Discussion about LC impact in terms of time periods</li>'
                        '<li>4. Discussion about LC impact on patient life (impact on life so far or scope for lifelong impact)</li>'
                        '<li>5. Treatment scenario</li>'
                        '<li>6. Impact/Consequences of LC on children</li>'
                        '</ol>'
                        '</body></html>'
                    )
                    with gr.Row():
                        gr.Image("./wordclouds_Q1 data.png", label="July-September 2021")
                        Q1_topics = gr.HTML(topics, visible=True)
                    gr.Markdown(
                        "\n### Test our topic modeling model : select a tweet and check the topics distribution !\n"
                    )
                    tweet = gr.Dropdown(tweets_list, label="Example tweets", interactive=True, type="index")
                    model_output = gr.HTML("", visible=False)
                    tweet.change(display_output, tweet, model_output)
                with gr.TabItem("October 2021-July 2022"):
                    topic_dist_list_Q2_Q4 = [
                        [(0, 0.4377157), (1, 0.05924045), (2, 0.1525337), (3, 0.1941842), (4, 0.075339705), (5, 0.08098622)],
                        [(0, 0.16064012), (1, 0.063850455), (2, 0.08664099), (3, 0.2870743), (4, 0.081202514), (5, 0.32059166)],
                        [(0, 0.14904374), (1, 0.059243646), (2, 0.08039133), (3, 0.26638654), (4, 0.07534457), (5, 0.36959016)],
                        [(0, 0.14897935), (1, 0.059245925), (2, 0.08039324), (3, 0.41068354), (4, 0.14752874), (5, 0.15316921)],
                        [(0, 0.089826144), (1, 0.069229595), (2, 0.09393969), (3, 0.5643193), (4, 0.08804329), (5, 0.09464199)],
                        [(0, 0.08284077), (1, 0.29718927), (2, 0.08663448), (3, 0.36485678), (4, 0.08119658), (5, 0.08728213)],
                    ]
                    Q2_Q4_topics = (
                        '<html><body>'
                        '<h3><b>Topics October 2021 to July 2022</b></h3>'
                        '<ol type="1">'
                        '<li>1. Variants</li>'
                        '<li>2. Vaccine side-effects (and general anti-vax/ anti-LC narrative)</li>'
                        '<li>3. Aftermath of LC or vaccine</li>'
                        '<li>4. Impact of LC in terms of time OR Risks/Symptoms of LC</li>'
                        '<li>5. Anger or anxiety about LC information</li>'
                        '<li>6. Discussion or Information about the science/knowledge surrounding LC</li>'
                        '</ol>'
                        '</body></html>'
                    )
                    with gr.Row():
                        # NOTE(review): "Q2-Q2" in the file name looks like a typo for
                        # "Q2-Q4"; left as-is because it must match the file on disk.
                        gr.Image("./wordclouds_Q2-Q2 data.png", label="October 2021-July 2022")
                        Q2_Q4_topics_html = gr.HTML(Q2_Q4_topics, visible=True)
                    tweet_list_Q2_Q4 = [
                        "Omicron e Long Covid: palpitazioni e perdita d'udito tra i sintomi - #Omicron #Covid: #palpitazioni ",
                        'Long Covid e trombosi. La correlazione è spiegata da Giovanni Esposito, Presidente GISE, in un articolo sul sito https://t.co/8TdI9nhDHY e avvalorata da uno studio svedese pubblicato sul British Medical Journal. https://t.co/UebaXUtfbz',
                        'Peccato che il ""long COVID"" che è proprio ciò di cui parla l\'esimio dottore citato determini una alterazione o soppressione del sistema immunitario di cui si sa ancora poco ma che può portare a conseguenze fatali per il paziente.',
                        'Il Long covid rappresentava un problema solo fino ad aprile 2021, i vaccini hanno molto ridotto l\'impatto e la gravità delle patologie a lungo termine, in pratica si può dire che il long covid non esiste più',
                        'Sicuro, 100-150 morti al giorno, 6 ondate l anno, rischio long covid, rischio evoluzionario, e via dicendo — finitissimo',
                        'le cure le fai giorno dopo giorno... ci sono casi di long-covid dopo 6 mesi dall\'infezione. [Vaccino > >Cure] è un dato di fatto',
                        'A parte il rischio di sviluppare il #longcovid, il pericolo grave di lasciar circolare il virus e di farlo diventare endemico come preconizza il governo e lo sciagurato #speranza non è nel decorso del singolo caso ma nell\'aumento proporzionale dell\'insorgere di nuove varianti',
                    ]
                    gr.Markdown(
                        "\n### Test our topic modeling model : select a tweet and check the topics distribution !\n"
                    )
                    # Offer only tweets that have a stored distribution: the list
                    # holds 7 tweets but only 6 distributions, and selecting the
                    # extra one would index past the end of topic_dist_list_Q2_Q4.
                    tweet_Q2_Q4 = gr.Dropdown(
                        tweet_list_Q2_Q4[:len(topic_dist_list_Q2_Q4)],
                        label="Example tweets",
                        interactive=True,
                        type="index",
                    )
                    model_output_Q2_Q4 = gr.HTML("", visible=False)
                    tweet_Q2_Q4.change(display_output_Q2_Q4, tweet_Q2_Q4, model_output_Q2_Q4)
        with gr.TabItem("Word frequency"):
            inputs = [
                gr.Radio(choices=['TFIDF', 'Whole_text'], label='Choose ur method'),
                gr.Textbox(label='word 1'),
                gr.Textbox(label='word 2'),
                gr.Textbox(label='word 3'),
            ]
            plot_output = gr.Plot()
            freq_button = gr.Button("Submit")
            freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)
            gr.Examples(
                examples=[
                    ['TFIDF', 'Stanchezza', "l'età", '#LongCovidKids'],
                    ['Whole_text', 'nebbia mentale', 'mal di testa', 'Ansia'],
                ],
                inputs=inputs,
            )
        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(choices=['Sentiment distribution', 'Word clouds', 'Time series'], label='Choose ur plot')
            sent_plot = gr.Plot()
            sent_button = gr.Button("Submit")
            sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)

demo.launch(debug=True, show_error=True)