File size: 8,339 Bytes
571d313
 
 
 
 
 
 
 
102b824
571d313
 
 
 
 
 
 
102b824
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571d313
 
 
102b824
 
571d313
 
102b824
571d313
 
102b824
 
571d313
 
102b824
571d313
 
102b824
 
571d313
 
 
 
 
102b824
 
571d313
 
102b824
571d313
 
 
102b824
 
571d313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102b824
571d313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102b824
 
 
571d313
102b824
 
 
 
 
 
 
 
 
 
 
 
571d313
 
102b824
571d313
102b824
571d313
 
102b824
 
 
 
 
 
 
 
 
 
571d313
102b824
 
 
 
f861c07
102b824
571d313
102b824
 
 
9943afd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import gradio as gr 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
from stop_words import get_stop_words
from wordcloud import WordCloud
from datasets import load_dataset
import re

## Import data

# Tweets: fetch the dataset with `load_dataset` and keep its "train" split
# as a DataFrame.
dataset = load_dataset("Santarabantoosoo/italian_long_covid_tweets")
data = pd.DataFrame.from_dict(dataset["train"])


# Load stop words

# Italian stop words: same loading pattern, then reduce the frame to the
# plain Python list held in its "text" column.
it_stop_words = load_dataset("Santarabantoosoo/italian-stopwords")
it_stop = pd.DataFrame.from_dict(it_stop_words["train"])["text"].to_list()

## Optimize stop words according to Luca's repo

def format_input(user_key, stopwords):
    """Normalize a string so it can be looked up in the frequency tables.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and tokens found in
    ``stopwords`` are dropped. The surviving tokens are re-joined with
    single spaces.

    Args:
        user_key: The raw string to clean.
        stopwords: Collection of strings to remove; they are compared
            against the already lower-cased tokens.

    Returns:
        The cleaned string; empty when no token survives.
    """
    # Hash the stop words once: testing membership in a list costs O(n) for
    # every single token.
    if isinstance(stopwords, (set, frozenset)):
        blocked = stopwords
    else:
        blocked = set(stopwords)

    tokens = re.sub(r'[^\w\s]', ' ', user_key.lower()).split()
    return ' '.join(token for token in tokens if token not in blocked)


### Loading the TFIDF and whole-text frequency tables

def _load_train_frame(repo_id):
    """Fetch ``repo_id`` with ``load_dataset`` and return its "train" split as a DataFrame."""
    return pd.DataFrame.from_dict(load_dataset(repo_id)["train"])


TFIDF_21_Jul_Oct = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Jul_Oct")
TFIDF_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_21_Nov_22_Jan")
TFIDF_22_Feb_Apr = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_Feb_Apr")
TFIDF_22_May_Jul = _load_train_frame("Santarabantoosoo/Long_Covid_word_frequency_TFIDF_22_May_Jul")

whole_text_21_Jul_Oct = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Jul_Oct")
whole_text_21_Nov_22_Jan = _load_train_frame("Santarabantoosoo/whole_text_TF_21_Nov_22_Jan")
whole_text_22_Feb_Apr = _load_train_frame("Santarabantoosoo/whole_text_TF_22_Feb_Apr")
whole_text_22_May_Jul = _load_train_frame("Santarabantoosoo/whole_text_TF_22_May_Jul")

# One Series per period: row 0 of each table, indexed by the table's column
# names. The lists are consumed positionally (first entry = first tick on the
# time axis), so they MUST follow the chronological order of the periods
# encoded in the table names: Jul-Oct 2021, Nov 2021-Jan 2022, Feb-Apr 2022,
# May-Jul 2022. Previously the Nov-Jan table was appended last, which put it
# after May-Jul on the time axis.
ser_TFIDF = [
    TFIDF_21_Jul_Oct.transpose()[0],
    TFIDF_21_Nov_22_Jan.transpose()[0],
    TFIDF_22_Feb_Apr.transpose()[0],
    TFIDF_22_May_Jul.transpose()[0],
]

ser_whole_text = [
    whole_text_21_Jul_Oct.transpose()[0],
    whole_text_21_Nov_22_Jan.transpose()[0],
    whole_text_22_Feb_Apr.transpose()[0],
    whole_text_22_May_Jul.transpose()[0],
]


def _keyword_frequencies(choice, word):
    """Return the value stored under ``word`` in every period of ``choice`` (0.0 where absent)."""
    freqs = []
    for period in choice:
        try:
            freqs.append(period[word])
        except LookupError:
            # The word does not occur in this period's table
            freqs.append(0.0)
    return np.array(freqs)


def plot_time_series(choice, keyword, user_keys, user_choice=None):
    """Plot the per-period frequency of each keyword as a line chart.

    Args:
        choice: Sequence with one mapping-like object per period (this file
            passes lists of pandas Series). Each one is indexed with a
            keyword to read that keyword's frequency.
        keyword: Cleaned keywords used for the lookups.
        user_keys: The strings originally typed by the user, parallel to
            ``keyword``; their lower-cased form labels the lines.
        user_choice: Optional name of the frequency source, shown in the
            y-axis label. The original body read an undefined ``user_choice``
            name here and raised ``NameError`` on every call.

    Returns:
        The matplotlib figure holding the chart.
    """
    n_periods = len(choice)

    # Evenly spaced x positions 2, 4, 6, ... and tick labels Q1, Q2, ...,
    # one per period (identical to the former hard-coded values for 4 periods)
    x = np.arange(2, 2 * n_periods + 2, 2)
    x_ticks_labels = [f'Q{q}' for q in range(1, n_periods + 1)]

    fig, ax = plt.subplots(1, 1)

    for word, label in zip(keyword, user_keys):
        ax.plot(x, _keyword_frequencies(choice, word), label=label.lower())

    # Set number of ticks for x-axis
    ax.set_xticks(x)
    ax.set_xticklabels(x_ticks_labels, fontsize=12)

    # Use the Axes API rather than pyplot's implicit "current figure" state
    ax.legend(loc='best')
    ax.set_xlabel('Time')
    ax.set_title("keywords quarterly analysis (July 2021 - July 2022)")
    if user_choice is None:
        ax.set_ylabel('Frequency')
    else:
        ax.set_ylabel(f'Freq. from {user_choice}')
    return fig


# Word clouds, one per emotion

# Extra tokens excluded from every cloud, in addition to the Italian stop words
stop_words = ["https", 'http', "co", "RT"] + list(it_stop)


def _emotion_tweets(emotion):
    """Return the tweets labelled with ``emotion``, each cleaned by ``format_input``."""
    return data['tweet'][data["emotion"] == emotion].apply(format_input, args=[it_stop])


def _emotion_wordcloud(tweets):
    """Build a word cloud from the text of every tweet in ``tweets``."""
    # Join the actual tweet texts. ``str(tweets)`` would hand WordCloud the
    # Series *repr* instead: a truncated preview of the rows mixed with index
    # numbers and the "Name: ..., dtype: ..." footer.
    # NOTE(review): assumes each emotion has at least one tweet with at least
    # one word left after cleaning - confirm against the dataset.
    corpus = " ".join(tweets)
    cloud = WordCloud(max_font_size=50, max_words=50, background_color="white", stopwords=stop_words)
    return cloud.generate(corpus)


# Wordcloud with anger tweets
angry_tweets = _emotion_tweets('anger')
anger_wordcloud = _emotion_wordcloud(angry_tweets)

# Wordcloud with sad tweets
sad_tweets = _emotion_tweets('sadness')
sad_wordcloud = _emotion_wordcloud(sad_tweets)

# Wordcloud with joy tweets
joy_tweets = _emotion_tweets('joy')
joy_wordcloud = _emotion_wordcloud(joy_tweets)

# Wordcloud with fear tweets
fear_tweets = _emotion_tweets('fear')
fear_wordcloud = _emotion_wordcloud(fear_tweets)

## Combine all word clouds in a single 2x2 figure

wc_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

wc_fig.tight_layout()

# (axes, cloud, title) for each panel: top row first, then bottom row
_wc_panels = (
    (ax1, sad_wordcloud, 'Sadness'),
    (ax2, joy_wordcloud, 'Joy'),
    (ax3, fear_wordcloud, 'Fear'),
    (ax4, anger_wordcloud, 'Anger'),
)

for _panel_ax, _panel_cloud, _panel_title in _wc_panels:
    # Draw the cloud as an image, hide the axis frame, add the emotion title
    _panel_ax.imshow(_panel_cloud, interpolation="bilinear")
    _panel_ax.axis("off")
    _panel_ax.set_title(_panel_title, {'fontsize': 30})


# Line plot: number of tweets per date, one line per emotion

number_tweets_per_day = data.groupby(['date', 'emotion']).agg({'id': 'count'}).reset_index()

number_tweets_per_day["tweet_date"] = pd.to_datetime(number_tweets_per_day["date"])

# px.line joins the points of a trace in row order, so order the rows by the
# parsed date. This is a no-op when the raw 'date' values already sort
# chronologically.
number_tweets_per_day = number_tweets_per_day.sort_values(['tweet_date', 'emotion'])

time_fig = px.line(number_tweets_per_day, x = 'tweet_date', y = 'id', labels = {'id': 'count'}, color = 'emotion', 
                  color_discrete_sequence=px.colors.qualitative.G10)

# Pie chart: number of tweets within each emotion

sentiment_counts = data.groupby('emotion').agg({'id' : 'size'}).reset_index()
sentiment_counts.rename(columns = {'id':'count'}, inplace = True)
sent_fig = px.pie(sentiment_counts, values='count', names='emotion', title='Tweets within each emotion', labels = {'id': 'count'}, 
                 color_discrete_sequence=px.colors.qualitative.G10)

def display_plot(image_choice):
    """Return the pre-built figure that matches the selected plot name.

    Args:
        image_choice: 'Sentiment distribution', 'Time series' or
            'Word clouds'.

    Returns:
        ``sent_fig``, ``time_fig`` or ``wc_fig`` respectively; ``None`` for
        any other value.
    """
    # Each entry pairs a plot name with a thunk, so a module-level figure is
    # only looked up once its name has matched.
    figure_table = (
        ('Sentiment distribution', lambda: sent_fig),
        ('Time series', lambda: time_fig),
        ('Word clouds', lambda: wc_fig),
    )

    for plot_name, fetch_figure in figure_table:
        if image_choice == plot_name:
            return fetch_figure()

    return None
    
def display_freq_plot(choice, *args):
    """Plot the frequency over time of the words supplied by the user.

    Args:
        choice: Name of the frequency source, "TFIDF" or "Whole_text".
        *args: The user's words, one string each. Blank or missing entries
            are ignored.

    Returns:
        The figure produced by ``plot_time_series``, or ``None`` when
        ``choice`` is neither of the two known sources.
    """
    # Skip empty inputs (presumably textboxes left blank): they cannot match
    # any word and would only add a line without a label.
    user_keys = [arg for arg in args if arg and arg.strip()]

    # clean input strings to match keywords in the database
    keyword = [format_input(key, it_stop) for key in user_keys]

    if choice == "TFIDF":
        series_per_period = ser_TFIDF
    elif choice == "Whole_text":
        series_per_period = ser_whole_text
    else:
        return None

    fig = plot_time_series(series_per_period, keyword, user_keys)

    # plot_time_series is handed the data but not the name of the selected
    # source, so the y-axis is labelled with it here.
    fig.axes[0].set_ylabel(f'Freq. from {choice}')
    return fig


# Gradio UI: one tab per analysis, then start the app
with gr.Blocks() as demo:
    gr.Markdown("## Choose your adventure")

    with gr.Tabs():

        with gr.TabItem("Topic modeling"):
            gr.Markdown("Nothing here yet")

        with gr.TabItem("Word frequency"):

            # Order matters: the values are passed positionally to
            # display_freq_plot(choice, *args)
            inputs = [gr.Radio(choices=['TFIDF', 'Whole_text'], label='Choose ur method'),
                      gr.Textbox(label='word 1'),
                      gr.Textbox(label='word 2'),
                      gr.Textbox(label='word 3'),
                      gr.Textbox(label='word 4')]
            # elem_id is expected to be a string (it was the int 1)
            plot_output = gr.Plot(elem_id="freq-plot")
            freq_button = gr.Button("Submit")

        freq_button.click(display_freq_plot, inputs=inputs, outputs=plot_output)

        with gr.TabItem("Sentiment analysis"):
            text_input = gr.Radio(choices=['Sentiment distribution', 'Word clouds', 'Time series'], label='Choose ur plot')
            sent_plot = gr.Plot()
            sent_button = gr.Button("Submit")

        sent_button.click(display_plot, inputs=text_input, outputs=sent_plot)


demo.launch(debug=True, show_error=True)