# Spaces: Sleeping
# (Hosting-page status text captured together with the source; it is not part
# of the program.)
import dash | |
from dash import Dash, html, dcc, callback, Output, Input | |
from dash import dash_table | |
import plotly.express as px | |
from app import app | |
import pandas as pd | |
import datetime | |
import requests | |
from io import StringIO | |
from datetime import date | |
import dash_bootstrap_components as dbc | |
import plotly.express as px | |
from dateutil.parser import parse | |
def convert_to_datetime(input_str, parserinfo=None):
    """Parse a date/time string with ``dateutil.parser.parse``.

    Args:
        input_str: Textual date/time to parse.
        parserinfo: Optional ``dateutil`` parser configuration, forwarded
            unchanged to ``parse``.

    Returns:
        Whatever ``dateutil.parser.parse`` returns for ``input_str``.
    """
    parsed = parse(input_str, parserinfo=parserinfo)
    return parsed
# WSGI server object exposed at module level.
server = app.server

# Turn the Google Drive "share" link into a direct-download link by reusing
# the file id (the second-to-last path segment of the share URL).
url = 'https://drive.google.com/file/d/1NaXOYHQFF5UO5rQr4rn8Lr3bkYMSOq4_/view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + url.split('/')[-2]

# reading of file
df = pd.read_csv(url)

# removing Aborto
df = df[df["Topic"] != "Aborto"]

# Keep only headlines with more than 4 whitespace-separated tokens.  The
# explicit copy makes `df` an independent frame, so the column assignments
# below do not write into a filtered view (SettingWithCopyWarning).
df = df[df['Headline'].str.split().str.len().gt(4)].copy()

df['date'] = pd.to_datetime(df['date'])

unique_domains = df['domain_folder_name'].unique()
unique_topics = df['Topic'].unique()

# Duplicate the outlet column under the display name used by the charts.
df["Veículos de notícias"] = df["domain_folder_name"]

# Translate the numeric sentiment codes into Portuguese labels.  The result is
# assigned back instead of calling `.replace(..., inplace=True)` on the column
# selection: that chained in-place form does not update `df` when pandas
# Copy-on-Write is active and is deprecated in pandas 2.x.
df['FinBERT_label'] = df['FinBERT_label'].astype(str).replace({
    '3.0': 'positivo',
    '2.0': 'neutro',
    '1.0': 'negativo',
})

# Number of headlines per (date, topic, outlet, sentiment label).
counts = (
    df.groupby(['date', 'Topic', 'domain_folder_name', 'FinBERT_label'])
    .size()
    .reset_index(name='count')
)
counts['count'] = counts['count'].astype('float64')

# NOTE(review): this rolling mean runs over the rows of `counts` in their
# grouped order, i.e. across different topics, outlets and labels, not within
# each series.  Confirm whether a per-group rolling mean was intended.
counts['rolling_mean_counts'] = counts['count'].rolling(window=30, min_periods=2).mean()

# Per-sentiment views of the counts table.
df_pos = counts[counts['FinBERT_label'] == 'positivo']
df_neu = counts[counts['FinBERT_label'] == 'neutro']
df_neg = counts[counts['FinBERT_label'] == 'negativo']
# app.layout | |
def _centered_row(component):
    """Wrap one component in a centred, single-column row."""
    return dbc.Row([
        dbc.Col([component], className="text-center mt-3 mb-1"),
    ])


def _label_row(text):
    """Build a row holding a single bold label."""
    return dbc.Row([
        dbc.Label(text, className="fw-bold"),
    ])


def _graph_row(graph_id):
    """Build a row holding one graph placeholder with the given id."""
    return dbc.Row([
        dbc.Col(dcc.Graph(id=graph_id)),
    ])


def _dropdown_row(dropdown_id, choices, initial_value):
    """Build a row with a half-width dropdown listing ``choices``."""
    selector = dcc.Dropdown(
        id=dropdown_id,
        options=[{"label": choice, "value": choice} for choice in choices],
        value=initial_value,  # Set the initial value
        style={"width": "50%"},
    )
    return dbc.Row([dbc.Col(selector)])


# Date bounds of the loaded dataset, used to configure the date picker.
_first_day = df['date'].min().date()
_last_day = df['date'].max().date()

_date_picker = dcc.DatePickerRange(
    id='date-range',
    min_date_allowed=_first_day,
    max_date_allowed=_last_day,
    initial_visible_month=_first_day,
    start_date=_first_day,
    end_date=_last_day,
)

# Paginated table of headlines; the "link" column is rendered as markdown.
_headlines_table = dash_table.DataTable(
    id='headlines-table',
    style_as_list_view=True,
    columns=[
        {"name": "Título", "id": "link", "presentation": "markdown"},
        {"name": "Date", "id": "date", "type": "datetime"},
        {"name": "Etiqueta de sentimento", "id": "FinBERT_label"},
    ],
    style_table={'overflowX': 'auto'},
    style_cell={'textAlign': 'left'},
    page_action="native",
    page_current=0,
    page_size=10,
)

# Page layout, top to bottom: title, project link, date range, topic selector
# with its three charts, outlet selector with its two charts, headline table.
app.layout = dbc.Container([
    _centered_row(html.H1('Evolução temporal de sentimento em títulos de notícias')),
    _centered_row(
        dcc.Markdown(
            '## [Sobre o projeto](https://github.com/caiocmello/SentDiario)',
            link_target="_blank",
        )
    ),
    _label_row("Selecione um período (mm/dd/aaaa):"),
    dbc.Row([_date_picker]),
    _label_row("Escolha um tópico:"),
    _dropdown_row("topic-selector", unique_topics, "Imigrantes"),
    _graph_row('line-graph-1'),
    _graph_row("bar-graph-1"),
    _graph_row('pie-graph-1'),
    _label_row("Escolha um site de notícias:"),
    _dropdown_row("domain-selector", unique_domains, "expresso-pt"),
    _graph_row('line-graph-2'),
    _graph_row('pie-graph-2'),
    _label_row('Lista de notícias encontradas para o tópico e meio de comunicação selecionados'),
    dbc.Row([dbc.Col(_headlines_table)]),
])
# # Create a function to generate pie charts | |
# def generate_pie_chart(category): | |
# labels = data[category]['labels'] | |
# values = data[category]['values'] | |
# trace = go.Pie(labels=labels, values=values) | |
# layout = go.Layout(title=f'Pie Chart - {category}') | |
# return dcc.Graph( | |
# figure={ | |
# 'data': [trace], | |
# 'layout': layout | |
# } | |
# ) | |
# Colours keyed by the Portuguese sentiment labels.  The insertion order is
# also the order in which the daily line chart lists the labels.
_SENTIMENT_COLORS = {'positivo': '#039a4d', 'neutro': '#3c03f4', 'negativo': '#ca3919'}

# Display name of the news-outlet column (a copy of `domain_folder_name`).
_OUTLET_COLUMN = 'Veículos de notícias'

# Key columns shared by the `counts` table and the full date/label grid.
_COUNT_KEYS = ['FinBERT_label', 'Topic', 'domain_folder_name', 'date']


def _empty_outputs():
    """Return placeholder values for all six callback outputs.

    The five figures are empty figure dicts; the table data is an empty
    list, because the DataTable ``data`` property is a list of row dicts.
    """
    return tuple({'data': []} for _ in range(5)) + ([],)


def _sentiment_over_time_figure(topic_df):
    """Line chart of the "normalised results" score per outlet over time."""
    fig = px.line(
        topic_df, x="date", y="normalised results",
        color=_OUTLET_COLUMN,
        title="O gráfico mostra a evolução temporal de sentimento dos títulos de notícias <br> Numa escala de -1 (negativo) a 1 (positivo), sendo 0 (neutro)",
    )
    fig.update_layout(
        xaxis_title='Data',
        yaxis_title='Classificação de Sentimento',
        title_x=0.5,
    )
    fig.update_xaxes(tickformat="%b %d<br>%Y")
    return fig


def _monthly_volume_figure(topic_df):
    """Stacked bar chart of the number of headlines per month and outlet."""
    # Work on a derived frame so the caller's data is never mutated.
    monthly = topic_df.assign(
        period=topic_df['date'].dt.to_period('M').dt.to_timestamp()
    )
    per_month = monthly.groupby(['period', _OUTLET_COLUMN]).size()

    # Build the month grid from the month-start periods themselves.  Starting
    # the range at the raw minimum date with freq='MS' would skip the first
    # month whenever the earliest article is not dated on the 1st.
    months = pd.date_range(
        start=monthly['period'].min(), end=monthly['period'].max(), freq='MS'
    )
    full_index = pd.MultiIndex.from_product(
        [months, monthly[_OUTLET_COLUMN].unique()],
        names=['period', _OUTLET_COLUMN],
    )
    # Months/outlets without any headline appear with an explicit zero.
    per_month = per_month.reindex(full_index, fill_value=0).reset_index(name='occurrences')

    fig = px.bar(
        per_month, x='period', y='occurrences', color=_OUTLET_COLUMN,
        labels={'period': 'Período', 'occurrences': 'Número de notícias', 'Veículos de notícias': 'Portal'},
        title='Número de notícias por período de tempo',
    )
    fig.update_layout(title_x=0.5)
    return fig


def _daily_sentiment_figure(selected_topic, selected_domain, start, end):
    """Line chart of daily headline counts per sentiment label for one outlet."""
    selection = counts[
        (counts['Topic'] == selected_topic)
        & (counts['domain_folder_name'] == selected_domain)
        & (counts['date'] >= start)
        & (counts['date'] <= end)
    ]
    # Every (label, day) combination in the period, so days without headlines
    # are plotted as zero instead of being skipped.
    grid = pd.MultiIndex.from_product(
        [list(_SENTIMENT_COLORS), [selected_topic], [selected_domain],
         pd.date_range(start=start, end=end)],
        names=_COUNT_KEYS,
    ).to_frame(index=False)
    merged = grid.merge(selection, on=_COUNT_KEYS, how='left')
    # Assign the filled columns back; `fillna(inplace=True)` on a column
    # selection is a chained mutation that may not reach the frame.
    filled_columns = ['count', 'rolling_mean_counts']
    merged[filled_columns] = merged[filled_columns].fillna(0)

    fig = px.line(
        merged, x="date", y="count", color="FinBERT_label",
        line_group="FinBERT_label", title="Sentimento ao longo do tempo",
        labels={"count": "Número de notícias", "date": "Date"},
        # Map colours by label rather than by position.
        color_discrete_map=_SENTIMENT_COLORS,
    )
    fig.update_layout(
        xaxis_title='Date', yaxis_title='Número de artigos de notícias',
        xaxis=dict(tickformat="%b %d<br>%Y"),
        legend_title="Etiqueta de sentimento", title_x=0.5,
    )
    return fig


def _sentiment_pie(labels, title):
    """Pie chart with the percentage share of each sentiment label."""
    label_counts = labels.value_counts()
    shares = (label_counts / label_counts.sum()) * 100
    pie_df = pd.DataFrame({
        'FinBERT_label': shares.index.astype(str),
        'percentage': shares.to_numpy(),
    })
    fig = px.pie(
        pie_df, values='percentage', names='FinBERT_label',
        color='FinBERT_label',
        # A map (instead of indexing the colour dict per label) keeps the
        # chart working when an unexpected label such as 'nan' shows up.
        color_discrete_map=_SENTIMENT_COLORS,
        title=title,
    )
    fig.update_layout(title_x=0.5)
    return fig


def _headline_records(outlet_df):
    """Return the DataTable rows (link, date, sentiment) for one outlet."""
    table = outlet_df.copy()
    # Ordered categorical so ties on the date sort positive -> negative.
    table['FinBERT_label'] = pd.Categorical(
        table['FinBERT_label'],
        categories=['positivo', 'neutro', 'negativo'],
        ordered=True,
    )
    # Vectorised markdown link.  A row-wise `apply` returns a DataFrame when
    # the frame is empty, which cannot be assigned to a single column.
    table['link'] = (
        "[" + table['Headline'].astype(str) + "](" + table['url'].astype(str) + ")"
    )
    table = table.sort_values(by=['date', 'FinBERT_label'])
    table['date'] = pd.to_datetime(table['date']).dt.strftime('%m-%d-%Y')
    # Only the columns the table declares are sent to the browser.
    return table[['link', 'date', 'FinBERT_label']].to_dict('records')


# Register the function as the single callback that refreshes every chart and
# the headline table whenever a selector or the date range changes.
@app.callback(
    Output('line-graph-1', 'figure'),
    Output('bar-graph-1', 'figure'),
    Output('pie-graph-1', 'figure'),
    Output('line-graph-2', 'figure'),
    Output('pie-graph-2', 'figure'),
    Output('headlines-table', 'data'),
    Input("topic-selector", "value"),
    Input("domain-selector", "value"),
    Input('date-range', 'start_date'),
    Input('date-range', 'end_date'),
)
def update_output(selected_topic, selected_domain, start_date, end_date):
    """Rebuild all figures and the headline table for the current selection.

    Args:
        selected_topic: Value of the topic dropdown.
        selected_domain: Value of the news-outlet dropdown.
        start_date: Start of the selected period (date string), or ``None``
            when no start date is set.
        end_date: End of the selected period (date string), or ``None`` when
            no end date is set.

    Returns:
        A 6-tuple ``(line_fig_1, bar_fig_1, pie_chart_1, line_fig_2,
        pie_chart_2, table_rows)``.  When nothing matches the selection, the
        figures are empty figure dicts and ``table_rows`` is an empty list.
    """
    # log
    print("topic:", selected_topic, "domain:", selected_domain, "start:", start_date, "end:", end_date, "\n\n")

    topic_dates = df.loc[df["Topic"] == selected_topic, "date"]
    if topic_dates.empty:
        return _empty_outputs()

    # Confine the requested period to the dates for which the topic has data.
    min_topic_date = topic_dates.min()
    max_topic_date = topic_dates.max()
    print("min", min_topic_date, "max", max_topic_date)

    # A missing bound falls back to the topic's own boundary.
    start = min_topic_date if start_date is None else max(pd.to_datetime(start_date), min_topic_date)
    end = max_topic_date if end_date is None else min(pd.to_datetime(end_date), max_topic_date)
    print("After: Sd", start, "Ed", end)

    in_period = (df["Topic"] == selected_topic) & (df['date'] >= start) & (df['date'] <= end)
    topic_df = df.loc[in_period]
    if topic_df.empty:
        return _empty_outputs()

    outlet_df = topic_df[topic_df[_OUTLET_COLUMN] == selected_domain]

    line_fig_1 = _sentiment_over_time_figure(topic_df)
    bar_fig_1 = _monthly_volume_figure(topic_df)
    pie_chart_1 = _sentiment_pie(topic_df['FinBERT_label'], 'Distribuição Geral')
    line_fig_2 = _daily_sentiment_figure(selected_topic, selected_domain, start, end)
    pie_chart_2 = _sentiment_pie(outlet_df['FinBERT_label'], f'Distribuição para {selected_domain}')

    return line_fig_1, bar_fig_1, pie_chart_1, line_fig_2, pie_chart_2, _headline_records(outlet_df)
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv') | |
# app.layout = html.Div([ | |
# html.H1(children='Title of Dash App', style={'textAlign':'center'}), | |
# dcc.Dropdown(df.country.unique(), 'Canada', id='dropdown-selection'), | |
# dcc.Graph(id='graph-content') | |
# ]) | |
# @callback( | |
# Output('graph-content', 'figure'), | |
# Input('dropdown-selection', 'value') | |
# ) | |
# def update_graph(value): | |
# dff = df[df.country==value] | |
# return px.line(dff, x='year', y='pop') | |
# # Define callback function for updating the headlines table | |
# @app.callback( | |
# Output('headlines-table', 'data'), | |
# Input("topic-selector", "value"), | |
# Input("domain-selector", "value"), | |
# Input('date-range', 'start_date'), | |
# Input('date-range', 'end_date') | |
# ) | |
# def update_headlines_table(selected_topic, selected_domain, start_date, end_date): | |
# # Filtering data... | |
# tab_content_2 = dcc.Markdown(''' | |
# # Sobre o projeto | |
# ''') | |
# app.layout = html.Div( | |
# [ | |
# dbc.Card( | |
# [ | |
# dbc.CardHeader( | |
# dbc.Tabs( | |
# [ | |
# dbc.Tab(label="SentDiário", tab_id="tab-1"), | |
# dbc.Tab(label="Sobre o projeto", tab_id="tab-2"), | |
# ], | |
# id="tabs", | |
# active_tab="tab-1", | |
# ) | |
# ), | |
# dbc.CardBody(html.Div(id="content", className="card-text")), | |
# ] | |
# ) | |
# ] | |
# ) | |
# @app.callback(Output("content", "children"), [Input("tabs", "active_tab")]) | |
# def switch_tab(at): | |
# if at == "tab-1": | |
# return tab_content_1 | |
# elif at == "tab-2": | |
# return tab_content_2 | |
# return html.P("This shouldn't ever be displayed...") | |
if __name__ == '__main__':
    # Start the development server when the file is executed directly.
    # `run_server` is the legacy entry point; prefer `run` when this Dash
    # version provides it and fall back to `run_server` otherwise.
    start_server = app.run if hasattr(app, "run") else app.run_server
    start_server(debug=True)