"""Dash dashboard showing how news-headline sentiment evolves over time.

The module downloads a CSV of classified headlines, prepares it once at import
time, declares the page layout and registers a single callback that redraws
every chart and the headline table whenever the topic, the news outlet or the
date range changes.
"""
import datetime
from datetime import date
from io import StringIO

import dash
import dash_bootstrap_components as dbc
import pandas as pd
import plotly.express as px
import requests
from dash import Dash, html, dcc, callback, Output, Input
from dash import dash_table
from dateutil.parser import parse

from app import app

# Sentiment classes in display order, and the colour used for each of them.
SENTIMENT_LABELS = ['positivo', 'neutro', 'negativo']
LABEL_COLORS = {'positivo': '#039a4d', 'neutro': '#3c03f4', 'negativo': '#ca3919'}
# Colour for any label outside LABEL_COLORS (e.g. 'nan' produced by astype(str)).
FALLBACK_COLOR = '#7f7f7f'

# Display copy of `domain_folder_name`; doubles as the legend title in charts.
OUTLET_COLUMN = "Veículos de notícias"

# Plotly renders <br> as a line break in titles and tick labels.
TICK_FORMAT = "%b %d<br>%Y"
SENTIMENT_LINE_TITLE = (
    "O gráfico mostra a evolução temporal de sentimento dos títulos de notícias"
    "<br>Numa escala de -1 (negativo) a 1 (positivo), sendo 0 (neutro)"
)


def convert_to_datetime(input_str, parserinfo=None):
    """Parse a date string with dateutil and return the resulting datetime."""
    return parse(input_str, parserinfo=parserinfo)


server = app.server

# Turn the Google Drive "share" link into a direct-download link.
SHARE_URL = 'https://drive.google.com/file/d/1NaXOYHQFF5UO5rQr4rn8Lr3bkYMSOq4_/view?usp=sharing'
url = 'https://drive.google.com/uc?id=' + SHARE_URL.split('/')[-2]

# --- data preparation (runs once, at import time) ---------------------------
df = pd.read_csv(url)
# Drop the "Aborto" topic and keep only headlines with more than 4 tokens.
df = df[df["Topic"] != "Aborto"]
# .copy() so the column assignments below write to an independent frame
# instead of a slice of the frame returned by read_csv.
df = df[df['Headline'].str.split().str.len().gt(4)].copy()
df['date'] = pd.to_datetime(df['date'])

unique_domains = df['domain_folder_name'].unique()
unique_topics = df['Topic'].unique()

df[OUTLET_COLUMN] = df["domain_folder_name"]

# Map the numeric class codes to their Portuguese names. Plain assignment is
# used because an in-place replace on a selected column does not reliably
# write back to the frame.
df['FinBERT_label'] = df['FinBERT_label'].astype(str).replace({
    '3.0': 'positivo',
    '2.0': 'neutro',
    '1.0': 'negativo',
})

# Headlines per (day, topic, outlet, label).
counts = (
    df.groupby(['date', 'Topic', 'domain_folder_name', 'FinBERT_label'])
    .size()
    .reset_index(name='count')
)
counts['count'] = counts['count'].astype('float64')
# NOTE(review): this window slides over the whole table in row order, not per
# topic/outlet/label group - confirm that is intended.
counts['rolling_mean_counts'] = counts['count'].rolling(window=30, min_periods=2).mean()

df_pos = counts[counts['FinBERT_label'] == 'positivo']
df_neu = counts[counts['FinBERT_label'] == 'neutro']
df_neg = counts[counts['FinBERT_label'] == 'negativo']


# --- layout -----------------------------------------------------------------
def _label_row(text):
    """Return a row holding one bold form label."""
    return dbc.Row([dbc.Label(text, className="fw-bold")])


def _graph_row(graph_id):
    """Return a row holding one empty graph that the callback fills in."""
    return dbc.Row([dbc.Col(dcc.Graph(id=graph_id))])


def _dropdown_row(dropdown_id, choices, initial):
    """Return a row holding a half-width dropdown listing `choices`."""
    return dbc.Row([
        dbc.Col(
            dcc.Dropdown(
                id=dropdown_id,
                options=[{"label": choice, "value": choice} for choice in choices],
                value=initial,
                style={"width": "50%"},
            )
        )
    ])


_first_day = df['date'].min().date()
_last_day = df['date'].max().date()

app.layout = dbc.Container([
    dbc.Row([
        dbc.Col(
            [html.H1('Evolução temporal de sentimento em títulos de notícias')],
            className="text-center mt-3 mb-1",
        )
    ]),
    dbc.Row([
        dbc.Col(
            [dcc.Markdown(
                '## [Sobre o projeto](https://github.com/caiocmello/SentDiario)',
                link_target="_blank",
            )],
            className="text-center mt-3 mb-1",
        )
    ]),
    _label_row("Selecione um período (mm/dd/aaaa):"),
    dbc.Row([
        dcc.DatePickerRange(
            id='date-range',
            min_date_allowed=_first_day,
            max_date_allowed=_last_day,
            initial_visible_month=_first_day,
            start_date=_first_day,
            end_date=_last_day,
        )
    ]),
    _label_row("Escolha um tópico:"),
    _dropdown_row("topic-selector", unique_topics, "Imigrantes"),
    _graph_row('line-graph-1'),
    _graph_row("bar-graph-1"),
    _graph_row('pie-graph-1'),
    _label_row("Escolha um site de notícias:"),
    _dropdown_row("domain-selector", unique_domains, "expresso-pt"),
    _graph_row('line-graph-2'),
    _graph_row('pie-graph-2'),
    _label_row('Lista de notícias encontradas para o tópico e meio de comunicação selecionados'),
    dbc.Row([
        dbc.Col(
            dash_table.DataTable(
                id='headlines-table',
                style_as_list_view=True,
                columns=[
                    {"name": "Título", "id": "link", "presentation": "markdown"},
                    {"name": "Date", "id": "date", "type": "datetime"},
                    {"name": "Etiqueta de sentimento", "id": "FinBERT_label"},
                ],
                style_table={'overflowX': 'auto'},
                style_cell={
                    'textAlign': 'left',
                    'width': '180px',
                    'maxWidth': '180px',
                },
                page_action="native",
                page_current=0,
                page_size=10,
            )
        )
    ]),
])


# --- figure builders --------------------------------------------------------
def _sentiment_line_figure(frame):
    """Plot the `normalised results` score over time, one line per outlet."""
    fig = px.line(
        frame,
        x="date",
        y="normalised results",
        color=OUTLET_COLUMN,
        title=SENTIMENT_LINE_TITLE,
    )
    fig.update_layout(
        xaxis_title='Data',
        yaxis_title='Classificação de Sentimento',
        title_x=0.5,
    )
    fig.update_xaxes(tickformat=TICK_FORMAT)
    return fig


def _monthly_volume_figure(frame):
    """Plot the number of headlines per calendar month and outlet as bars."""
    period = frame['date'].dt.to_period('M').dt.to_timestamp()
    per_month = frame.assign(period=period).groupby(['period', OUTLET_COLUMN]).size()

    # Build the month axis from the truncated periods rather than the raw
    # dates: a month-start range that begins at the raw earliest date skips
    # the first month whenever that date is not the 1st, which silently
    # dropped that month's headlines from the chart.
    months = pd.date_range(start=period.min(), end=period.max(), freq='MS')
    every_cell = pd.MultiIndex.from_product(
        [months, frame[OUTLET_COLUMN].unique()],
        names=['period', OUTLET_COLUMN],
    )
    # Months in which an outlet published nothing become explicit zeros.
    grouped = per_month.reindex(every_cell, fill_value=0).reset_index(name='occurrences')

    fig = px.bar(
        grouped,
        x='period',
        y='occurrences',
        color=OUTLET_COLUMN,
        labels={
            'period': 'Período',
            'occurrences': 'Número de notícias',
            OUTLET_COLUMN: 'Portal',
        },
        title='Número de notícias por período de tempo',
    )
    fig.update_layout(title_x=0.5)
    return fig


def _daily_label_figure(selected_topic, selected_domain, start_date, end_date):
    """Plot daily headline counts per sentiment label for one topic and outlet."""
    keys = ['FinBERT_label', 'Topic', 'domain_folder_name', 'date']
    selection = counts[
        (counts['Topic'] == selected_topic)
        & (counts['domain_folder_name'] == selected_domain)
        & (counts['date'] >= start_date)
        & (counts['date'] <= end_date)
    ]

    # One row per (label, day) so that days without headlines plot as zero.
    days = pd.date_range(start=start_date, end=end_date)
    grid = pd.MultiIndex.from_product(
        [SENTIMENT_LABELS, [selected_topic], [selected_domain], days],
        names=keys,
    ).to_frame(index=False)
    merged = grid.merge(selection, on=keys, how='left')
    merged = merged.fillna({'count': 0, 'rolling_mean_counts': 0})

    fig = px.line(
        merged,
        x="date",
        y="count",
        color="FinBERT_label",
        line_group="FinBERT_label",
        title="Sentimento ao longo do tempo",
        labels={"count": "Número de notícias", "date": "Date"},
        # Map colours by label so they do not depend on row order.
        color_discrete_map=LABEL_COLORS,
    )
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Número de artigos de notícias',
        xaxis_tickformat=TICK_FORMAT,
        legend_title="Etiqueta de sentimento",
        title_x=0.5,
    )
    return fig


def _label_share_pie(labels, title):
    """Plot the percentage of each sentiment label found in `labels`."""
    shares = labels.value_counts()
    shares = (shares / shares.sum()) * 100
    slice_colors = [LABEL_COLORS.get(label, FALLBACK_COLOR) for label in shares.index]
    fig = px.pie(
        values=shares,
        names=shares.index,
        title=title,
        # An empty selection has no slices; let Plotly use its defaults then.
        color_discrete_sequence=slice_colors or None,
    )
    fig.update_layout(title_x=0.5)
    return fig


def _headline_records(frame):
    """Return the table rows for `frame`, sorted by date and then sentiment."""
    table = frame.assign(
        # Ordered categorical so ties on date sort positivo < neutro < negativo.
        FinBERT_label=pd.Categorical(
            frame['FinBERT_label'], categories=SENTIMENT_LABELS, ordered=True
        ),
        # Markdown link "[headline](url)". Built column-wise because a
        # row-wise apply on an empty frame does not return a single column.
        link="[" + frame["Headline"].astype(str) + "](" + frame["url"].astype(str) + ")",
    ).sort_values(by=['date', 'FinBERT_label'])
    table['date'] = pd.to_datetime(table['date']).dt.strftime('%m-%d-%Y')
    return table.to_dict('records')


def _empty_figure():
    """Return a figure with no traces."""
    return {'data': []}


@app.callback(
    Output('line-graph-1', 'figure'),
    Output('bar-graph-1', 'figure'),
    Output('pie-graph-1', 'figure'),
    Output('line-graph-2', 'figure'),
    Output('pie-graph-2', 'figure'),
    Output('headlines-table', 'data'),
    Input("topic-selector", "value"),
    Input("domain-selector", "value"),
    Input('date-range', 'start_date'),
    Input('date-range', 'end_date'),
)
def update_output(selected_topic, selected_domain, start_date, end_date):
    """Rebuild every chart and the headline table for the current selection.

    Args:
        selected_topic: Value of the topic dropdown.
        selected_domain: Value of the news-outlet dropdown.
        start_date: Start of the date picker range as a date string, or None
            when the field is empty.
        end_date: End of the date picker range as a date string, or None when
            the field is empty.

    Returns:
        A 6-tuple in callback output order: sentiment line figure, monthly bar
        figure, overall pie figure, per-label line figure, outlet pie figure
        and the list of table rows. When nothing matches the selection the
        figures are empty and the row list is ``[]``.
    """
    print("topic:", selected_topic, "domain:", selected_domain,
          "start:", start_date, "end:", end_date, "\n\n")

    # Confine the requested range to the dates the topic actually covers.
    topic_dates = df.loc[df["Topic"] == selected_topic, "date"]
    min_topic_date = topic_dates.min()
    max_topic_date = topic_dates.max()
    print("min", min_topic_date, "max", max_topic_date)

    # A cleared date field arrives as None; use the topic's own bound then.
    start_date = convert_to_datetime(start_date) if start_date else min_topic_date
    end_date = convert_to_datetime(end_date) if end_date else max_topic_date
    if min_topic_date > start_date:
        start_date = min_topic_date
    if max_topic_date < end_date:
        end_date = max_topic_date
    print("After: Sd", start_date, "Ed", end_date)

    in_selection = (
        (df["Topic"] == selected_topic)
        & (df['date'] >= start_date)
        & (df['date'] <= end_date)
    )
    df_filtered = df.loc[in_selection]

    if df_filtered.empty:
        # The table's data property takes a list of rows, not a figure dict.
        return (
            _empty_figure(),
            _empty_figure(),
            _empty_figure(),
            _empty_figure(),
            _empty_figure(),
            [],
        )

    media_df = df_filtered[df_filtered[OUTLET_COLUMN] == selected_domain]

    return (
        _sentiment_line_figure(df_filtered),
        _monthly_volume_figure(df_filtered),
        _label_share_pie(df_filtered['FinBERT_label'], 'Distribuição Geral'),
        _daily_label_figure(selected_topic, selected_domain, start_date, end_date),
        _label_share_pie(media_df['FinBERT_label'], f'Distribuição para {selected_domain}'),
        _headline_records(media_df),
    )


if __name__ == '__main__':
    app.run_server(debug=True)