gaetanbrison committed on
Commit
c717908
1 Parent(s): e4ad36d

Create app.py

Files changed (1)
  1. app.py +170 -0
app.py ADDED
@@ -0,0 +1,170 @@
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import seaborn as sns
+ from PIL import Image
+ import matplotlib.pyplot as plt
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LinearRegression
+ from sklearn import metrics
+
+
+ st.sidebar.header("Dashboard")
+ st.sidebar.markdown("---")
+ app_mode = st.sidebar.selectbox('Select Page', ['Introduction', 'Visualization', 'Prediction'])
+
+ df = pd.read_csv("transactions_dataset.csv")
+ tech_df = df.loc[df['sector'] == 'TECH']
+
+
+
+ if app_mode == "Introduction":
+
+     st.title("Introduction")
+     st.markdown("### Welcome to our ESG rankings Dashboard!")
+
+     #st.image("veh.jpeg", use_column_width=True)
+
+     st.markdown("#### Wondering what ESG rankings are relative to Investments?")
+     st.markdown("Our company is a health insurance company that is looking to improve its revenue model by expanding into a new sector: Vehicle Insurance.")
+     st.markdown("##### Objectives")
+     st.markdown("- Use other variables that contribute to investment over the years")
+     st.markdown("- Points that can be made: ESG growth over the years; correlation with investment and social pressures")
+     st.markdown("- Does an increase in ESG lead to an increase in investment?")
+
+     num = st.number_input('No. of Rows', 5, 10)
+
+     head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
+     if head == 'Head':
+         st.dataframe(df.head(num))
+     else:
+         st.dataframe(df.tail(num))
+
+     st.text('(Rows, Columns)')
+     st.write(df.shape)
+
+     st.markdown("##### Key Variables")
+
+     st.dataframe(df.describe())
+
+     st.markdown("### Missing Values")
+     st.markdown("Null or NaN values.")
+
+     dfnull = df.isnull().sum() / len(df) * 100
+     totalmiss = round(df.isnull().sum().sum() / df.size * 100, 2)
+     st.write("Percentage of total missing values:", totalmiss)
+     st.write(dfnull)
+     if totalmiss <= 30:
+         st.success("We have less than 30 percent missing values, which is good. The null values will not significantly affect our conclusions or bias the results towards misleading outcomes.")
+     else:
+         st.warning("Poor data quality: more than 30 percent of the values are missing.")
+     st.markdown(" > Theoretically, 25 to 30 percent missing values is the usual upper limit, but there is no hard and fast rule for this threshold; it can vary from problem to problem.")
+
+     st.markdown("### Completeness")
+     st.markdown("The ratio of non-missing values to total values in the dataset, i.e. how comprehensive the data is.")
+
+     st.write("Total data length:", len(df))
+     nonmissing = df.notnull().sum()
+     completeness = round(nonmissing.sum() / df.size, 2)
+
+     st.write("Completeness ratio:", completeness)
+     st.write(nonmissing)
+     if completeness >= 0.80:
+         st.success("We have a completeness ratio of at least 0.80, which is good. It shows that the vast majority of the data is available for us to use and analyze.")
+     else:
+         st.warning("Poor data quality due to a low completeness ratio (less than 0.80).")
+
+ elif app_mode == "Visualization":
+     st.title("Visualization")
+
+     # DATA VISUALISATION
+     tab1, tab2, tab3, tab4 = st.tabs(["SNS Plot", "Bar Chart", "Line Chart", "Pie Plot"])
+
+     # SNS plot
+     tab1.subheader("SNS plot")
+     tech_df = tech_df.sample(n=min(10000, len(tech_df)))  # sample at most 10,000 rows so the pairplot stays responsive
+     fig = sns.pairplot(tech_df)
+     tab1.pyplot(fig)
+
+     # Bar Graph
+     # User input for x-variable
+     columns = ['Region_Code', 'Gender', 'Vehicle_Age']
+     x_variable = tab2.selectbox("Select x-variable:", columns)
+     tab2.subheader(f"{x_variable} vs Price (INR)")
+     #data_by_variable = df.groupby(x_variable)['Annual_Premium'].mean()
+     #tab2.bar_chart(data_by_variable)
+
+     # Line Graph
+     tab3.subheader("Age vs Price")
+     #age_by_price = df.groupby('Age')['Annual_Premium'].mean()
+     #tab3.line_chart(age_by_price)
+
+     '''
+     tab4.subheader("Pie plot")
+     tab4.subheader("Response distribution by Vehicle Damage")
+     response_counts = df.groupby(['Vehicle_Damage', 'Response']).size().unstack(fill_value=0)
+     fig, ax = plt.subplots()
+     colors = ['#ff9999','#66b3ff']
+     damage_counts = response_counts.loc[1]
+     percentages = (damage_counts.values / damage_counts.sum()) * 100
+     labels = ['Yes', 'No']
+     ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
+     ax.axis('equal')
+     tab4.pyplot(fig)
+
+     #Pie Plot2
+     tab4.subheader("Response Distribution by Not Previously Insured")
+     response_counts = df.groupby(['Previously_Insured', 'Response']).size().unstack(fill_value=0)
+     fig, ax = plt.subplots()
+     colors = ['#ff9999','#66b3ff']
+     prev_insurance_counts = response_counts.loc[0]
+     percentages = (prev_insurance_counts.values / prev_insurance_counts.sum()) * 100
+     labels = ['Yes', 'No']
+     ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
+     ax.axis('equal')
+     tab4.pyplot(fig)
+
+
+     tab1, tab2, tab3, tab4 = st.tabs(["SNS Plot", "Bar Chart", "Line Chart", "Pie Plot"])
+
+     fig = sns.pairplot(df)
+     tab1.pyplot(fig)
+     '''
+
+ elif app_mode == "Prediction":
+     st.markdown("Prediction")
+
+     '''
+     # Changing "Yes" and "No" to 1 and 0
+     df.loc[df['Vehicle_Damage'] == "Yes", 'Vehicle_Damage'] = 1
+     df.loc[df['Vehicle_Damage'] == "No", 'Vehicle_Damage'] = 0
+     st.title("Prediction")
+     X = df[['Age', 'Region_Code', 'Driving_License', 'Vehicle_Damage', 'Previously_Insured']]
+     y = df['Annual_Premium']
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+     lin_reg = LinearRegression()
+     lin_reg.fit(X_train, y_train)
+     pred = lin_reg.predict(X_test)
+
+     plt.figure(figsize=(10, 7))
+     plt.title("Actual vs. predicted Annual Premiums", fontsize=25)
+     plt.xlabel("Actual test set Annual Premiums", fontsize=18)
+     plt.ylabel("Predicted Annual Premiums", fontsize=18)
+     plt.scatter(x=y_test, y=pred)
+     plt.savefig('prediction.png')
+     st.image('prediction.png')
+
+     # Model Evaluation
+     st.markdown("Evaluation")
+     coeff_df = pd.DataFrame(lin_reg.coef_, X.columns, columns=['Coefficient'])
+     st.dataframe(coeff_df)
+     MAE = metrics.mean_absolute_error(y_test, pred)
+     MSE = metrics.mean_squared_error(y_test, pred)
+     RMSE = np.sqrt(metrics.mean_squared_error(y_test, pred))
+     st.write('MAE:', MAE)
+     st.write('MSE:', MSE)
+     st.write('RMSE:', RMSE)
+     '''