Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,10 +3,22 @@ import pandas as pd
|
|
3 |
import numpy as np
|
4 |
import seaborn as sns
|
5 |
from PIL import Image
|
|
|
|
|
6 |
import matplotlib.pyplot as plt
|
7 |
from sklearn.model_selection import train_test_split
|
8 |
from sklearn.linear_model import LinearRegression
|
|
|
|
|
9 |
from sklearn import metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
|
12 |
st.sidebar.header("Dashboard")
|
@@ -17,22 +29,42 @@ df = pd.read_csv("transactions_dataset.csv")
|
|
17 |
tech_df = df.loc[df['sector'] == 'TECH']
|
18 |
|
19 |
|
20 |
-
|
21 |
if app_mode == "Introduction":
|
22 |
|
23 |
st.title("Introduction")
|
24 |
st.markdown("### Welcome to our ESG rankings Dashboard!")
|
25 |
|
26 |
-
|
27 |
|
28 |
-
st.markdown("#### Wondering what is ESG rankings relative to Investments")
|
29 |
-
st.markdown("Our company is a Health insurance company who is looking to improve their revenue model by expanding into a new sector: Vehicle Insurance.")
|
30 |
-
st.markdown("##### Objectives")
|
31 |
-
st.markdown("- Using other variables that contribute to investment over the years")
|
32 |
-
st.markdown("- Points that can be made: ESG growth over the years; correlation w Investment & social pressures")
|
33 |
-
st.markdown("- Does an increase ESG lead to increase in Investment? ")
|
34 |
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
|
38 |
if head == 'Head':
|
@@ -40,131 +72,411 @@ if app_mode == "Introduction":
|
|
40 |
else:
|
41 |
st.dataframe(df.tail(num))
|
42 |
|
43 |
-
st.text('
|
44 |
-
st.write(df.shape)
|
45 |
-
|
46 |
-
st.markdown("##### Key Variables")
|
47 |
|
|
|
|
|
48 |
st.dataframe(df.describe())
|
49 |
|
50 |
-
st.markdown("### Missing Values")
|
51 |
-
st.markdown("Null or NaN
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
else:
|
60 |
st.warning("Poor data quality due to greater than 30 percent of missing value.")
|
61 |
st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
-
|
71 |
-
st.write(nonmissing)
|
72 |
-
if completeness >= 0.80:
|
73 |
-
st.success("We have completeness ratio greater than 0.85, which is good. It shows that the vast majority of the data is available for us to use and analyze. ")
|
74 |
-
else:
|
75 |
-
st.success("Poor data quality due to low completeness ratio( less than 0.85).")
|
76 |
|
77 |
-
|
|
|
78 |
st.title("Visualization")
|
79 |
|
|
|
|
|
80 |
|
|
|
|
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
-
|
84 |
-
|
|
|
85 |
|
86 |
-
#SNS plot
|
87 |
-
tab1.subheader("SNS plot")
|
88 |
-
tech_df = tech_df.sample(n=10000)
|
89 |
-
fig = sns.pairplot(tech_df)
|
90 |
-
tab1.pyplot(fig)
|
91 |
-
|
92 |
-
#Bar Graph
|
93 |
-
# User input for x-variable
|
94 |
-
columns = ['Region_Code', 'Gender', 'Vehicle_Age']
|
95 |
-
x_variable = tab2.selectbox("Select x-variable:", columns)
|
96 |
-
tab2.subheader(f"{x_variable} vs Price (INR)")
|
97 |
-
#data_by_variable = df.groupby(x_variable)['Annual_Premium'].mean()
|
98 |
-
#tab2.bar_chart(data_by_variable)
|
99 |
-
|
100 |
-
#Line Graph
|
101 |
-
tab3.subheader("Age vs Price")
|
102 |
-
#age_by_price = df.groupby('Age')['Annual_Premium'].mean()
|
103 |
-
#tab3.line_chart(age_by_price)
|
104 |
-
|
105 |
-
'''
|
106 |
-
tab4.subheader("Pie plot")
|
107 |
-
tab4.subheader("Response distribution by Vehicle Damage")
|
108 |
-
response_counts = df.groupby(['Vehicle_Damage', 'Response']).size().unstack(fill_value=0)
|
109 |
-
fig, ax = plt.subplots()
|
110 |
-
colors = ['#ff9999','#66b3ff']
|
111 |
-
damage_counts = response_counts.loc[1]
|
112 |
-
percentages = (damage_counts.values / damage_counts.sum()) * 100
|
113 |
-
labels = ['Yes', 'No']
|
114 |
-
ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
|
115 |
-
ax.axis('equal')
|
116 |
-
tab4.pyplot(fig)
|
117 |
-
|
118 |
-
#Pie Plot2
|
119 |
-
tab4.subheader("Response Distribution by Not Previously Insured")
|
120 |
-
response_counts = df.groupby(['Previously_Insured', 'Response']).size().unstack(fill_value=0)
|
121 |
-
fig, ax = plt.subplots()
|
122 |
-
colors = ['#ff9999','#66b3ff']
|
123 |
-
prev_insurance_counts = response_counts.loc[0]
|
124 |
-
percentages = (prev_insurance_counts.values / prev_insurance_counts.sum()) * 100
|
125 |
-
labels = ['Yes', 'No']
|
126 |
-
ax.pie(percentages, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
|
127 |
-
ax.axis('equal')
|
128 |
-
tab4.pyplot(fig)
|
129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
-
|
|
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
'''
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
elif app_mode == "Prediction":
|
138 |
-
st.
|
139 |
-
|
140 |
-
'''
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
st.
|
159 |
-
|
160 |
-
#
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import numpy as np
|
4 |
import seaborn as sns
|
5 |
from PIL import Image
|
6 |
+
import io
|
7 |
+
import mlflow
|
8 |
import matplotlib.pyplot as plt
|
9 |
from sklearn.model_selection import train_test_split
|
10 |
from sklearn.linear_model import LinearRegression
|
11 |
+
from sklearn.linear_model import LogisticRegression
|
12 |
+
from matplotlib.backends.backend_agg import FigureCanvasAgg
|
13 |
from sklearn import metrics
|
14 |
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
15 |
+
from sklearn.tree import DecisionTreeClassifier, plot_tree
|
16 |
+
from sklearn.tree import DecisionTreeRegressor
|
17 |
+
from sklearn.preprocessing import LabelEncoder
|
18 |
+
import graphviz
|
19 |
+
import missingno as mno
|
20 |
+
from sklearn.tree import export_graphviz
|
21 |
+
|
22 |
|
23 |
|
24 |
st.sidebar.header("Dashboard")
|
|
|
29 |
# TECH-sector rows only. An explicit copy is taken because new columns are
# assigned to this frame later in the script; writing to a bare slice of `df`
# triggers pandas' SettingWithCopyWarning.
tech_df = df.loc[df['sector'] == 'TECH'].copy()
|
30 |
|
31 |
|
32 |
+
# - - - - - - - - - - - INTRODUCTION - - - - - - - - - - -
|
33 |
if app_mode == "Introduction":
|
34 |
|
35 |
st.title("Introduction")
|
36 |
st.markdown("### Welcome to our ESG rankings Dashboard!")
|
37 |
|
38 |
+
st.image("ESG_image.png", use_column_width=True)
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
+
st.markdown("## Environmental - Social - Governance")
|
42 |
+
st.markdown("##### Does ESG rankings truly effect company investment & returns?")
|
43 |
+
|
44 |
+
st.markdown("""
|
45 |
+
##### Objective:
|
46 |
+
- Our goal is to explore a companies profit margin ratio relative to ESG Rankings to make a positive feedback loop
|
47 |
+
""")
|
48 |
+
|
49 |
+
st.markdown("##### Approach:")
|
50 |
+
st.markdown("""
|
51 |
+
1. Data Exploration
|
52 |
+
- Shape, outliers, nulls
|
53 |
+
2. Comprehensive Variable Analysis
|
54 |
+
- Univariate Analysis
|
55 |
+
- Bi-variate analysis
|
56 |
+
- Multi-variate analysis
|
57 |
+
3. Modelling
|
58 |
+
- Build model that solves business problem
|
59 |
+
""")
|
60 |
+
|
61 |
+
# - - - - - - - - - - - - - - - - - -
|
62 |
+
|
63 |
+
st.markdown("<hr>", unsafe_allow_html=True)
|
64 |
+
|
65 |
+
st.markdown("### About the Data Set")
|
66 |
+
|
67 |
+
num = st.number_input('How many rows would you like to see?', 5, 10)
|
68 |
|
69 |
head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
|
70 |
if head == 'Head':
|
|
|
72 |
else:
|
73 |
st.dataframe(df.tail(num))
|
74 |
|
75 |
+
st.text(f'This data frame has {df.shape[0]} Rows and {df.shape[1]} columns')
|
|
|
|
|
|
|
76 |
|
77 |
+
|
78 |
+
st.markdown("\n\n##### About the Variables")
|
79 |
st.dataframe(df.describe())
|
80 |
|
81 |
+
st.markdown("\n\n### Missing Values")
|
82 |
+
st.markdown("Are there any Null or NaN?")
|
83 |
|
84 |
+
# Share of missing cells per column of the TECH subset, in percent.
missing_mask = tech_df.isnull()
dfnull = missing_mask.sum() / len(tech_df) * 100

# Overall share of missing cells across the whole frame. Summing the
# per-column percentages would over-count: that sum grows with the number of
# columns and can exceed 100, which makes the 30 % threshold below meaningless.
cell_count = missing_mask.size
if cell_count:
    total_miss = round(float(missing_mask.to_numpy().sum()) / cell_count * 100, 2)
else:
    # Empty frame: nothing can be missing.
    total_miss = 0.0

# Display percentage of total missing values
st.write("Percentage of total missing values:", total_miss, "%")

# Create two columns layout
col1, col2 = st.columns(2)

# Display DataFrame with missing value percentages in the first column
with col1:
    st.write("Percentage of Missing Values:")
    st.write(dfnull)

# Display Missing Values Matrix in the second column
with col2:
    st.write("Missing Values Matrix:")
    fig, ax = plt.subplots(figsize=(20, 6))
    mno.matrix(tech_df, ax=ax)
    st.pyplot(fig)
    # Release the figure; the script is re-run on every interaction and
    # unclosed figures would otherwise pile up in pyplot's registry.
    plt.close(fig)

if total_miss <= 30:
    st.success("This Data set is reliable to use with small amounts of missing values, thus yielding accurate data.")
else:
    st.warning("Poor data quality due to greater than 30 percent of missing value.")
    st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")
|
111 |
|
112 |
+
# - - - - - - - - - - - VISUALIZATION - - - - - - - - - - -
|
113 |
+
elif app_mode == "Visualization":
|
114 |
+
# Feature engineering for the Visualization page: interaction terms and a
# weighted composite score are added as new columns of `tech_df`.

# Work on an independent copy so the column assignments below do not write
# into a slice of the full dataset.
tech_df = tech_df.copy()

# Define weights for each metric
weights = {
    'ESG_ranking': 0.3,
    'PS_ratio': 0.2,
    'PB_ratio': 0.3,
    'roa_ratio': 0.2,
}

# Create interaction terms
tech_df['ESG_PS_interaction'] = tech_df['ESG_ranking'] * tech_df['PS_ratio']
tech_df['ESG_PB_interaction'] = tech_df['ESG_ranking'] * tech_df['PB_ratio']
tech_df['PS_PB_interaction'] = tech_df['PS_ratio'] * tech_df['PB_ratio']

# Calculate the composite score: weighted sum of the four metrics.
# A missing value in any metric propagates to the score as NaN.
tech_df['Composite_Score'] = sum(tech_df[col] * weights[col] for col in weights)

cols = ['ESG_ranking', 'Volatility_Buy', 'Sharpe Ratio', 'inflation','PS_ratio','NetProfitMargin_ratio', 'PB_ratio', 'roa_ratio', 'roe_ratio','EPS_ratio','Composite_Score', 'ESG_PS_interaction', 'ESG_PB_interaction', 'PS_PB_interaction' ]
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
+
# - - - - - - - - - - - - PAIRPLOT
|
151 |
+
|
152 |
st.title("Visualization")

# DATA VISUALISATION
tab1, tab2, tab3 = st.tabs(["Pair Plots", "Correlation", "Feature Engineering"])

# DataFrame definition: the charts below are drawn from a random subset of
# at most 10000 rows. The cap is clamped to the number of available rows
# because DataFrame.sample raises ValueError when n exceeds the population
# (sampling without replacement).
tech_df = tech_df.sample(n=min(10000, len(tech_df)))
|
159 |
|
160 |
+
# - - - - - - - - - - - - - - - TAB1
# Pre-rendered pair plots, revealed one at a time.
image_paths = ['bigger_pairplot.png', 'Annoted_bigger_sns.png', 'smaller_pairplot.png']
messages = ["#### All variable pairplot", "#### Notable Relationships", "#### Focus Point Variables"]

# How many plots are currently revealed. This lives in session state because
# st.button is only True on the single rerun triggered by its own click;
# nesting a second button inside `if button:` therefore never works (the
# outer button is False again by the time the inner one is clicked).
if "pairplots_shown" not in st.session_state:
    st.session_state["pairplots_shown"] = 1


def _show_next_pairplot(limit):
    """Reveal one more pair plot, never exceeding `limit` plots."""
    st.session_state["pairplots_shown"] = min(st.session_state["pairplots_shown"] + 1, limit)


tab1.title("PAIR PLOTS")

# Display every plot revealed so far, each under its heading.
shown = st.session_state["pairplots_shown"]
for message, path in zip(messages[:shown], image_paths[:shown]):
    tab1.write(message)
    tab1.image(path, use_column_width=True)

# Offer the button only while there is another plot to reveal. The callback
# runs before the rerun, so the new plot appears on the same click.
if shown < len(image_paths):
    tab1.button("Next Pair Plot", on_click=_show_next_pairplot, args=(len(image_paths),))

var = tab1.button('Variables')
if var:
    tab1.markdown("##### 'ESG_ranking', 'Volatility_Buy', 'Sharpe Ratio', 'inflation','PS_ratio','NetProfitMargin_ratio', 'PB_ratio', 'roa_ratio', 'roe_ratio','EPS_ratio'")
|
181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
+
|
184 |
+
# - - - - - - - - - - - - - - TAB 2

tab2.title('Variable Correlation')
tab2.markdown("##### 'ESG_ranking', 'Volatility_Buy', 'Sharpe Ratio', 'inflation','PS_ratio','NetProfitMargin_ratio', 'PB_ratio', 'roa_ratio', 'roe_ratio','EPS_ratio'")

# HEAT MAP
tab2.markdown('### Heatmap Correlation')

# Pairwise correlation of the columns considered essential.
cols = ['ESG_ranking', 'Volatility_Buy', 'Sharpe Ratio', 'inflation','PS_ratio','NetProfitMargin_ratio', 'PB_ratio', 'roa_ratio', 'roe_ratio','EPS_ratio']  # possible essential columns
corrMatrix = tech_df[cols].corr()

fig2, ax = plt.subplots()
sns.heatmap(corrMatrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)

# Display the plot within the Streamlit app, then release the figure so it
# does not accumulate in pyplot's registry across script reruns.
tab2.pyplot(fig2)
plt.close(fig2)
|
201 |
+
|
202 |
+
|
203 |
+
# -- DESCRIBE TABLES --
tab2.markdown('Differences of ESG Rankings')

# Split rows by whether ESG_ranking is above the mean. A boolean mask is used
# instead of groupby(...).get_group(...): get_group raises KeyError when one
# side of the split has no rows (e.g. every ranking equals the mean).
above_mean = tech_df['ESG_ranking'] > tech_df['ESG_ranking'].mean()

# Rows with ESG_ranking greater than the mean
high_rank_group = tech_df[above_mean]

# Display summary statistics for the group
tab2.subheader("Summary statistics for high ESG ranking group:")
tab2.write(high_rank_group.describe())

# Rows with ESG_ranking less than or equal to the mean
low_rank_group = tech_df[~above_mean]

# Display summary statistics for the group
tab2.subheader("Summary statistics for low ESG ranking group:")
tab2.write(low_rank_group.describe())
|
222 |
+
|
223 |
+
# -- HISTOGRAMS --
tab2.subheader('Histograms')

# One histogram (with KDE overlay) per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
histogram_specs = [
    ('ESG_ranking', 'Histogram of ESG Ranking'),
    ('PS_ratio', 'Histogram of PS Ratio'),
    ('PB_ratio', 'Histogram of PB Ratio'),
    ('roa_ratio', 'Histogram of ROA Ratio'),
]
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (column, title) in zip(axes.flat, histogram_specs):
    sns.histplot(tech_df[column], kde=True, ax=ax)
    ax.set_title(title)

# Adjust layout
fig.tight_layout()

# Display the plot in Streamlit, then release the figure so it does not
# accumulate in pyplot's registry across script reruns.
tab2.pyplot(fig)
plt.close(fig)
|
247 |
+
|
248 |
+
# -- BAR PLOTS --
# One bar plot per metric, each with ESG_ranking on the x-axis.
fig, axes = plt.subplots(1, 4, figsize=(16, 8))
bar_plot_specs = [
    ('Volatility_sell', 'Average stock sell by Group'),
    ('expected_return (yearly)', 'Average returns by Group'),
    ('NetProfitMargin_ratio', 'Average profits by Group'),
    ('Volatility_Buy', 'Average stock buy by Group'),
]
for ax, (column, title) in zip(axes, bar_plot_specs):
    sns.barplot(x='ESG_ranking', y=column, data=tech_df, ax=ax)
    ax.set_title(title)

# Adjust layout
fig.tight_layout()

# Display the plot in Streamlit, then release the figure so it does not
# accumulate in pyplot's registry across script reruns.
tab2.pyplot(fig)
plt.close(fig)
|
269 |
+
|
270 |
+
# Bar Charts
tab2.subheader('Bar Charts')

# One bar plot per valuation/profitability metric against ESG_ranking.
fig, axes = plt.subplots(1, 4, figsize=(12, 6))
bar_chart_specs = [
    ('PS_ratio', 'Average PS Ratio by Group'),
    ('PB_ratio', 'Average PB Ratio by Group'),
    ('roa_ratio', 'Average ROA Ratio by Group'),
    ('Volatility_sell', 'Average stock sell by Group'),
]
for ax, (column, title) in zip(axes, bar_chart_specs):
    sns.barplot(x='ESG_ranking', y=column, data=tech_df, ax=ax)
    ax.set_title(title)

# Adjust layout
fig.tight_layout()

# Display the plot in Streamlit, then release the figure so it does not
# accumulate in pyplot's registry across script reruns.
tab2.pyplot(fig)
plt.close(fig)
|
294 |
+
|
295 |
+
# Box Plots
tab2.subheader('Box Plots')

# One box plot per metric, side by side.
fig, axes = plt.subplots(1, 4, figsize=(12, 6))
box_plot_specs = [
    ('ESG_ranking', 'Box Plot of ESG Ranking'),
    ('PS_ratio', 'Box Plot of PS Ratio'),
    ('PB_ratio', 'Box Plot of PB Ratio'),
    ('roa_ratio', 'Box Plot of ROA Ratio'),
]
for ax, (column, title) in zip(axes, box_plot_specs):
    sns.boxplot(y=column, data=tech_df, ax=ax)
    ax.set_title(title)

# Adjust layout
fig.tight_layout()

# Display the plot in Streamlit, then release the figure so it does not
# accumulate in pyplot's registry across script reruns.
tab2.pyplot(fig)
plt.close(fig)
|
319 |
+
|
320 |
+
|
321 |
+
# - - - - - - - - - - - - - - TAB 3
|
322 |
+
tab3.title('Feature(Data) Engineering')
|
323 |
+
tab3.markdown(
|
324 |
+
"""
|
325 |
+
ESG Ranking: This metric reflects a company's Environmental, Social, and Governance (ESG) performance. It evaluates factors such as carbon emissions, diversity policies, and board diversity. A higher ESG ranking suggests better sustainability practices.
|
326 |
+
PS Ratio (Price-to-Sales Ratio): This ratio compares a company's market capitalization to its total sales revenue. It indicates how much investors are willing to pay for each dollar of sales generated by the company. A lower PS ratio may suggest a potentially undervalued stock.
|
327 |
+
PB Ratio (Price-to-Book Ratio): The PB ratio compares a company's market value to its book value, indicating how much investors are willing to pay for each dollar of assets. It helps assess whether a stock is overvalued or undervalued relative to its assets.
|
328 |
+
ROA Ratio (Return on Assets Ratio): This ratio measures a company's profitability relative to its total assets. It indicates how efficiently a company is generating profits from its assets. A higher ROA ratio suggests better asset utilization and profitability.
|
329 |
+
|
330 |
+
Interaction Terms:
|
331 |
+
|
332 |
+
ESG-PS Interaction: The interaction between ESG ranking and PS ratio captures how a company's sustainability practices may influence its price-to-sales ratio. For example, companies with higher ESG rankings might have lower PS ratios if investors value sustainability.
|
333 |
+
ESG-PB Interaction: Similarly, this interaction captures how a company's ESG performance may impact its price-to-book ratio. It helps assess whether sustainability practices influence investors' perceptions of a company's value relative to its assets.
|
334 |
+
PS-PB Interaction: This interaction explores the relationship between price-to-sales and price-to-book ratios. It provides insights into how investors weigh sales revenue and asset value when evaluating a company's stock.
|
335 |
+
Composite Score:
|
336 |
+
|
337 |
+
The composite score combines the weighted contributions of ESG ranking, PS ratio, PB ratio, and possibly other metrics. It offers a holistic assessment of a company's overall performance and sustainability. A higher composite score indicates better overall performance based on the chosen metrics and weights. It helps investors, analysts, and stakeholders gauge a company's standing and potential investment value.
|
338 |
+
"""
|
339 |
+
)
|
340 |
+
|
341 |
+
# -- new table --
|
342 |
+
tab3.write(tech_df)
|
343 |
+
|
344 |
+
# - - - - - - - - - - - PREDICTION - - - - - - - - - - -
|
345 |
elif app_mode == "Prediction":
|
346 |
+
st.title("Predictions")

cols = ['ESG_ranking', 'Volatility_Buy', 'Sharpe Ratio', 'inflation','PS_ratio','NetProfitMargin_ratio', 'PB_ratio', 'roa_ratio', 'roe_ratio','EPS_ratio']  # possible essential columns
# Copy the selected columns so the per-column assignments below write to an
# independent frame instead of a slice of `df`.
temp_df = df[cols].copy()

# NOTE(review): LabelEncoder replaces every value with an integer code, so the
# regression below is fitted on codes rather than on the original ratio
# values, and the "Actual"/"Predicted" axes are in code units. Behaviour is
# kept as-is; confirm this encoding is intended for these numeric columns.
label_encoder = LabelEncoder()
for name in cols:
    temp_df[name] = label_encoder.fit_transform(temp_df[name])

# Select the target variable for prediction
y = temp_df['NetProfitMargin_ratio']

# Select predictors (all other variables except the target variable)
X = temp_df.drop(columns=['NetProfitMargin_ratio'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Display the subheader
st.subheader('Actual vs. Predicted for Net Profit Margin ratio (Linear Regression)')

# Create a new Matplotlib figure and axis
fig, ax = plt.subplots()

# Scatter plot
scatter_plot = sns.scatterplot(x='Actual', y='Predicted', data=results_df, ax=ax)
scatter_plot.set_title('Actual vs. Predicted for NetProfitMargin_ratio')
scatter_plot.set_xlabel('Actual')
scatter_plot.set_ylabel('Predicted')

# Regression line plot
sns.regplot(x='Actual', y='Predicted', data=results_df, scatter=False, color='red', ax=ax)

# Display the plot within the Streamlit app, then release the figure so it
# does not accumulate in pyplot's registry across script reruns.
st.pyplot(fig)
plt.close(fig)
|
389 |
+
|
390 |
+
# - - - - - - - - - - - - - - DECISION TREE REGRESSOR
st.subheader('Decision Tree Regressor')

# Columns used by the tree; NetProfitMargin_ratio is the target, the rest
# are predictors.
cols = [
    'ESG_ranking', 'Volatility_Buy', 'Sharpe Ratio', 'inflation', 'PS_ratio',
    'NetProfitMargin_ratio', 'PB_ratio', 'roa_ratio', 'roe_ratio', 'EPS_ratio',
]
target_name = "NetProfitMargin_ratio"

# Restrict the TECH subset to those columns and separate target from features.
temp_df = tech_df[cols]
y = temp_df[target_name]
X = temp_df.drop([target_name], axis=1)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Depth-3 regression tree, fitted on the training part.
clf = DecisionTreeRegressor(max_depth=3)
clf.fit(X_train, y_train)

# Score the held-out part.
y_pred = clf.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
r2_score = metrics.r2_score(y_test, y_pred)

# Report both metrics.
st.write(f"MSE: {mse}")
st.write(f"R2 Score: {r2_score}")

# Render the fitted tree from its Graphviz DOT source.
tree_dot = export_graphviz(clf, out_file=None, feature_names=X.columns, filled=True, rounded=True)
st.graphviz_chart(tree_dot)
|
426 |
+
|
427 |
+
# - - - - - - - - - - - - - - - - - PYCARET
st.subheader('Pycaret Setup')

# Hard-coded snapshot of a PyCaret setup summary (not computed at runtime).
setup_descriptions = [
    'Session id', 'Target', 'Target type', 'Original data shape', 'Transformed data shape',
    'Transformed train set shape', 'Transformed test set shape', 'Numeric features',
    'Preprocess', 'Imputation type', 'Numeric imputation', 'Categorical imputation',
    'Transform target', 'Transform target method', 'Fold Generator', 'Fold Number',
    'CPU Jobs', 'Use GPU', 'Log Experiment', 'Experiment Name', 'USI',
]
setup_values = [
    2557, 'NetProfitMargin_ratio', 'Regression', '(92401, 10)', '(92401, 10)', '(64680, 10)',
    '(27721, 10)', 9, True, 'simple', 'mean', 'mode', True, 'yeo-johnson', 'KFold', 10, -1,
    False, False, 'test1', '08d7',
]

# The values mix int, bool and str. They are rendered as text so the column
# has a single type; a mixed-type object column cannot be serialised to Arrow
# (which st.table uses) without a fallback conversion.
setup_table = pd.DataFrame({
    'Description': setup_descriptions,
    'Value': [str(value) for value in setup_values],
})

# Display DataFrame as a table
st.table(setup_table)
|
445 |
+
|
446 |
+
|
447 |
+
st.subheader('Best Models - Pycaret/MLFlow')

# Hard-coded snapshot of a model-comparison leaderboard (not computed at
# runtime). Every list has one entry per model, in the same order as 'Model'.
data = {
    'Model': [
        'knn', 'rf', 'et', 'lightgbm', 'xgboost', 'dt', 'gbr', 'ada', 'br', 'ridge',
        'lr', 'huber', 'en', 'lasso', 'llar', 'par', 'omp', 'dummy', 'lar',
    ],
    'Algorithm': [
        'K Neighbors Regressor', 'Random Forest Regressor', 'Extra Trees Regressor',
        'Light Gradient Boosting Machine', 'Extreme Gradient Boosting', 'Decision Tree Regressor',
        'Gradient Boosting Regressor', 'AdaBoost Regressor', 'Bayesian Ridge', 'Ridge Regression',
        'Linear Regression', 'Huber Regressor', 'Elastic Net', 'Lasso Regression',
        'Lasso Least Angle Regression', 'Passive Aggressive Regressor', 'Orthogonal Matching Pursuit',
        'Dummy Regressor', 'Least Angle Regression',
    ],
    'MAE': [
        0.0000, 0.0000, 0.0000, 0.0055, 0.0003, 0.0000, 0.2143, 1.2493, 2.2450, 2.2451,
        2.2450, 2.1995, 2.3610, 2.3733, 2.3733, 3.0690, 6.3290, 8.3423, 8.7474,
    ],
    'MSE': [
        0.0000, 0.0000, 0.0000, 0.0002, 0.0000, 0.0000, 0.0777, 2.3647, 7.3785, 7.3784,
        7.3785, 8.0557, 9.1970, 9.4301, 9.4301, 16.9831, 68.2626, 108.6826, 147.4126,
    ],
    'RMSE': [
        0.0000, 0.0000, 0.0000, 0.0125, 0.0007, 0.0000, 0.2785, 1.5376, 2.7163, 2.7163,
        2.7163, 2.8372, 3.0326, 3.0708, 3.0708, 4.0527, 8.2619, 10.4250, 10.9345,
    ],
    'R2': [
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.9993, 0.9782, 0.9319, 0.9319,
        0.9319, 0.9257, 0.9152, 0.9130, 0.9130, 0.8435, 0.3705, -0.0023, -0.3576,
    ],
    'RMSLE': [
        0.0000, 0.0000, 0.0000, 0.0006, 0.0000, 0.0000, 0.0254, 0.1432, 0.2347, 0.2347,
        0.2347, 0.2184, 0.2081, 0.2166, 0.2165, 0.2905, 0.8095, 1.0236, 0.8220,
    ],
    'MAPE': [
        0.0000, 0.0000, 0.0000, 0.0006, 0.0000, 0.0000, 0.0309, 0.3354, 0.4365, 0.4367,
        0.4364, 0.4038, 0.4272, 0.4359, 0.4358, 0.6183, 3.0713, 6.3344, 2.9445,
    ],
    'TT (Sec)': [
        0.3600, 10.7310, 4.6500, 2.2730, 0.5930, 0.2650, 6.7620, 3.1140, 0.1550, 0.1480,
        0.8520, 1.1060, 0.1560, 0.1560, 0.2480, 0.2530, 0.1470, 0.1440, 0.2080,
    ],
}

# Assemble the leaderboard and render it as a static table.
df = pd.DataFrame(data)
st.table(df)
|
479 |
+
|
480 |
+
# - - - - - - - - - - - - -
|
481 |
+
st.subheader('Feature Importance')
|
482 |
+
st.image('newplot.png')
|