Kwang517 commited on
Commit
9de83d7
1 Parent(s): 2df7ddf

Upload app (3).py

Browse files
Files changed (1) hide show
  1. app (3).py +533 -0
app (3).py ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import mlflow
7
+ import random
8
+ import time
9
+ from PIL import Image
10
+ from sklearn.preprocessing import StandardScaler
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.neighbors import KNeighborsClassifier
13
+ from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
14
+ from sklearn.ensemble import RandomForestClassifier
15
+ from sklearn.ensemble import RandomForestRegressor
16
+ from sklearn.preprocessing import OneHotEncoder
17
+ from sklearn.compose import ColumnTransformer
18
+ from sklearn.pipeline import Pipeline
19
+ from shapash.explainer.smart_explainer import SmartExplainer
20
+ from sklearn.preprocessing import LabelEncoder
21
+ from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
22
+ from sklearn.svm import SVC
23
+ from sklearn.linear_model import LinearRegression, LogisticRegression
24
+ from sklearn.metrics import f1_score, r2_score
25
+ import pickle
26
+ import base64
27
+ import requests
28
+ import tensorflow as tf
29
+ from codecarbon import EmissionsTracker
30
+
31
# Sidebar navigation: the selected label decides which page of the app is rendered.
app_mode = st.sidebar.selectbox(
    'Select page:',
    ['01 Introduction', '02 Visualization', '03 Prediction', '04 Explainable AI', '05 MLFlow Tracking'],
)

# Completeness at or above this ratio is reported as good data quality
# (the value quoted in both completeness messages below).
COMPLETENESS_THRESHOLD = 0.85
# Overall share of missing cells (in percent) above which data quality is reported as poor
# (the value quoted in the missing-value warning below).
MISSING_WARN_PERCENT = 30.0

if app_mode == '01 Introduction':
    # ---- Title banner -------------------------------------------------------
    image_movie = Image.open('Title.jpeg')
    st.image(image_movie, width=400)

    st.title("What are the Key Influencers of :orange[_Obesity_] ? ")
    st.subheader("A predictive analysis for health recommendation purposes", divider='rainbow')

    # ---- Project motivation and scope ---------------------------------------
    st.markdown('##### WHY THIS TOPIC❓')
    st.markdown('Obesity, which causes physical and mental problems, is a global health problem with serious consequences. ')
    st.markdown('The prevalence of obesity is increasing steadily, and therefore, this project is needed to examine the influencing factors of obesity and to predict the occurrence of the condition according to these factors.')

    st.markdown("##### OUR GOAL 🎯 ")
    st.markdown("With our project, we seek to identify the highest influential factor on individual’s obesity levels for health recommendation purposes. ")
    st.markdown("The main objectives of evaluation is focusing on the personal habits, family history, eating habits and physical activity frequency. ")
    st.markdown("Since our research are conducting with health recommendation purposes, we will avoid analyzing on the individuals' traits such as age, height, gender but rather focuses on personal behaviors that could be changed. ")

    st.markdown("#### BIASES 🧐")
    st.markdown("BIASE #1: Since we are not focusing on the individuals' traits such as age, height and gender, it might have a impact on our final prediction results. Because: they are still key factors relevant to obesity levels.")
    st.markdown("BIASE #2: This study only collects data from three countries which limits our ability to apply the final prediction results on a global perspective.")

    st.markdown('##### OUR DATA 📊')
    st.markdown("This dataset include data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia")
    st.markdown('Our data contains 17 attributes and 2111 records')
    st.markdown("As described by the dataset provider, 77% of the data was generated synthetically using the Weka tool and the SMOTE filter, 23% of the data was collected directly from users through a web platform.")

    # ---- Glossary of the column abbreviations -------------------------------
    st.markdown("##### Explaination of KEY VARIABLES abbreviations 📓")
    glossary = (
        "- PERSONAL HABITS ",
        "CH2O: How much water do you drink daily?",
        "SMOKE: Do you smoke? ",
        "TECH: How much time do you use technological devices?",
        "CALC: How often do you drink alcohol?",
        "MTRANS: Which transportation do you usually use?",
        "SCC: Do you monitor the calories you eat daily",
        "- EATING HABITS",
        "FAVC : Do you eat high caloric food frequently?",
        "FCVC : Do you usually eat vegetables in your meals?",
        "NCP: 'How many main meals do you have daily?",
        "CAEC: DO you eat any food between meals?",
        '- PHYSICAL ACTIVITY',
        "FAF: How often do you have physical activity?",
        '- FAMILY HISTORY',
        "FHWO: Family history with overweight",
        "OL: Obesity Level",
    )
    for entry in glossary:
        st.markdown(entry)

    # ---- Summary statistics -------------------------------------------------
    st.markdown("### Description of Data")
    df = pd.read_csv("FINAL PROJECT.csv")
    st.dataframe(df.describe())
    st.markdown("🔍 Observation: Based on the description of Data shown above, we can get a better understanding about the individuals' information collected in the dataset.")
    st.markdown("The mean age for the participants is about 24 years old, the mean height of the participants is about 1.7 metres, and the mean weight of the participants is 86.59 kg.")

    # ---- Missing values -----------------------------------------------------
    st.markdown("### Missing Values")
    st.markdown("Null or NaN values.")

    total_cells = df.size  # rows * columns; 0 for an empty frame
    missing_by_column = df.isnull().sum() / len(df) * 100
    # Overall percentage = missing cells / all cells. Summing the per-column
    # percentages (as was done before) can exceed 100 and is not a percentage
    # of the dataset.
    if total_cells:
        total_missing = round(float(df.isnull().sum().sum()) / total_cells * 100, 2)
    else:
        total_missing = 0.0
    st.write("Percentage of total missing values:", total_missing)
    st.write(missing_by_column)

    if total_missing == 0.0:
        st.success("✅ We do not exprience any missing values which is the ideal outcome of our data. We can proceed with higher accuracy in our further prediction.")
    elif total_missing > MISSING_WARN_PERCENT:
        st.warning("Poor data quality due to greater than 30 percent of missing value.")
    else:
        # Some values are missing, but not enough to justify the "> 30 percent" warning.
        st.info("Some values are missing, but the share is within the 30 percent guideline.")
    st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")

    # ---- Completeness -------------------------------------------------------
    st.markdown("### Completeness")
    st.markdown(" The ratio of non-missing values to total records in dataset and how comprehensive the data is.")

    st.write("Total data length:", len(df))
    non_missing_by_column = df.notnull().sum()
    # Divide by the number of cells, not the number of rows, so the result is a
    # ratio in [0, 1] rather than (roughly) the number of columns.
    if total_cells:
        completeness = round(float(non_missing_by_column.sum()) / total_cells, 2)
    else:
        completeness = 0.0

    st.write("Completeness ratio:", completeness)
    st.write(non_missing_by_column)
    if completeness >= COMPLETENESS_THRESHOLD:
        st.success("✅ We have completeness ratio greater than 0.85, which is good. It shows that the vast majority of the data is available for us to use and analyze. ")
    else:
        # A poor result must not be rendered as a success banner.
        st.warning("Poor data quality due to low completeness ratio (less than 0.85).")
113
+
114
def _plot_crosstab(df, column, heading, note=None):
    """Render one grouped bar chart: counts of each obesity level per category of ``column``.

    Args:
        df: DataFrame holding ``column`` and the obesity-level column ``'OL'``.
        column: Name of the column whose categories form the x axis.
        heading: Sub-header text shown above the chart.
        note: Optional commentary written below the chart.
    """
    st.subheader(heading)
    categories = df[column]
    if pd.api.types.is_numeric_dtype(categories):
        # NOTE(review): numeric answers are rounded to whole numbers so the chart
        # shows a handful of groups instead of one bar group per distinct float.
        # Confirm this matches how the numeric survey columns should be read.
        categories = categories.round()
    counts = pd.crosstab(categories, df['OL'])
    fig, ax = plt.subplots()
    counts.plot(kind='bar', width=0.8, ax=ax)
    # Label every bar with its height (the count).
    for bar in ax.patches:
        ax.annotate(str(bar.get_height()), (bar.get_x() * 1.005, bar.get_height() * 1.005))
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it so figures
    # do not pile up across Streamlit reruns.
    plt.close(fig)
    if note is not None:
        st.write(note)


# A plain ``if`` is used here: page labels are mutually exclusive, so this is
# equivalent to chaining with ``elif`` after the introduction page.
if app_mode == '02 Visualization':
    df = pd.read_csv("FINAL PROJECT.csv")

    variable_group = st.sidebar.radio(
        "Pick the varible",
        ["PERSONAL HABITS", "EATING HABITS", "PHYSICAL ACTIVITY", "FAMILY HISTORY"],
    )

    if variable_group == "PERSONAL HABITS":
        st.header("Personal Habits")
        _plot_crosstab(
            df,
            'SMOKE',
            "SMOKE: Do you smoke?",
            note='For the "no" category of "SMOKE", the bars are quite tall, suggesting higher counts for each obesity level, with "Obesity Type I" showing the highest count at 345. The "yes" category has significantly lower counts for each obesity level, which might indicate that "SMOKE" is not a key influential variable for obesity level.',
        )
        _plot_crosstab(df, 'CALC', "CALC: How often do you drink alcohol?")
        _plot_crosstab(df, 'SCC', "SCC: Do you monitor the calories you eat daily")
        _plot_crosstab(df, 'MTRANS', "MTRANS: Which transportation do you usually use?")

    elif variable_group == 'EATING HABITS':
        st.header("Eating Habits")
        _plot_crosstab(df, 'CAEC', "CAEC: DO you eat any food between meals?")
        _plot_crosstab(df, 'FAVC', "FAVC : Do you eat high caloric food frequently??")
        # Fixed: this chart previously re-plotted FAVC under the FCVC heading.
        _plot_crosstab(df, 'FCVC', "FCVC : Do you usually eat vegetables in your meals?")

    elif variable_group == 'PHYSICAL ACTIVITY':
        st.header('Physical Activity')
        st.subheader("FAF: How often do you have physical activity?")

        # (panel title prefix, obesity levels drawn in that panel)
        panels = [
            ("obesity_type", ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']),
            ("over_weight_type", ['Overweight_Level_I', 'Overweight_Level_II']),
            ("normal", ['Normal_Weight']),
            ("Insufficient_Weight", ['Insufficient_Weight']),
        ]

        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
        for position, (panel_name, levels) in enumerate(panels):
            # Rows are concatenated level by level so the hue order follows ``levels``.
            subset = pd.concat([df[df["OL"] == level] for level in levels])
            subset.reset_index(drop=True, inplace=True)
            # Panels fill the grid column by column: the first two go in the
            # left column, the last two in the right column.
            ax = axes[position % 2, position // 2]
            sns.kdeplot(ax=ax, data=subset, x="FAF", hue="OL", fill=True)
            ax.set_title(f'{panel_name} vs FAF')

        fig.suptitle('Obesity_levels vs FAF')
        fig.tight_layout()
        st.pyplot(fig)
        plt.close(fig)

    elif variable_group == 'FAMILY HISTORY':
        st.header('Family History')
        _plot_crosstab(df, 'FHWO', "FHWO: Family history with overweight")
228
+
229
+
230
+
231
def _show_prediction_report(y_true, y_pred):
    """Show the first ten actual/predicted pairs followed by the classification report table."""
    results = pd.DataFrame({'Actual': y_true, 'Predicted': y_pred})
    results.reset_index(drop=True, inplace=True)
    st.dataframe(results.head(10))
    report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True)).transpose()
    st.text('Classification Report:')
    st.table(report)


if app_mode == '03 Prediction':
    # Labels of the two result views; kept in one place so the selectbox options
    # and the branch conditions cannot drift apart.
    MATRIX_VIEW = 'Confusion Matrix 📈'
    REPORT_VIEW = 'Predicted Results with Classification Report📑'

    image_2 = Image.open('image2.png')
    st.image(image_2, width=300)

    # Every column except the obesity level 'OL' is a feature; 'OL' is the target.
    df = pd.read_csv("FINAL PROJECT.csv")
    raw_features = df.drop('OL', axis=1)
    target = df['OL']

    model_choice = st.sidebar.selectbox('Select to see:', ['KNN', 'Random Forest', 'Comparison Analysis'])

    if model_choice == 'KNN':
        # One-hot encode the categorical columns, split, then standardise using
        # statistics of the training split only.
        encoded = pd.get_dummies(raw_features)
        X_train, X_test, y_train, y_test = train_test_split(encoded, target, test_size=0.2, random_state=42)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train_scaled, y_train)
        y_pred = knn.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        st.title("Prediction - k-nearest neighbors Model:")
        st.write(f"Model Accuracy: {accuracy*100:.2f}%")
        option = st.selectbox('What would you like to see❓', (MATRIX_VIEW, REPORT_VIEW))

        if option == MATRIX_VIEW:
            conf_matrix = confusion_matrix(y_test, y_pred)
            # Draw on an explicit figure instead of handing the pyplot module to Streamlit.
            fig, ax = plt.subplots(figsize=(10, 8))
            sns.heatmap(conf_matrix, annot=True, fmt="d", ax=ax)
            ax.set_xlabel('Predicted Labels')
            ax.set_ylabel('True Labels')
            ax.set_title('Confusion Matrix: KNN Model Prediction')
            st.pyplot(fig)
            plt.close(fig)
            st.markdown("#### The Labels from 0-6 indicates that:")
            st.markdown("0 = Insufficient Weight 1 = Normal Weight")
            st.markdown("2 = Obesity Type I 3 = Obesity Type II 4 = Obesity Type III")
            st.markdown("5 = Overweight Level I 6 = Overwight Level II")
            st.markdown("#### Explaination of the graph:")
            st.markdown(" The numbers in the matrix represent the counts of instances for each true-predicted label pair.")
            st.markdown(" The DIAGONAL from the top-left to the bottom-right shows the number of CORRECT predictions for each class.")
            st.markdown(" The NON-DIAGONAL numbers indicate the instances where the model made ERRORS, showing how many it made in this Model")
        elif option == REPORT_VIEW:
            _show_prediction_report(y_test, y_pred)

    # Fixed: this branch used to test for 'RandomForest', which is not one of the
    # selectbox options ('Random Forest'), so the page could never be shown.
    elif model_choice == 'Random Forest':
        st.title("Prediction - Random Forest Classifier Model :")

        # Text columns are one-hot encoded inside the pipeline; the rest pass through.
        categorical_columns = raw_features.select_dtypes(include=['object']).columns.tolist()
        preprocessor = ColumnTransformer(
            transformers=[
                ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
            ],
            remainder='passthrough',
        )
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
        ])

        X_train, X_test, y_train, y_test = train_test_split(raw_features, target, test_size=0.2, random_state=42)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        st.write(f"Model Accuracy: {accuracy*100:.2f}%")
        option = st.selectbox('What would you like to see❓', (MATRIX_VIEW, REPORT_VIEW))

        if option == MATRIX_VIEW:
            class_names = pipeline.named_steps['classifier'].classes_
            conf_matrix = confusion_matrix(y_test, y_pred)
            fig, ax = plt.subplots(figsize=(10, 7))
            sns.heatmap(
                conf_matrix,
                annot=True,
                fmt='g',
                cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax,
            )
            ax.set_xlabel('Predicted')
            ax.set_ylabel('True')
            ax.set_title('Confusion Matrix')
            st.pyplot(fig)
            plt.close(fig)
        elif option == REPORT_VIEW:
            _show_prediction_report(y_test, y_pred)

    elif model_choice == 'Comparison Analysis':
        # Static write-up comparing the two models; the figures quoted are hard-coded text.
        st.title('Comparison ⚖️')
        st.markdown('#### Confusion Matrices:')
        st.markdown("KNN Model: High Accuracy for 'Obesity Type III'(Label 4) and 'Overweight Level II'(Label 6)")
        st.markdown("Struggles with 'Normal weight' and 'Overweight Level I'")
        st.markdown("RandomForestClassifier: Shows fewer misclassifications overall. Significantly in distinguishing 'Normal Weight and 'Overweight Level I'")
        st.markdown('#### Classification Reports:')
        st.markdown("KNN: The recall for 'Normal Weight' is particularly low (0.4677), indicating many instances of this class were misclassified.")
        st.markdown("But the precision is high for Obesity prediction, especially 'Obesity Type III' with a perfect recall.")
        st.markdown("RandomDorest Classifier: the recall for 'Normal Weight' is much improved to 0.9355")
        st.markdown("#### OVERALL")
        st.markdown("Accuracy: RandomForest Classifier increased from 0.8109 to 0.9433.")
        st.markdown("F1-Score: The harmonic mean of precision and recall is higher in the RandomForest model, suggesting a better balance.")
357
+
358
if app_mode == '04 Explainable AI':
    st.title('Explainable AI: Shapash')
    df = pd.read_csv('FINAL PROJECT.csv')

    # Replace every text column (including the target 'OL') by integer codes.
    label_encoder = LabelEncoder()
    for name in ('Gender', 'CALC', 'FAVC', 'SCC', 'SMOKE', 'FHWO', 'CAEC', 'MTRANS', 'OL'):
        df[name] = label_encoder.fit_transform(df[name])

    # Only the behavioural columns are used as features on this page.
    behaviour_columns = [
        'CALC', 'FAVC', 'FCVC', 'NCP', 'SCC', 'SMOKE',
        'CH2O', 'FHWO', 'FAF', 'TECH', 'CAEC', 'MTRANS',
    ]
    X = df[behaviour_columns]
    y = df['OL']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # A small random forest regressor fitted on the encoded obesity level.
    model = RandomForestRegressor(max_depth=5, random_state=42, n_estimators=12)
    model.fit(X_train, y_train)

    option = st.selectbox(
        'What would you like to see❓',
        ('Feature Importance', 'Feature Contribution', 'Local Explanation'),
    )

    # The feature picker is only shown for the contribution view, and is rendered
    # before the explainer is compiled.
    selected_feature = None
    if option == 'Feature Contribution':
        selected_feature = st.selectbox(
            'Select a feature for the contribution plot:',
            X_test.columns.tolist(),
        )

    # All three views use the same compiled explainer over the test split, with
    # the predictions supplied as a one-column frame aligned on the test index.
    predictions = pd.DataFrame(model.predict(X_test), columns=['pred'], index=X_test.index)
    xpl = SmartExplainer(model=model)
    xpl.compile(x=X_test, y_pred=predictions)

    if option == 'Feature Importance':
        st.write(xpl.plot.features_importance())
    elif option == 'Feature Contribution':
        st.write(xpl.plot.contribution_plot(selected_feature))
    elif option == 'Local Explanation':
        # Explain one randomly drawn row of the test split.
        st.write(xpl.plot.local_plot(index=random.choice(X_test.index)))
402
+
403
if app_mode == '05 MLFlow Tracking':
    def load_data():
        """Read the project CSV and add a 'target' column that is a copy of 'OL'."""
        df = pd.read_csv('FINAL PROJECT.csv')
        df['target'] = df['OL']
        return df

    def preprocess_features(df, feature_choices):
        """Build the (unfitted) column transformer for the selected features.

        Args:
            df: DataFrame containing the selected columns.
            feature_choices: List of column names chosen as model inputs.

        Returns:
            A ColumnTransformer that passes non-object columns through unchanged
            and one-hot encodes object columns.
        """
        selected = df[feature_choices]
        categorical_features = selected.select_dtypes(include=['object']).columns.tolist()
        numeric_features = selected.select_dtypes(exclude=['object']).columns.tolist()

        return ColumnTransformer(
            transformers=[
                # Numeric data needs no transformation.
                ('num', 'passthrough', numeric_features),
                # Categories unseen during fit are encoded as all zeros.
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ])

    def download_file():
        """Render a link that downloads 'model.pkl' embedded as base64 in the page."""
        file_path = 'model.pkl'
        try:
            with open(file_path, 'rb') as file:
                contents = file.read()
        except FileNotFoundError:
            # The file only exists after a training run has written it.
            st.warning("No trained model found yet. Please run a training first.")
            return
        b64 = base64.b64encode(contents).decode()
        href = f'<a href="data:file/pkl;base64,{b64}" download="model.pkl">Download model.pkl file</a>'
        st.markdown(href, unsafe_allow_html=True)

    # Available models, grouped by problem type.
    MODELS = {
        "classification": {
            "KNN": KNeighborsClassifier,
            "SVM": SVC,
            "Random Forest": RandomForestClassifier
        }
    }

    st.title("Model Experimentation with MLflow 🚀")

    task_type = st.selectbox("Select the task type:", ["classification"])

    df = load_data()
    st.write(df)

    # Model, feature and target selection.
    model_options = list(MODELS[task_type].keys())
    model_choice = st.selectbox("Choose a model ⚙️", model_options)
    feature_options = df.columns.drop('target').tolist()
    feature_choice = st.multiselect("Choose some features", feature_options)
    target_choice = st.selectbox("Select target column", df.columns)

    track_with_mlflow = st.checkbox("Track with MLflow?")

    if st.button("Start training"):
        # A feature that is the target itself would leak the answer into the
        # model. 'target' is a copy of 'OL' (see load_data), so those two are
        # treated as the same column.
        leaky_columns = {target_choice}
        if target_choice in ('target', 'OL'):
            leaky_columns.update(('target', 'OL'))
        ignored = [name for name in feature_choice if name in leaky_columns]
        features = [name for name in feature_choice if name not in leaky_columns]
        if ignored:
            st.warning(f"Ignoring feature(s) that duplicate the target: {', '.join(ignored)}")

        if not features:
            # Fitting with zero input columns fails; ask for a selection instead.
            st.warning("Please choose at least one feature before training.")
        else:
            if track_with_mlflow:
                mlflow.set_experiment("Obesity_Prediction")
                mlflow.start_run()
            try:
                if track_with_mlflow:
                    mlflow.log_param('model', model_choice)
                    mlflow.log_param('features', features)
                    mlflow.log_param('task', task_type)

                # Preprocessing and the chosen estimator (default hyper-parameters) in one pipeline.
                model = MODELS[task_type][model_choice]()
                pipeline = Pipeline(steps=[
                    ('preprocessor', preprocess_features(df, features)),
                    ('model', model),
                ])

                X = df[features]
                y = df[target_choice]
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

                pipeline.fit(X_train, y_train)

                preds_train = pipeline.predict(X_train)
                preds_test = pipeline.predict(X_test)
                if task_type == "classification":
                    metric_train = f1_score(y_train, preds_train, average='micro')
                    metric_test = f1_score(y_test, preds_test, average='micro')
                else:
                    # Only reached if a non-classification task type is added to MODELS.
                    metric_train = r2_score(y_train, preds_train)
                    metric_test = r2_score(y_test, preds_test)
                st.write("metric_train", round(metric_train, 3))
                st.write("metric_test", round(metric_test, 3))

                if track_with_mlflow:
                    mlflow.sklearn.log_model(pipeline, "model")
                    mlflow.log_metric("metric_train", metric_train)
                    mlflow.log_metric("metric_test", metric_test)
            finally:
                # Always close the run, even when training raised; otherwise the
                # next start_run() fails because a run is still active.
                if track_with_mlflow:
                    mlflow.end_run()

            # Persist the fitted pipeline so it can be offered for download below.
            with open('model.pkl', 'wb') as file:
                pickle.dump(pipeline, file)

            # NOTE(review): the emissions figure below is measured while training a
            # small Keras network on MNIST, not the pipeline trained above. Confirm
            # that this demo measurement is what is intended.
            mnist = tf.keras.datasets.mnist
            (x_train, y_train), (x_test, y_test) = mnist.load_data()
            x_train, x_test = x_train / 255.0, x_test / 255.0

            model = tf.keras.models.Sequential(
                [
                    tf.keras.layers.Flatten(input_shape=(28, 28)),
                    tf.keras.layers.Dense(128, activation="relu"),
                    tf.keras.layers.Dropout(0.2),
                    tf.keras.layers.Dense(10),
                ]
            )
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

            tracker = EmissionsTracker()
            tracker.start()
            model.fit(x_train, y_train, epochs=3)
            emissions: float = tracker.stop()
            st.write(emissions)

    # NOTE(review): the download section is placed at page level, outside the
    # training button; the original nesting could not be recovered from the
    # pasted source. Confirm.
    st.title("Download Model Example")
    st.write("Click the button below to download the model.pkl file.")
    if st.button("Download"):
        download_file()