import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import random
import time
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from shapash.explainer.smart_explainer import SmartExplainer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import f1_score, r2_score
from mlflow import log_param, log_metric, end_run, start_run
import pickle
import base64
import requests
import tensorflow as tf
from codecarbon import EmissionsTracker
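# Sidebar navigation: the selected entry determines which page section below is rendered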
app_mode = st.sidebar.selectbox('Select page:',['01 Introduction','02 Visualization', '03 Prediction','04 Explainable AI','05 MLFlow Tracking','06 Conclusion & Insights'])
if app_mode == '01 Introduction':
image_movie = Image.open('Title.jpeg')
st.image(image_movie, width=400)
st.title("What are the Key Influencers of :orange[_Obesity_] ? ")
st.subheader("A predictive analysis for health recommendation purposes", divider='rainbow')
# app_mode = st.sidebar.selectbox('Select Page',['Introduction'])
st.markdown('##### WHY THIS TOPIC❓')
st.markdown('Obesity is a global health problem with serious physical and mental consequences.')
st.markdown('The prevalence of obesity is increasing steadily; this project therefore examines the factors that influence obesity and predicts its occurrence based on those factors.')
st.markdown("##### OUR GOAL 🎯 ")
st.markdown("With our project, we seek to identify the highest influential factor on individual’s obesity levels for health recommendation purposes. ")
st.markdown("The main objectives of evaluation is focusing on the personal habits, family history, eating habits and physical activity frequency. ")
st.markdown("Since our research are conducting with health recommendation purposes, we will avoid analyzing on the individuals' traits such as age, height, gender but rather focuses on personal behaviors that could be changed. ")
st.markdown("#### BIASES 🧐")
st.markdown("BIASE #1: Since we are not focusing on the individuals' traits such as age, height and gender, it might have a impact on our final prediction results. Because: they are still key factors relevant to obesity levels.")
st.markdown("BIASE #2: This study only collects data from three countries which limits our ability to apply the final prediction results on a global perspective.")
st.markdown('##### OUR DATA 📊')
st.markdown("This dataset include data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia")
st.markdown('Our data contains 17 attributes and 2111 records')
st.markdown("As described by the dataset provider, 77% of the data was generated synthetically using the Weka tool and the SMOTE filter, 23% of the data was collected directly from users through a web platform.")
st.markdown("##### Explaination of KEY VARIABLES abbreviations 📓")
st.markdown("- PERSONAL HABITS ")
st.markdown("CH2O: How much water do you drink daily?")
st.markdown("SMOKE: Do you smoke? ")
st.markdown("TECH: How much time do you use technological devices?")
st.markdown("CALC: How often do you drink alcohol?")
st.markdown("MTRANS: Which transportation do you usually use?")
st.markdown("SCC: Do you monitor the calories you eat daily")
st.markdown("- EATING HABITS")
st.markdown("FAVC : Do you eat high caloric food frequently?")
st.markdown("FCVC : Do you usually eat vegetables in your meals?")
st.markdown("NCP: 'How many main meals do you have daily?")
st.markdown("CAEC: DO you eat any food between meals?")
st.markdown('- PHYSICAL ACTIVITY')
st.markdown("FAF: How often do you have physical activity?")
st.markdown('- FAMILY HISTORY')
st.markdown("FHWO: Family history with overweight")
st.markdown("OL: Obesity Level")
st.markdown("### Description of Data")
df = pd.read_csv("FINAL PROJECT.csv")
st.dataframe(df.describe())
st.markdown("🔍 Observation: Based on the description of Data shown above, we can get a better understanding about the individuals' information collected in the dataset.")
st.markdown("The mean age for the participants is about 24 years old, the mean height of the participants is about 1.7 metres, and the mean weight of the participants is 86.59 kg.")
st.markdown("### Missing Values")
st.markdown("Null or NaN values.")
dfnull = df.isnull().sum()/len(df)*100
totalmiss = dfnull.sum().round(2)
st.write("Percentage of total missing values:",totalmiss)
st.write(dfnull)
if totalmiss == 0.0:
st.success("✅ We do not exprience any missing values which is the ideal outcome of our data. We can proceed with higher accuracy in our further prediction.")
else:
st.warning("Poor data quality due to greater than 30 percent of missing value.")
st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")
st.markdown("### Completeness")
st.markdown(" The ratio of non-missing values to total records in dataset and how comprehensive the data is.")
st.write("Total data length:", len(df))
nonmissing = df.notnull().sum()
completeness = round(nonmissing.sum()/df.size, 2)
st.write("Completeness ratio:", completeness)
st.write(nonmissing)
if completeness >= 0.80:
st.success("✅ The completeness ratio is at least 0.80, which is good. It shows that the vast majority of the data is available for us to use and analyze. ")
else:
st.warning("Poor data quality due to a low completeness ratio (less than 0.80).")
elif app_mode == '02 Visualization':
df=pd.read_csv("FINAL PROJECT.csv")
variables = st.sidebar.radio("Pick the variable", ["PERSONAL HABITS","EATING HABITS","PHYSICAL ACTIVITY","FAMILY HISTORY"])
if variables == "PERSONAL HABITS":
st.header("Personal Habits")
st.subheader("SMOKE: Do you smoke?")
crosstab = pd.crosstab(df['SMOKE'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
st.subheader("CALC: How often do you drink alcohol?")
crosstab = pd.crosstab(df['CALC'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
st.subheader("SCC: Do you monitor the calories you eat daily")
crosstab = pd.crosstab(df['SCC'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
st.subheader("MTRANS: Which transportation do you usually use?")
crosstab = pd.crosstab(df['MTRANS'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
elif variables == 'EATING HABITS':
st.header("Eating Habits")
st.subheader("CAEC: DO you eat any food between meals?")
crosstab = pd.crosstab(df['CAEC'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
st.subheader("FAVC : Do you eat high caloric food frequently??")
crosstab = pd.crosstab(df['FAVC'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
st.subheader("FCVC : Do you usually eat vegetables in your meals?")
df_ot = df[df["OL"] == 'Obesity_Type_I' ]
df_ot2 = df[df["OL"] == 'Obesity_Type_II']
df_ot3 = df[df["OL"] == 'Obesity_Type_III']
df_ot_final = pd.concat([df_ot,df_ot2,df_ot3]) # data frame of Obesity_Type I, II, III
df_ot_final.reset_index(drop=True, inplace = True)
df_ow = df[df["OL"]=='Overweight_Level_I']
df_ow2 = df[df["OL"]=='Overweight_Level_II']
df_ow_final = pd.concat([df_ow,df_ow2]) # data frame of Overweight_Level I, II
df_ow_final.reset_index(drop=True, inplace = True)
df_n = df[df["OL"]=='Normal_Weight']
df_In = df[df["OL"]=='Insufficient_Weight']
data_list = [df_ot_final, df_ow_final, df_n, df_In]
data_name = ["obesity_type", "over_weight_type", "normal", "Insufficient_Weight"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
for i in range(2):
sns.kdeplot(ax=axes[i,0], data=data_list[i], x="FCVC", hue="OL", fill=True)
axes[i, 0].set_title(f'{data_name[i]} vs FCVC')
sns.kdeplot(ax=axes[i,1], data=data_list[i+2], x="FCVC", hue="OL", fill=True)
axes[i, 1].set_title(f'{data_name[i+2]} vs FCVC')
fig.suptitle('Obesity_levels vs FCVC')
plt.tight_layout()
st.pyplot(fig)
elif variables == 'PHYSICAL ACTIVITY':
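# Same obesity-level grouping as in the EATING HABITS view, reused here for the FAF KDE plots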
df_ot = df[df["OL"] == 'Obesity_Type_I' ]
df_ot2 = df[df["OL"] == 'Obesity_Type_II']
df_ot3 = df[df["OL"] == 'Obesity_Type_III']
df_ot_final = pd.concat([df_ot,df_ot2,df_ot3]) # data frame of Obesity_Type I, II, III
df_ot_final.reset_index(drop=True, inplace = True)
df_ow = df[df["OL"]=='Overweight_Level_I']
df_ow2 = df[df["OL"]=='Overweight_Level_II']
df_ow_final = pd.concat([df_ow,df_ow2]) # data frame of Overweight_Level I, II
df_ow_final.reset_index(drop=True, inplace = True)
df_n = df[df["OL"]=='Normal_Weight']
df_In = df[df["OL"]=='Insufficient_Weight']
st.header('Physical Activity')
st.subheader("FAF: How often do you have physical activity?")
data_list = [df_ot_final, df_ow_final, df_n, df_In]
data_name = ["obesity_type", "over_weight_type", "normal", "Insufficient_Weight"]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
for i in range(2):
sns.kdeplot(ax=axes[i,0], data=data_list[i], x="FAF", hue="OL", fill=True)
axes[i, 0].set_title(f'{data_name[i]} vs FAF')
sns.kdeplot(ax=axes[i,1], data=data_list[i+2], x="FAF", hue="OL", fill=True)
axes[i, 1].set_title(f'{data_name[i+2]} vs FAF')
fig.suptitle('Obesity_levels vs FAF')
plt.tight_layout()
st.pyplot(fig)
elif variables == 'FAMILY HISTORY':
st.header('Family History')
st.subheader("FHWO: Family history with overweight")
crosstab = pd.crosstab(df['FHWO'], df['OL'])
fig, ax = plt.subplots()
crosstab.plot(kind='bar',width=0.8, ax=ax)
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
st.pyplot(fig)
if app_mode == '03 Prediction':
image_2 = Image.open('image2.png')
st.image(image_2, width=300)
#Data Preprocessing
df = pd.read_csv("FINAL PROJECT.csv")
X = df.drop('OL', axis=1)
y = df['OL']
# Convert categorical columns using get_dummies (one-hot encoding)
X = pd.get_dummies(X)
#train test split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=0.2, random_state=42)
#normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model_choice = st.sidebar.selectbox('Select to see:', ['KNN', 'Random Forest','Comparison Analysis'])
if model_choice == 'KNN':
##The KNN Model
knn = KNeighborsClassifier (n_neighbors = 3)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
st.title("Prediction - k-nearest neighbors Model:")
st.write(f"Model Accuracy: {accuracy*100:.2f}%")
##select box --
option = st.selectbox(
'What would you like to see❓',
('Confusion Matrix 📈', 'Predicted Results with Classification Report📑')
)
if option == 'Confusion Matrix 📈':
##KNN Confusion Matrix
conf_matrix = confusion_matrix (y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d")
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix: KNN Model Prediction')
st.pyplot(plt)
st.markdown("#### The Labels from 0-6 indicates that:")
st.markdown("0 = Insufficient Weight 1 = Normal Weight")
st.markdown("2 = Obesity Type I 3 = Obesity Type II 4 = Obesity Type III")
st.markdown("5 = Overweight Level I 6 = Overwight Level II")
st.markdown("#### Explaination of the graph:")
st.markdown(" The numbers in the matrix represent the counts of instances for each true-predicted label pair.")
st.markdown(" The DIAGONAL from the top-left to the bottom-right shows the number of CORRECT predictions for each class.")
st.markdown(" The NON-DIAGONAL numbers indicate the instances where the model made ERRORS, showing how many it made in this Model")
elif option == 'Predicted Results with Classification Report📑':
#KNN Results
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results.reset_index(drop=True, inplace=True)
st.dataframe(results.head(10))
#Report
report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
st.text('Classification Report:')
st.table(report)
elif model_choice == 'Random Forest':
st.title("Prediction - Random Forest Classifier Model :")
##The Random Forest Classifier Model
df = pd.read_csv('FINAL PROJECT.csv')
# Identifying categorical columns (excluding the target variable 'OL')
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_columns.remove('OL')
# Encoding
preprocessor = ColumnTransformer(transformers=[
('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
], remainder='passthrough')
# Pipeline with preprocessing and RandomForestClassifier
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
y = df['OL']
# Split
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['OL']), y, test_size=0.2, random_state=42)
# Fit the model on the training data
pipeline.fit(X_train, y_train)
# Predict on the test set
y_pred = pipeline.predict(X_test)
# Calculate the accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
st.write(f"Model Accuracy: {accuracy*100:.2f}%")
option = st.selectbox(
'What would you like to see❓',
('Confusion Matrix 📈', 'Predicted Results with Classification Report📑')
)
if option == 'Confusion Matrix 📈':
plt.figure(figsize=(10, 7))
conf_matrix = confusion_matrix (y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=pipeline.named_steps['classifier'].classes_, yticklabels=pipeline.named_steps['classifier'].classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
st.pyplot(plt)
elif option == 'Predicted Results with Classification Report📑':
#RandomForest Results
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results.reset_index(drop=True, inplace=True)
st.dataframe(results.head(10))
#Report
report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()
st.text('Classification Report:')
st.table(report)
elif model_choice == 'Comparison Analysis':
st.title('Comparison ⚖️')
st.markdown('#### Confusion Matrices:')
st.markdown ("KNN Model: High Accuracy for 'Obesity Type III'(Label 4) and 'Overweight Level II'(Label 6)")
st.markdown ("Struggles with 'Normal weight' and 'Overweight Level I'")
st.markdown ("RandomForestClassifier: Shows fewer misclassifications overall. Significantly in distinguishing 'Normal Weight and 'Overweight Level I'")
st.markdown ('#### Classification Reports:')
st.markdown ("KNN: The recall for 'Normal Weight' is particularly low (0.4677), indicating many instances of this class were misclassified.")
st.markdown ("But the precision is high for Obesity prediction, especially 'Obesity Type III' with a perfect recall.")
st.markdown ("RandomForest Classifier: the recall for 'Normal Weight' is much improved to 0.9355")
st.markdown ("#### OVERALL")
st.markdown ("Accuracy: RandomForest Classifier increased from 0.8109 to 0.9433.")
st.markdown ("F1-Score: The harmonic mean of precision and recall is higher in the RandomForest model, suggesting a better balance.")
if app_mode == '04 Explainable AI':
st.title('Explainable AI: Shapash')
df = pd.read_csv('FINAL PROJECT.csv')
# Encoding categorical variables
label_encoder = LabelEncoder()
categorical_columns = ['Gender', 'CALC', 'FAVC', 'SCC', 'SMOKE', 'FHWO', 'CAEC', 'MTRANS', 'OL']
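# The same LabelEncoder instance is re-fit on each column, so every categorical column gets its own integer codes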
for col in categorical_columns:
df[col] = label_encoder.fit_transform(df[col])
X = df[['CALC', 'FAVC', 'FCVC', 'NCP', 'SCC', 'SMOKE', 'CH2O', 'FHWO', 'FAF', 'TECH', 'CAEC', 'MTRANS']]
y = df['OL']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Model training
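# The label-encoded obesity level is used as a numeric target, so a RandomForestRegressor drives the Shapash explanations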
model = RandomForestRegressor(max_depth=5, random_state=42, n_estimators=12)
model.fit(X_train, y_train)
option = st.selectbox(
'What would you like to see❓',
('Feature Importance', 'Feature Contribution', 'Local Explanation')
)
if option == 'Feature Importance':
# Make predictions and format them as a DataFrame
y_pred = pd.DataFrame(model.predict(X_test), columns=['pred'], index=X_test.index)
xpl = SmartExplainer(model=model) # Pass the model correctly
xpl.compile(x=X_test, y_pred=y_pred) # Use the correctly formatted predictions
fig = xpl.plot.features_importance()
st.write(fig)
if option == 'Feature Contribution':
feature_list = X_test.columns.tolist()
selected_feature = st.selectbox('Select a feature for the contribution plot:', feature_list)
y_pred = pd.DataFrame(model.predict(X_test), columns=['pred'], index=X_test.index)
xpl = SmartExplainer(model=model) # Pass the model correctly
xpl.compile(x=X_test, y_pred=y_pred) # Use the correctly formatted predictions
fig = xpl.plot.contribution_plot(selected_feature)
st.write(fig)
if option == 'Local Explanation':
y_pred = pd.DataFrame(model.predict(X_test), columns=['pred'], index=X_test.index)
xpl = SmartExplainer(model=model) # Pass the model correctly
xpl.compile(x=X_test, y_pred=y_pred) # Use the correctly formatted predictions
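# Show the local feature contributions for one randomly chosen test record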
fig = xpl.plot.local_plot(index=random.choice(X_test.index))
st.write(fig)
if app_mode == '05 MLFlow Tracking':
def load_data():
df = pd.read_csv('FINAL PROJECT.csv')
df['target'] = df['OL']
return df
# Preprocessing function to encode categorical variables
def preprocess_features(df, feature_choices):
categorical_features = df[feature_choices].select_dtypes(include=['object']).columns.tolist()
numeric_features = df[feature_choices].select_dtypes(exclude=['object']).columns.tolist()
# Create transformers for numeric and categorical data
numeric_transformer = 'passthrough' # No transformation needed for numeric data
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # OneHot encode categorical data
# Create a column transformer to apply transformations
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
return preprocessor
# Available models and their problem types
MODELS = {
"classification": {
"KNN": KNeighborsClassifier,
"SVM": SVC,
"Random Forest": RandomForestClassifier
}
}
st.title("Model Experimentation with MLflow 🚀")
# User selects the task type
task_type = st.selectbox("Select the task type:", ["classification"])
# Load data
df = load_data()
st.write(df)
# Model and feature selection
model_options = list(MODELS[task_type].keys())
model_choice = st.selectbox("Choose a model ⚙️", model_options)
feature_options = df.columns.drop('target').tolist() # Adjust 'target' as necessary
feature_choice = st.multiselect("Choose some features", feature_options)
target_choice = st.selectbox("Select target column", df.columns)
# Preprocess selected features
preprocessor = preprocess_features(df, feature_choice)
# MLflow tracking
track_with_mlflow = st.checkbox("Track with MLflow?")
# Model training and evaluation
if st.button("Start training"):
if track_with_mlflow:
if mlflow.active_run():
mlflow.end_run() # Ensure any previous run is closed
mlflow.set_experiment("Obesity_Prediction")
with mlflow.start_run():
if track_with_mlflow:
mlflow.log_param('model', model_choice)
mlflow.log_param('features', feature_choice)
mlflow.log_param('task', task_type)
# Create a pipeline with preprocessing and model
model = MODELS[task_type][model_choice]()
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
X = df[feature_choice]
y = df[target_choice]
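# Note: no fixed random_state here, so each training run uses a different train/test split and metrics will vary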
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pipeline.fit(X_train, y_train)
# Evaluate the model
preds_train = pipeline.predict(X_train)
preds_test = pipeline.predict(X_test)
if task_type == "classification":
metric_train = f1_score(y_train, preds_train, average='micro')
metric_test = f1_score(y_test, preds_test, average='micro')
else:
metric_train = r2_score(y_train, preds_train)
metric_test = r2_score(y_test, preds_test)
st.write("metric_train", round(metric_train, 3))
st.write("metric_test", round(metric_test, 3))
if track_with_mlflow:
mlflow.sklearn.log_model(pipeline, "model")
mlflow.log_metric("metric_train", metric_train)
mlflow.log_metric("metric_test", metric_test)
# mlflow.end_run()
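# Persist the fitted pipeline to disk so it can be downloaded from the UI below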
with open('model.pkl', 'wb') as file:
pickle.dump(pipeline, file)
# mnist = tf.keras.datasets.mnist
# (x_train, y_train), (x_test, y_test) = mnist.load_data()
# x_train, x_test = x_train / 255.0, x_test / 255.0
# model = tf.keras.models.Sequential(
# [
# tf.keras.layers.Flatten(input_shape=(28, 28)),
# tf.keras.layers.Dense(128, activation="relu"),
# tf.keras.layers.Dropout(0.2),
# tf.keras.layers.Dense(10),
# ]
# )
# loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
# tracker = EmissionsTracker()
# tracker.start()
# model.fit(x_train, y_train, epochs=3)
# emissions: float = tracker.stop()
# st.write(emissions)
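# Encode the pickled pipeline as base64 and expose it as an HTML download link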
def download_file():
file_path = 'model.pkl' # Replace with the actual path to your model.pkl file
with open(file_path, 'rb') as file:
contents = file.read()
b64 = base64.b64encode(contents).decode()
href = f'<a href="data:file/pkl;base64,{b64}" download="model.pkl">Download model.pkl file</a>'
st.markdown(href, unsafe_allow_html=True)
st.title("Download Model Example")
st.write("Click the button below to download the model.pkl file.")
if st.button("Download"):
download_file()
if app_mode == '06 Conclusion & Insights':
LastImage = Image.open('conclusion.jpeg')
st.image(LastImage, width=400)
st.title("Conclusion and Insights 🔬")
st.markdown('##### _Health advisories based on our data observations_ 📝')
st.markdown("'Based on our overall data analysis, there is a strong relationship between individuals' eating behavior and the obesity occurences")
st.markdown("Especially for revolving around the question of 'Do you eat any food between meals?'(CAEC), we can see individuals who has a high frequency and tendency to eat food between meals have a higher possiblity for obesity.")
st.markdown('Understanding the importance of eating behavior, we want to suggest a regulated eating schedule with correct food consumption frequencies to reduce health risks related with obesity 🥗🥩 ')
st.markdown('Alongside with eating behaviors, physical activity also have a strong influence: we can see a decreased physical activity level as the obesity intensity increases.')
st.markdown('Therefore, another health advisory would be to get active. It is crucial for individuals to improve the frequency of physical activity more importantly than its intensity. The key is to get in the habit of moving 👟')
st.markdown('As we focus on the personal behaviors,unlike our assumptions, actually does not have a strong influence on the likelihood of obesity.')
st.markdown('Although some behaviors like smoking or alcohol consumption have a impact on health, it does not directly impact obesity levels. ')
st.markdown('##### _Improvements?_ 🔜')
st.markdown('1. Increase Geographic Diversity: Collaborate with international health organizations to gather obesity-related data from other regions.')
st.markdown('2. Enhance direct user engagement: to make our model more reflective of real-world scenarios, we should increase the amount of real user data collected, for example by offering incentives for users to provide data or by partnering with health clinics and hospitals to collect more comprehensive datasets.')
st.markdown('3. Collect additional demographic data: age, height, and gender') |