# Source: Hugging Face Space (status: Sleeping). File size: 7,175 bytes. Commits: e81cf7c, 46f8a9f.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
import shap
import xgboost as xgb
import gradio as gr
import matplotlib.pyplot as plt
import joblib
# Pre-trained classifiers shipped alongside this script.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load .pkl files from a trusted source.
Log_Reg = joblib.load('Log_Reg.pkl')
SVM = joblib.load('SVM.pkl')

# The XGBoost classifier is restored from its own model file.
XGB = xgb.XGBClassifier()
XGB.load_model('XGB.model')

# Load the dataset and discard rows missing any column used as a feature.
df = pd.read_csv('Superstore.csv').dropna(
    subset=["Region", "Category", "Sub-Category", "Quantity", "Discount"]
)
MEDIAN = 8.662  # from the exploratory analysis file
RANDOM_STATE = 42  # random seed to ensure results are reproducible

# Integer-encode the categorical columns: np.unique(..., return_inverse=True)
# returns, for every row, the index of its value in the sorted unique values.
region = np.unique(df['Region'], return_inverse=True)[1]
category = np.unique(df['Category'], return_inverse=True)[1]
subCategory = np.unique(df['Sub-Category'], return_inverse=True)[1]

# turn quantity, discount, and profit columns into vectors of numbers
quantity = df["Quantity"].to_numpy()
discount = df["Discount"].to_numpy()
profit = df["Profit"].to_numpy()

# generate feature vectors: one row per purchase, columns ordered as
# [region, category, sub-category, quantity, discount]. Whole columns are
# assigned at once instead of looping over rows in Python.
vectorizedDataset = np.empty((len(region), 5))
vectorizedDataset[:, 0] = region
vectorizedDataset[:, 1] = category
vectorizedDataset[:, 2] = subCategory
vectorizedDataset[:, 3] = quantity
vectorizedDataset[:, 4] = discount

# Binary target: 1.0 when profit is strictly above MEDIAN, otherwise 0.0
# (a NaN profit compares False and is therefore labelled 0.0).
labels = (profit > MEDIAN).astype(float)

# Hold out 30% of the rows; `test` is used later as SHAP background data.
train, test, trainLabels, testLabels = train_test_split(
    vectorizedDataset, labels, test_size=0.3, random_state=RANDOM_STATE
)
# Lookup tables translating user-facing class names into the integer codes fed
# to the models. Each name is numbered by its position in the ordered tuple.
# NOTE(review): presumably these mirror the codes np.unique assigns to the
# dataset columns - confirm against Superstore.csv.
region_label = {
    name: code
    for code, name in enumerate(('Central', 'East', 'South', 'West'))
}
category_label = {
    name: code
    for code, name in enumerate(('Furniture', 'Office Supplies', 'Technology'))
}
sub_category_label = {
    name: code
    for code, name in enumerate((
        'Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases',
        'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings',
        'Labels', 'Machines', 'Paper', 'Phones', 'Storage', 'Supplies',
        'Tables',
    ))
}

# Human-readable text for each predicted class.
profit_label = dict(enumerate(('Below Median Profit', 'Above Median Profit')))

# Column order of the feature vectors.
feature_names = ["Region", "Category", "Sub-Category", "Quantity", "Discount"]
def sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount):
    """Validate one purchase and encode its categorical fields.

    Args:
        Region: Region name; must be a key of ``region_label``.
        Category: Category name; must be a key of ``category_label``.
        Sub_Category: Sub-category name; must be a key of
            ``sub_category_label``.
        Quantity: Number of items; must be a whole number >= 1.
        Discount: Discount fraction; must lie in [0, 1].

    Returns:
        On success, a three-item list ``[region_code, category_code,
        sub_category_code]``. On failure, a two-item list
        ``[error_message, None]``; callers tell the cases apart by length.
    """
    try:
        Region = region_label[Region]
        Category = category_label[Category]
        Sub_Category = sub_category_label[Sub_Category]
    except KeyError:
        return ["Please provide region, category, and sub category from the pre-defined Superstore dataset classes", None]

    # An empty number field can arrive as None; comparing None with a number
    # would raise TypeError instead of producing a readable message.
    if Quantity is None or Discount is None:
        return ["Quantity and Discount must be provided", None]

    if Quantity < 1 or Discount < 0:
        return ["Quantity and Discount must be positive", None]

    # Accept whole numbers delivered as floats (e.g. 3.0), which a UI number
    # input may produce, while still rejecting fractional quantities.
    if not isinstance(Quantity, int) and not float(Quantity).is_integer():
        return ["Quantity must be an integer", None]

    if Discount > 1:
        return ["Discount cannot be greater than one", None]

    return [Region, Category, Sub_Category]
def XGB_predict(Region, Category, Sub_Category, Quantity, Discount):
    """Classify one purchase with the XGB model and plot its SHAP values.

    Args:
        Region, Category, Sub_Category: Class names, validated and encoded
            by ``sanitize_inputs``.
        Quantity, Discount: Numeric features, validated by
            ``sanitize_inputs`` and passed to the model unchanged.

    Returns:
        ``[prediction_text, 'shap_plot_XGB.png']`` on success, or the
        ``[error_message, None]`` list from ``sanitize_inputs`` when the
        inputs are invalid.
    """
    sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)
    if len(sanitized) == 2:  # two items means [error_message, None]
        return sanitized

    # Single-row feature matrix in the column order the model was given.
    features = np.array([[sanitized[0], sanitized[1], sanitized[2], Quantity, Discount]])
    predicted_class = XGB.predict(features)

    # Explain this prediction, using the held-out split as background data.
    explainer = shap.Explainer(XGB, test)
    shap_values = explainer(features)
    shap_values.feature_names = feature_names

    # Draw on a fresh figure and always close it, so bars from earlier
    # requests cannot bleed into this plot and figures do not pile up.
    plt.figure()
    try:
        shap.plots.bar(shap_values, show=False)
        plt.savefig('shap_plot_XGB.png')
    finally:
        plt.close()

    # int() makes the lookup independent of the dtype the model returns.
    return [profit_label[int(predicted_class[0])], 'shap_plot_XGB.png']
def SVM_predict(Region, Category, Sub_Category, Quantity, Discount):
    """Classify one purchase with the SVM model and plot its SHAP values.

    Args:
        Region, Category, Sub_Category: Class names, validated and encoded
            by ``sanitize_inputs``.
        Quantity, Discount: Numeric features, validated by
            ``sanitize_inputs`` and passed to the model unchanged.

    Returns:
        ``[prediction_text, 'shap_plot_SVM.png']`` on success, or the
        ``[error_message, None]`` list from ``sanitize_inputs`` when the
        inputs are invalid.
    """
    sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)
    if len(sanitized) == 2:  # two items means [error_message, None]
        return sanitized

    # Single-row feature matrix in the column order the model was given.
    features = np.array([[sanitized[0], sanitized[1], sanitized[2], Quantity, Discount]])
    predicted_class = SVM.predict(features)

    # Explain this prediction, using the held-out split as background data.
    explainer = shap.Explainer(SVM, test)
    shap_values = explainer(features)
    shap_values.feature_names = feature_names

    # Draw on a fresh figure and always close it, so bars from earlier
    # requests cannot bleed into this plot and figures do not pile up.
    plt.figure()
    try:
        shap.plots.bar(shap_values, show=False)
        plt.savefig('shap_plot_SVM.png')
    finally:
        plt.close()

    # int() makes the lookup independent of the dtype the model returns
    # (e.g. 1.0 when the model was fitted on float labels).
    return [profit_label[int(predicted_class[0])], 'shap_plot_SVM.png']
def Log_reg_predict(Region, Category, Sub_Category, Quantity, Discount):
    """Classify one purchase with the logistic regression model and plot its SHAP values.

    Args:
        Region, Category, Sub_Category: Class names, validated and encoded
            by ``sanitize_inputs``.
        Quantity, Discount: Numeric features, validated by
            ``sanitize_inputs`` and passed to the model unchanged.

    Returns:
        ``[prediction_text, 'shap_plot_LogReg.png']`` on success, or the
        ``[error_message, None]`` list from ``sanitize_inputs`` when the
        inputs are invalid.
    """
    sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)
    if len(sanitized) == 2:  # two items means [error_message, None]
        return sanitized

    # Single-row feature matrix in the column order the model was given.
    features = np.array([[sanitized[0], sanitized[1], sanitized[2], Quantity, Discount]])
    predicted_class = Log_Reg.predict(features)

    # Explain this prediction, using the held-out split as background data.
    explainer = shap.Explainer(Log_Reg, test)
    shap_values = explainer(features)
    shap_values.feature_names = feature_names

    # Draw on a fresh figure and always close it, so bars from earlier
    # requests cannot bleed into this plot and figures do not pile up.
    plt.figure()
    try:
        shap.plots.bar(shap_values, show=False)
        plt.savefig('shap_plot_LogReg.png')
    finally:
        plt.close()

    # int() makes the lookup independent of the dtype the model returns
    # (e.g. 1.0 when the model was fitted on float labels).
    return [profit_label[int(predicted_class[0])], 'shap_plot_LogReg.png']
# Text appended to every tab description listing the accepted class names.
_VALID_CLASSES_NOTE = (
    "\n\nValid regions: ['Central', 'East', 'South', 'West']"
    "\n\nValid product categories: ['Furniture', 'Office Supplies', 'Technology']"
    "\n\nValid product sub-categories: ['Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases', "
    "'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings', 'Labels', 'Machines', 'Paper', "
    "'Phones', 'Storage', 'Supplies', 'Tables']"
)


def _build_tab(predict_fn, model_name):
    """Return a gr.Interface wired to ``predict_fn`` and labelled with ``model_name``.

    Every tab takes three text inputs and two number inputs, and shows the
    predicted label together with the SHAP plot image.
    """
    return gr.Interface(
        fn=predict_fn,
        inputs=["text", "text", "text", "number", "number"],
        outputs=[
            gr.Label(label="Model Prediction"),
            gr.Image(label="Shapley Values"),
        ],
        title=model_name + " Profit Prediction",
        description=(
            "Create your own purchases and see if the " + model_name
            + " model predicts they will make above or below the median profit"
            + _VALID_CLASSES_NOTE
        ),
    )


# One tab per model, created in the same order as they are displayed.
LogReg_tab = _build_tab(Log_reg_predict, "Logistic Regression")
SVM_tab = _build_tab(SVM_predict, "SVM")
XGB_tab = _build_tab(XGB_predict, "XGB")

demo = gr.TabbedInterface(
    [LogReg_tab, SVM_tab, XGB_tab],
    tab_names=["Logistic Regression", "SVM", "XGB"],
    theme=gr.themes.Soft(),
)
demo.launch(debug=True)