File size: 7,175 Bytes
e81cf7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46f8a9f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
import shap
import xgboost as xgb
import gradio as gr
import matplotlib.pyplot as plt
import joblib

# Load the three pre-trained classifiers from disk.
# NOTE(review): joblib files are pickles and execute code on load -- only
# ship model files from a trusted source.
SVM = joblib.load('SVM.pkl')
Log_Reg = joblib.load('Log_Reg.pkl')
XGB = xgb.XGBClassifier()
XGB.load_model('XGB.model')


# Load the dataset and drop every row that lacks one of the model features.
df = pd.read_csv('Superstore.csv')
df.dropna(subset=["Region", "Category", "Sub-Category", "Quantity", "Discount"], inplace=True)
MEDIAN = 8.662 # from the exploratory analysis file
RANDOM_STATE = 42 # random seed to ensure results are reproducible

# Encode the categorical columns as integers: np.unique sorts the distinct
# values and return_inverse yields, per row, the index of its value in that
# sorted list.
region = np.unique(df['Region'], return_inverse=True)[1]
category = np.unique(df['Category'], return_inverse=True)[1]
subCategory = np.unique(df['Sub-Category'], return_inverse=True)[1]

# turn quantity, discount, and profit columns into vectors of numbers
quantity = df["Quantity"].to_numpy()
discount = df["Discount"].to_numpy()
profit = df["Profit"].to_numpy()

# Build the (n_rows, 5) feature matrix in one vectorized step instead of a
# per-row Python loop. Column order: region, category, sub-category,
# quantity, discount. The cast keeps the float dtype the loop-filled
# np.empty array had.
vectorizedDataset = np.column_stack(
  (region, category, subCategory, quantity, discount)
).astype(float)

# Binary target: 1.0 when the row's profit is above MEDIAN, otherwise 0.0
# (a missing profit compares False and is therefore labelled 0.0).
labels = (profit > MEDIAN).astype(float)

# Hold out 30% of the rows; the `test` split is what the predict functions
# pass to shap.Explainer as background data.
train, test, trainLabels, testLabels = train_test_split(vectorizedDataset, labels, test_size=0.3, random_state=RANDOM_STATE)

# Lookup tables that turn the categorical form inputs into integer codes.
# Each table lists its names alphabetically and numbers them from 0, so a
# name's position is its code.
# NOTE(review): presumably this mirrors the np.unique encoding applied to the
# dataset columns -- confirm the dataset contains exactly these values.
region_label = {
  name: code
  for code, name in enumerate(['Central', 'East', 'South', 'West'])
}
category_label = {
  name: code
  for code, name in enumerate(['Furniture', 'Office Supplies', 'Technology'])
}
sub_category_label = {
  name: code
  for code, name in enumerate([
    'Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases',
    'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings',
    'Labels', 'Machines', 'Paper', 'Phones', 'Storage', 'Supplies',
    'Tables',
  ])
}

# Human-readable text for the two predicted classes (0 and 1).
profit_label = dict(enumerate(['Below Median Profit', 'Above Median Profit']))

# Feature names in the column order of the feature vectors.
feature_names = "Region Category Sub-Category Quantity Discount".split()

def sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount):
  """Validate one purchase and encode its categorical fields.

  Args:
    Region, Category, Sub_Category: names that must be keys of
      region_label, category_label and sub_category_label respectively.
    Quantity: whole number >= 1 (an integral float such as 3.0 is accepted).
    Discount: number in the range [0, 1].

  Returns:
    On success, a 3-element list [region_code, category_code,
    sub_category_code]. On failure, a 2-element list [error_message, None].
    Callers tell the two cases apart by the list length.
  """
  try:
    encoded = [
      region_label[Region],
      category_label[Category],
      sub_category_label[Sub_Category],
    ]
  except (KeyError, TypeError):
    # TypeError covers unhashable values, which cannot be dictionary keys.
    return ["Please provide region, category, and sub category from the pre-defined Superstore dataset classes", None]

  # An empty number field arrives as None; reject it (and any other
  # non-numeric or NaN value) here instead of crashing in the comparisons
  # below. `value != value` is only true for NaN.
  for value in (Quantity, Discount):
    if not isinstance(value, (int, float)) or value != value:
      return ["Quantity and Discount must be numbers", None]

  if Quantity < 1 or Discount < 0:
    return ["Quantity and Discount must be positive", None]

  # Number inputs may be delivered as floats, so accept 3.0 but not 3.5.
  if isinstance(Quantity, float) and not Quantity.is_integer():
    return ["Quantity must be an integer", None]

  if Discount > 1:
    return ["Discount cannot be greater than one", None]

  return encoded

def XGB_predict(Region, Category, Sub_Category, Quantity, Discount):
  """Classify one purchase with the XGBoost model and plot its SHAP values.

  Returns:
    [prediction_text, image_path] on success, where image_path is the saved
    SHAP bar plot; or the [error_message, None] list from sanitize_inputs
    when validation fails.
  """
  sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)

  # A 2-element result is sanitize_inputs' error form; pass it straight on.
  if len(sanitized) == 2:
    return sanitized

  # Single-row feature matrix in the training column order.
  features = np.array([[sanitized[0], sanitized[1], sanitized[2], Quantity, Discount]])
  predicted_class = XGB.predict(features)

  # The held-out split serves as background data for the explainer.
  explainer = shap.Explainer(XGB, test)
  shap_values = explainer(features)
  shap_values.feature_names = list(feature_names)

  # With show=False shap draws on the current pyplot figure and leaves it
  # open. Use a dedicated figure and always close it, so later requests do
  # not draw over this plot and figures do not pile up in memory.
  image_path = 'shap_plot_XGB.png'
  fig = plt.figure()
  try:
    shap.plots.bar(shap_values, show=False)
    plt.savefig(image_path)
  finally:
    plt.close(fig)

  return [profit_label[predicted_class[0]], image_path]

def SVM_predict(Region, Category, Sub_Category, Quantity, Discount):
  """Classify one purchase with the SVM model and plot its SHAP values.

  Returns:
    [prediction_text, image_path] on success, where image_path is the saved
    SHAP bar plot; or the [error_message, None] list from sanitize_inputs
    when validation fails.
  """
  sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)

  # A 2-element result is sanitize_inputs' error form; pass it straight on.
  if len(sanitized) == 2:
    return sanitized

  # Single-row feature matrix in the training column order.
  features = np.array([[sanitized[0], sanitized[1], sanitized[2], Quantity, Discount]])
  predicted_class = SVM.predict(features)

  # The held-out split serves as background data for the explainer.
  explainer = shap.Explainer(SVM, test)
  shap_values = explainer(features)
  shap_values.feature_names = list(feature_names)

  # With show=False shap draws on the current pyplot figure and leaves it
  # open. Use a dedicated figure and always close it, so later requests do
  # not draw over this plot and figures do not pile up in memory.
  image_path = 'shap_plot_SVM.png'
  fig = plt.figure()
  try:
    shap.plots.bar(shap_values, show=False)
    plt.savefig(image_path)
  finally:
    plt.close(fig)

  return [profit_label[predicted_class[0]], image_path]

def Log_reg_predict(Region, Category, Sub_Category, Quantity, Discount):
  """Classify one purchase with the logistic regression model and plot its SHAP values.

  Returns:
    [prediction_text, image_path] on success, where image_path is the saved
    SHAP bar plot; or the [error_message, None] list from sanitize_inputs
    when validation fails.
  """
  sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)

  # A 2-element result is sanitize_inputs' error form; pass it straight on.
  if len(sanitized) == 2:
    return sanitized

  # Single-row feature matrix in the training column order.
  features = np.array([[sanitized[0], sanitized[1], sanitized[2], Quantity, Discount]])
  predicted_class = Log_Reg.predict(features)

  # The held-out split serves as background data for the explainer.
  explainer = shap.Explainer(Log_Reg, test)
  shap_values = explainer(features)
  shap_values.feature_names = list(feature_names)

  # With show=False shap draws on the current pyplot figure and leaves it
  # open. Use a dedicated figure and always close it, so later requests do
  # not draw over this plot and figures do not pile up in memory.
  image_path = 'shap_plot_LogReg.png'
  fig = plt.figure()
  try:
    shap.plots.bar(shap_values, show=False)
    plt.savefig(image_path)
  finally:
    plt.close(fig)

  return [profit_label[predicted_class[0]], image_path]



# Text appended to every tab description: the accepted values for the three
# free-text inputs.
_VALID_VALUES_NOTE = (
    "\n\nValid regions: ['Central', 'East', 'South', 'West']"
    "\n\nValid product categories: ['Furniture', 'Office Supplies', 'Technology']"
    "\n\nValid product sub-categories: ['Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases', 'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings', 'Labels', 'Machines', 'Paper', 'Phones', 'Storage', 'Supplies', 'Tables']"
)


def _build_tab(predict_fn, model_name):
    """Create the Gradio interface for one model.

    predict_fn receives the five form values (three text, two number) and
    supplies the label and image outputs; model_name is inserted into the
    tab's title and description.
    """
    intro = (
        "Create your own purchases and see if the " + model_name
        + " model predicts they will make above or below the median profit"
    )
    return gr.Interface(
        fn=predict_fn,
        inputs=["text", "text", "text", "number", "number"],
        outputs=[
            gr.Label(label="Model Prediction"),
            gr.Image(label="Shapley Values"),
        ],
        title=model_name + " Profit Prediction",
        description=intro + _VALID_VALUES_NOTE,
    )


# One interface per model, created in the order the tabs are displayed.
LogReg_tab = _build_tab(Log_reg_predict, "Logistic Regression")
SVM_tab = _build_tab(SVM_predict, "SVM")
XGB_tab = _build_tab(XGB_predict, "XGB")

demo = gr.TabbedInterface(
    [LogReg_tab, SVM_tab, XGB_tab],
    tab_names=["Logistic Regression", "SVM", "XGB"],
    theme=gr.themes.Soft(),
)

# Start the web server as soon as the script runs.
demo.launch(debug=True)