John Guerrerio committed on
Commit
e81cf7c
1 Parent(s): aefbc5b

first try at deployment

Browse files
Files changed (6) hide show
  1. Log_Reg.pkl +0 -0
  2. SVM.pkl +0 -0
  3. Superstore.csv +0 -0
  4. XGB.model +0 -0
  5. app.py +174 -0
  6. requirements.txt +8 -0
Log_Reg.pkl ADDED
Binary file (1.14 kB). View file
 
SVM.pkl ADDED
Binary file (1.15 kB). View file
 
Superstore.csv ADDED
The diff for this file is too large to render. See raw diff
 
XGB.model ADDED
Binary file (125 kB). View file
 
app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split, GridSearchCV
4
+ from sklearn.linear_model import SGDClassifier
5
+ from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
6
+ import shap
7
+ import xgboost as xgb
8
+ import gradio as gr
9
+ import matplotlib.pyplot as plt
10
+ import joblib
11
+
12
# Restore the three pre-trained classifiers that ship alongside this script.
# The XGBoost model is read back through XGBoost's own load_model() into a
# fresh estimator object; the other two are deserialized with joblib.
XGB = xgb.XGBClassifier()
XGB.load_model('XGB.model')

# NOTE(review): joblib.load unpickles arbitrary objects -- only safe because
# these files are bundled with the app rather than supplied by users.
Log_Reg = joblib.load('Log_Reg.pkl')
SVM = joblib.load('SVM.pkl')
16
+
17
+
18
# Load the Superstore rows and discard any row missing one of the five
# feature columns. "Profit" is not part of the subset, so a row with a
# missing profit is kept and ends up labelled 0 (NaN > MEDIAN is False).
df = pd.read_csv('Superstore.csv')
df.dropna(subset=["Region", "Category", "Sub-Category", "Quantity", "Discount"], inplace=True)
MEDIAN = 8.662  # from the exploratory analysis file
RANDOM_STATE = 42  # random seed to ensure results are reproducible

# Encode each categorical column as the index of its value within the sorted
# unique values of that column (the second element returned by np.unique).
region = np.unique(df['Region'], return_inverse=True)[1]
category = np.unique(df['Category'], return_inverse=True)[1]
subCategory = np.unique(df['Sub-Category'], return_inverse=True)[1]

# turn quantity, discount, and profit columns into vectors of numbers
quantity = df["Quantity"].to_numpy()
discount = df["Discount"].to_numpy()
profit = df["Profit"].to_numpy()

# One row per purchase, columns in the order
# [region, category, sub-category, quantity, discount]. Stacking the column
# vectors replaces the per-row Python loop; the explicit float64 cast keeps
# the dtype of the previously pre-allocated np.empty array.
vectorizedDataset = np.column_stack(
    (region, category, subCategory, quantity, discount)
).astype(np.float64)

# Binary target: 1.0 when the profit is strictly above the median, else 0.0.
labels = (profit > MEDIAN).astype(np.float64)

# Only `test` is used further down (as SHAP background data); the split is
# seeded so it is the same on every start-up.
train, test, trainLabels, testLabels = train_test_split(
    vectorizedDataset, labels, test_size=0.3, random_state=RANDOM_STATE
)
52
+
53
# Lookup tables translating the user-facing class names into the integer
# codes fed to the models. Names are listed alphabetically and numbered from
# zero in that order.
# NOTE(review): presumably these codes must agree with the np.unique-based
# encoding of the dataset columns -- confirm if the CSV ever changes.
region_label = {
    name: code
    for code, name in enumerate(('Central', 'East', 'South', 'West'))
}
category_label = {
    name: code
    for code, name in enumerate(('Furniture', 'Office Supplies', 'Technology'))
}
sub_category_label = {
    name: code
    for code, name in enumerate((
        'Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases',
        'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings',
        'Labels', 'Machines', 'Paper', 'Phones', 'Storage', 'Supplies',
        'Tables',
    ))
}

# Human-readable text for each predicted class (0 or 1).
profit_label = dict(enumerate(('Below Median Profit', 'Above Median Profit')))

# Column order of every feature vector built in this file.
feature_names = ["Region", "Category", "Sub-Category", "Quantity", "Discount"]
61
+
62
def sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount):
    """Validate one purchase and encode its categorical fields.

    Args:
        Region: Region name; must be a key of ``region_label``.
        Category: Category name; must be a key of ``category_label``.
        Sub_Category: Sub-category name; must be a key of
            ``sub_category_label``.
        Quantity: Number of items; a whole number >= 1.
        Discount: Discount fraction in the closed range [0, 1].

    Returns:
        On success, the three-element list
        ``[region_code, category_code, sub_category_code]``.
        On failure, the two-element list ``[error_message, None]``.
        Callers tell the two apart by the list length.
    """
    try:
        region_code = region_label[Region]
        category_code = category_label[Category]
        sub_category_code = sub_category_label[Sub_Category]
    except KeyError:
        return ["Please provide region, category, and sub category from the pre-defined Superstore dataset classes", None]

    # A number field that was left blank reaches us as None; comparing None
    # with a number would raise TypeError instead of reporting the problem.
    if Quantity is None or Discount is None:
        return ["Quantity and Discount must be provided", None]

    if Quantity < 1 or Discount < 0:
        return ["Quantity and Discount must be positive", None]

    # Accept whole numbers delivered as floats (e.g. 3.0) as well as ints;
    # only genuinely fractional quantities are rejected.
    if not isinstance(Quantity, int) and not float(Quantity).is_integer():
        return ["Quantity must be an integer", None]

    if Discount > 1:
        return ["Discount cannot be greater than one", None]

    return [region_code, category_code, sub_category_code]
80
+
81
def XGB_predict(Region, Category, Sub_Category, Quantity, Discount):
    """Classify one purchase with the XGB model and plot its SHAP values.

    Args:
        Region, Category, Sub_Category: Class names, validated and encoded
            by ``sanitize_inputs``.
        Quantity: Number of items purchased.
        Discount: Discount fraction applied to the purchase.

    Returns:
        ``[prediction_text, image_path]`` on success, where ``image_path``
        is the PNG written by this call, or the ``[error_message, None]``
        list produced by ``sanitize_inputs`` when validation fails.
    """
    sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)

    # A two-element result is the validation-error form; pass it through.
    if len(sanitized) == 2:
        return sanitized

    region_code, category_code, sub_category_code = sanitized
    features = np.array([[region_code, category_code, sub_category_code, Quantity, Discount]])
    predicted_class = XGB.predict(features)

    # The held-out split serves as background data for the explainer.
    explainer = shap.Explainer(XGB, test)
    shap_values = explainer(features)
    shap_values.feature_names = ["Region", "Category", "Sub-Category", "Quantity", "Discount"]

    # savefig does not close the figure, so without an explicit fresh figure
    # and close() every request would draw onto the previous request's plot
    # and leave figures accumulating in memory.
    plt.figure()
    try:
        shap.plots.bar(shap_values, show=False)
        plt.savefig('shap_plot_XGB.png')
    finally:
        plt.close()

    return [profit_label[int(predicted_class[0])], 'shap_plot_XGB.png']
98
+
99
def SVM_predict(Region, Category, Sub_Category, Quantity, Discount):
    """Classify one purchase with the SVM model and plot its SHAP values.

    Args:
        Region, Category, Sub_Category: Class names, validated and encoded
            by ``sanitize_inputs``.
        Quantity: Number of items purchased.
        Discount: Discount fraction applied to the purchase.

    Returns:
        ``[prediction_text, image_path]`` on success, where ``image_path``
        is the PNG written by this call, or the ``[error_message, None]``
        list produced by ``sanitize_inputs`` when validation fails.
    """
    sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)

    # A two-element result is the validation-error form; pass it through.
    if len(sanitized) == 2:
        return sanitized

    region_code, category_code, sub_category_code = sanitized
    features = np.array([[region_code, category_code, sub_category_code, Quantity, Discount]])
    predicted_class = SVM.predict(features)

    # The held-out split serves as background data for the explainer.
    explainer = shap.Explainer(SVM, test)
    shap_values = explainer(features)
    shap_values.feature_names = ["Region", "Category", "Sub-Category", "Quantity", "Discount"]

    # savefig does not close the figure, so without an explicit fresh figure
    # and close() every request would draw onto the previous request's plot
    # and leave figures accumulating in memory.
    plt.figure()
    try:
        shap.plots.bar(shap_values, show=False)
        plt.savefig('shap_plot_SVM.png')
    finally:
        plt.close()

    return [profit_label[int(predicted_class[0])], 'shap_plot_SVM.png']
116
+
117
def Log_reg_predict(Region, Category, Sub_Category, Quantity, Discount):
    """Classify one purchase with the logistic regression model and plot its SHAP values.

    Args:
        Region, Category, Sub_Category: Class names, validated and encoded
            by ``sanitize_inputs``.
        Quantity: Number of items purchased.
        Discount: Discount fraction applied to the purchase.

    Returns:
        ``[prediction_text, image_path]`` on success, where ``image_path``
        is the PNG written by this call, or the ``[error_message, None]``
        list produced by ``sanitize_inputs`` when validation fails.
    """
    sanitized = sanitize_inputs(Region, Category, Sub_Category, Quantity, Discount)

    # A two-element result is the validation-error form; pass it through.
    if len(sanitized) == 2:
        return sanitized

    region_code, category_code, sub_category_code = sanitized
    features = np.array([[region_code, category_code, sub_category_code, Quantity, Discount]])
    predicted_class = Log_Reg.predict(features)

    # The held-out split serves as background data for the explainer.
    explainer = shap.Explainer(Log_Reg, test)
    shap_values = explainer(features)
    shap_values.feature_names = ["Region", "Category", "Sub-Category", "Quantity", "Discount"]

    # savefig does not close the figure, so without an explicit fresh figure
    # and close() every request would draw onto the previous request's plot
    # and leave figures accumulating in memory.
    plt.figure()
    try:
        shap.plots.bar(shap_values, show=False)
        plt.savefig('shap_plot_LogReg.png')
    finally:
        plt.close()

    return [profit_label[int(predicted_class[0])], 'shap_plot_LogReg.png']
134
+
135
+
136
+
137
def _profit_tab(predict_fn, model_name):
    """Build the Gradio interface for one model's tab.

    Every tab takes three text inputs and two number inputs and shows a
    prediction label plus an image; only the prediction function and the
    model name used in the title and description differ.
    """
    description = (
        f"Create your own purchases and see if the {model_name} model predicts they will make above or below the median profit"
        "\n\nValid regions: ['Central', 'East', 'South', 'West']"
        "\n\nValid product categories: ['Furniture', 'Office Supplies', 'Technology']"
        "\n\nValid product sub-categories: ['Accessories', 'Appliances', 'Art', 'Binders', 'Bookcases', 'Chairs', 'Copiers', 'Envelopes', 'Fasteners', 'Furnishings', 'Labels', 'Machines', 'Paper', 'Phones', 'Storage', 'Supplies', 'Tables']"
    )
    return gr.Interface(
        fn=predict_fn,
        inputs=["text", "text", "text", "number", "number"],
        outputs=[
            gr.Label(label="Model Prediction"),
            gr.Image(label="Shapley Values"),
        ],
        title=f"{model_name} Profit Prediction",
        description=description,
    )


# One tab per model, created in the order they appear in the tab bar.
LogReg_tab = _profit_tab(Log_reg_predict, "Logistic Regression")
SVM_tab = _profit_tab(SVM_predict, "SVM")
XGB_tab = _profit_tab(XGB_predict, "XGB")

demo = gr.TabbedInterface(
    [LogReg_tab, SVM_tab, XGB_tab],
    tab_names=["Logistic Regression", "SVM", "XGB"],
    theme=gr.themes.Soft(),
)

# Start the web app as soon as the script is executed.
demo.launch(debug=True, share=True)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ numpy==1.25.2
3
+ scikit-learn==1.2.2
4
+ shap==0.45.0
5
+ shapely==2.0.4
6
+ xgboost==2.0.3
7
+ matplotlib==3.7.1
8
+ joblib==1.4.0