Spaces:
updated name
app.py
CHANGED
@@ -1,417 +1,416 @@
import streamlit as st
import pandas as pd
import time
from datetime import datetime

import numpy as np
import pmdarima as pm
import matplotlib.pyplot as plt
from pmdarima import auto_arima
import plotly.graph_objects as go

import torch
from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering

st.set_page_config(
    page_title="Sales Predictor-AI Project",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Preprocessing
@st.cache_data
def merge(B, C, A):
    i = j = k = 0

    # Convert 'Date' columns to datetime.date objects
    B['Date'] = pd.to_datetime(B['Date']).dt.date
    C['Date'] = pd.to_datetime(C['Date']).dt.date
    A['Date'] = pd.to_datetime(A['Date']).dt.date

    while i < len(B) and j < len(C):
        if B['Date'].iloc[i] <= C['Date'].iloc[j]:
            A['Date'].iloc[k] = B['Date'].iloc[i]
            A['Sales'].iloc[k] = B['Sales'].iloc[i]
            i += 1
        else:
            A['Date'].iloc[k] = C['Date'].iloc[j]
            A['Sales'].iloc[k] = C['Sales'].iloc[j]
            j += 1
        k += 1

    while i < len(B):
        A['Date'].iloc[k] = B['Date'].iloc[i]
        A['Sales'].iloc[k] = B['Sales'].iloc[i]
        i += 1
        k += 1

    while j < len(C):
        A['Date'].iloc[k] = C['Date'].iloc[j]
        A['Sales'].iloc[k] = C['Sales'].iloc[j]
        j += 1
        k += 1

    return A

@st.cache_data
def merge_sort(dataframe):
    if len(dataframe) > 1:
        center = len(dataframe) // 2
        left = dataframe.iloc[:center]
        right = dataframe.iloc[center:]
        merge_sort(left)
        merge_sort(right)

        return merge(left, right, dataframe)
    else:
        return dataframe

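# Note on merge()/merge_sort() above: together they implement a recursive
# merge sort keyed on 'Date', writing the merged order back into the parent
# frame through positional .iloc assignment. This depends on the .iloc slices
# sharing memory with the parent DataFrame; under pandas copy-on-write
# semantics this in-place strategy may stop mutating the parent silently.
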
@st.cache_data
def drop(dataframe):
    def get_columns_containing(dataframe, substrings):
        return [col for col in dataframe.columns if any(substring.lower() in col.lower() for substring in substrings)]

    columns_to_keep = get_columns_containing(dataframe, ["date", "sale"])
    dataframe = dataframe.drop(columns=dataframe.columns.difference(columns_to_keep))
    dataframe = dataframe.dropna()

    return dataframe

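# drop() keeps only the columns whose names contain "date" or "sale"
# (case-insensitive), so uploads carrying extra columns such as store IDs or
# regions are reduced to the two series the forecaster actually needs.
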
@st.cache_data
def date_format(dataframe):
    for i, d, s in dataframe.itertuples():
        dataframe['Date'][i] = dataframe['Date'][i].strip()

    for i, d, s in dataframe.itertuples():
        new_date = datetime.strptime(dataframe['Date'][i], "%m/%d/%Y").date()
        dataframe['Date'][i] = new_date

    return dataframe

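# date_format() assumes dates arrive as MM/DD/YYYY strings; any other format
# makes strptime raise ValueError, which surfaces as a failed upload in the
# app.
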
@st.cache_data
def group_to_three(dataframe):
    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
    dataframe = dataframe.groupby([pd.Grouper(key='Date', freq='3D')])['Sales'].mean().round(2)
    dataframe = dataframe.replace(0, np.nan).dropna()

    return dataframe

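# group_to_three() resamples the daily rows into 3-day bins (mean sales per
# bin), smoothing gaps and shortening the series the SARIMAX search must fit;
# bins whose mean is 0 are treated as missing and dropped.
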
@st.cache_data
def series_to_df_exogenous(series):
    dataframe = series.to_frame()
    dataframe = dataframe.reset_index()
    dataframe = dataframe.set_index('Date')
    dataframe = dataframe.dropna()
    # Create the eXogenous values
    dataframe['Sales First Difference'] = dataframe['Sales'] - dataframe['Sales'].shift(1)
    dataframe['Seasonal First Difference'] = dataframe['Sales'] - dataframe['Sales'].shift(12)
    dataframe = dataframe.dropna()
    return dataframe

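# The two differenced columns serve as the exogenous regressors (X) for the
# SARIMAX fit: the lag-1 difference removes trend and the lag-12 difference
# (12 bins, roughly 36 days at the 3-day frequency) removes the seasonal
# component.
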
@st.cache_data
def dates_df(dataframe):
    dataframe = dataframe.reset_index()
    dataframe['Date'] = dataframe['Date'].dt.strftime('%B %d, %Y')
    dataframe[dataframe.columns] = dataframe[dataframe.columns].astype(str)
    return dataframe

@st.cache_data
def get_forecast_period(period):
    return round(period / 3)

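# Because the series is binned in 3-day steps, a horizon given in days is
# converted to a number of bins, e.g. get_forecast_period(30) == 10.
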
# SARIMAX Model
@st.cache_data
def train_test(dataframe, n):
    training_y = dataframe.iloc[:-n, 0]
    test_y = dataframe.iloc[-n:, 0]
    test_y_series = pd.Series(test_y, index=dataframe.iloc[-n:, 0].index)
    training_X = dataframe.iloc[:-n, 1:]
    test_X = dataframe.iloc[-n:, 1:]
    future_X = dataframe.iloc[0:, 1:]
    return (training_y, test_y, test_y_series, training_X, test_X, future_X)

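# train_test() is a chronological holdout split: the last n rows (20% of the
# data, computed by the caller) are held out for validation, column 0 is the
# Sales target, and columns 1+ are the exogenous features.
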
@st.cache_data
def test_fitting(dataframe, Exo, trainY):
    trainTestModel = auto_arima(X=Exo, y=trainY, start_p=1, start_q=1,
                                test='adf', min_p=1, min_q=1,
                                max_p=3, max_q=3, m=12,
                                start_P=2, start_Q=2, seasonal=True,
                                d=None, D=1, trace=True,
                                error_action='ignore',
                                suppress_warnings=True,
                                stepwise=True, maxiter=50)
    model = trainTestModel
    return model

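# auto_arima runs a stepwise search over SARIMAX orders (p and q up to 3,
# seasonal period m=12 bins), letting an ADF test pick the differencing order
# d while forcing one seasonal difference (D=1); trace=True logs every
# candidate model to the console.
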
def forecast_accuracy(forecast, actual):
    mape = np.mean(np.abs(forecast - actual) / np.abs(actual)).round(4)  # MAPE
    rmse = (np.mean((forecast - actual) ** 2) ** .5).round(2)  # RMSE
    corr = np.corrcoef(forecast, actual)[0, 1]  # corr
    mins = np.amin(np.hstack([forecast[:, None],
                              actual[:, None]]), axis=1)
    maxs = np.amax(np.hstack([forecast[:, None],
                              actual[:, None]]), axis=1)
    minmax = 1 - np.mean(mins / maxs)  # minmax
    return {'mape': mape, 'rmse': rmse, 'corr': corr, 'min-max': minmax}

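# Worked example with hypothetical numbers: forecast=[110, 90] against
# actual=[100, 100] gives MAPE = mean(10/100, 10/100) = 0.10 (10% once
# interpret_mape() scales it) and RMSE = sqrt(mean(100, 100)) = 10.0.
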
@st.cache_data
def sales_growth(dataframe, fittedValues):
    sales_growth = fittedValues.to_frame()
    sales_growth = sales_growth.reset_index()
    sales_growth.columns = ("Date", "Sales")
    sales_growth = sales_growth.set_index('Date')

    sales_growth['Sales'] = (sales_growth['Sales']).round(2)

    # Calculate and create the column for sales difference and growth
    sales_growth['Forecasted Sales First Difference'] = (sales_growth['Sales'] - sales_growth['Sales'].shift(1)).round(2)
    sales_growth['Forecasted Sales Growth'] = (((sales_growth['Sales'] - sales_growth['Sales'].shift(1)) / sales_growth['Sales'].shift(1)) * 100).round(2)

    # Calculate and create the first row for sales difference and growth
    sales_growth['Forecasted Sales First Difference'].iloc[0] = (dataframe['Sales'].iloc[-1] - dataframe['Sales'].iloc[-2]).round(2)
    sales_growth['Forecasted Sales Growth'].iloc[0] = (((dataframe['Sales'].iloc[-1] - dataframe['Sales'].iloc[-2]) / dataframe['Sales'].iloc[-1]) * 100).round(2)

    return sales_growth

@st.cache_data
def merge_forecast_data(actual, predicted, future):  # debug
    actual = actual.to_frame()
    print("BEFORE RENAME ACTUAL")
    print(actual)
    actual.rename(columns={actual.columns[0]: "Actual Sales"}, inplace=True)
    print("ACTUAL")
    print(actual)

    predicted = predicted.to_frame()
    predicted.rename(columns={predicted.columns[0]: "Predicted Sales"}, inplace=True)
    print("PREDICTED")
    print(predicted)

    future = future.to_frame()
    future = future.rename_axis('Date')
    future.rename(columns={future.columns[0]: "Forecasted Future Sales"}, inplace=True)
    print("FUTURE")
    print(future)

    merged_dataframe = pd.concat([actual, predicted, future], axis=1)
    print("MERGED DATAFRAME")
    print(merged_dataframe)
    merged_dataframe = merged_dataframe.reset_index()
    print("MERGED DATAFRAME RESET INDEX")
    print(merged_dataframe)
    return merged_dataframe

def interpret_mape(mape_score):
    score = (mape_score * 100).round(2)
    if score < 10:
        interpretation = "Great"
        color = "green"
    elif score < 20:
        interpretation = "Good"
        color = "seagreen"
    elif score < 50:
        interpretation = "Relatively good"
        color = "orange"
    else:
        interpretation = "Poor"
        color = "red"
    return score, interpretation, color

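# These bands follow the commonly cited Lewis (1982) MAPE scale: under 10%
# highly accurate, 10-20% good, 20-50% reasonable, above 50% inaccurate.
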
# TAPAS Model

@st.cache_resource
def load_tapas_model():
    model_name = "google/tapas-large-finetuned-wtq"
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
    pipe = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
    return pipe

pipe = load_tapas_model()

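# TAPAS (google/tapas-large-finetuned-wtq) is a table question-answering model
# fine-tuned on WikiTableQuestions. @st.cache_resource loads the weights once
# per server process instead of on every Streamlit rerun.
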
def get_answer(table, query):
    answers = pipe(table=table, query=query)
    return answers

def convert_answer(answer):
    if answer['aggregator'] == 'SUM':
        cells = answer['cells']
        converted = sum(float(value.replace(',', '')) for value in cells)
        return converted

    if answer['aggregator'] == 'AVERAGE':
        cells = answer['cells']
        values = [float(value.replace(',', '')) for value in cells]
        converted = sum(values) / len(values)
        return converted

    if answer['aggregator'] == 'COUNT':
        cells = answer['cells']
        converted = sum(int(value.replace(',', '')) for value in cells)
        return converted
    else:
        return answer['answer']

def get_converted_answer(table, query):
    converted_answer = convert_answer(get_answer(table, query))
    return converted_answer

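# The table-question-answering pipeline returns a dict shaped roughly like
# {'answer': 'SUM > 120.5, 98.2', 'coordinates': [...], 'cells': ['120.5',
#  '98.2'], 'aggregator': 'SUM'}; convert_answer() applies the predicted
# aggregator to the raw cell strings so the app can show a single number.
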
# Session States
if 'uploaded' not in st.session_state:
    st.session_state.uploaded = False

if 'forecasted' not in st.session_state:
    st.session_state.forecasted = False

# Web Application
st.title("Forecasting Dashboard π")
if not st.session_state.uploaded:
    st.subheader("Welcome User, get started forecasting by uploading your file in the sidebar!")

# Sidebar Menu
with st.sidebar:
    st.title("Forecaster v1.1")
    st.subheader("An intelligent sales forecasting system")
    uploaded_file = st.file_uploader("Upload your store data here to proceed (must at least contain Date and Sales)", type=["csv"])
    if uploaded_file is not None:
        date_found = False
        sales_found = False
        df = pd.read_csv(uploaded_file, parse_dates=True)
        for column in df.columns:
            if 'Date' in column:
                date_found = True
            if 'Sales' in column:
                sales_found = True
        if not date_found or not sales_found:
            st.error('Please upload a csv containing both Date and Sales...')
            st.stop()

        st.success("File uploaded successfully!")
        st.write("Your uploaded data:")
        st.write(df)

        df = drop(df)
        df = date_format(df)
        merge_sort(df)
        series = group_to_three(df)

        st.session_state.uploaded = True

    with open('sample.csv', 'rb') as f:
        st.download_button("Download our sample CSV", f, file_name='sample.csv')

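# Upload flow: validate that Date and Sales columns exist, then run the
# preprocessing chain (drop -> date_format -> merge_sort -> group_to_three)
# before setting st.session_state.uploaded so the main panel renders.
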
if st.session_state.uploaded:
    st.subheader("Sales History")
    st.line_chart(series)

    MIN_DAYS = 30
    MAX_DAYS = 90
    period = st.slider('How many days would you like to forecast?', min_value=MIN_DAYS, max_value=MAX_DAYS)
    forecast_period = get_forecast_period(period)

    forecast_button = st.button(
        'Start Forecasting',
        key='forecast_button',
        type="primary",
    )

    if forecast_button or st.session_state.forecasted:
        df = series_to_df_exogenous(series)
        n_periods = round(len(df) * 0.2)
        print(n_periods)  # debug

        train = train_test(df, n_periods)
        training_y, test_y, test_y_series, training_X, test_X, future_X = train
        train_test_model = test_fitting(df, training_X, training_y)

        print(df)  # debug
        print(len(df))  # debug

        future_n_periods = forecast_period
        fitted, confint = train_test_model.predict(X=test_X, n_periods=n_periods, return_conf_int=True)
        index_of_fc = test_y_series.index

        # make series for plotting purpose
        fitted_series = pd.Series(fitted)
        fitted_series.index = index_of_fc
        lower_series = pd.Series(confint[:, 0], index=index_of_fc)
        upper_series = pd.Series(confint[:, 1], index=index_of_fc)

        # Future predictions
        frequency = '3D'
        future_fitted, confint = train_test_model.predict(X=df.iloc[-future_n_periods:, 1:], n_periods=future_n_periods, return_conf_int=True, freq=frequency)
        future_index_of_fc = pd.date_range(df['Sales'].index[-1], periods=future_n_periods, freq=frequency)

        # make series for future plotting purpose
        future_fitted_series = pd.Series(future_fitted)
        future_fitted_series.index = future_index_of_fc
        # future_lower_series = pd.Series(confint[:, 0], index=future_index_of_fc)
        # future_upper_series = pd.Series(confint[:, 1], index=future_index_of_fc)

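        # Caveat: true future exogenous values are unknown, so the last
        # future_n_periods rows of X are reused as a stand-in when predicting
        # beyond the data; the commented-out lower/upper series would carry
        # the confidence band for that forecast.
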
        future_sales_growth = sales_growth(df, future_fitted_series)

        test_y, predictions = np.array(test_y), np.array(fitted)
        print("Test Y:", test_y)  # debug
        print("Prediction:", fitted)  # debug
        score = forecast_accuracy(predictions, test_y)
        print("Score:", score)  # debug
        mape, interpretation, mape_color = interpret_mape(score['mape'])

        print(df)
        print(df['Sales'])
        merged_data = merge_forecast_data(df['Sales'], fitted_series, future_fitted_series)

        col_charts = st.columns(2)

        print(merged_data)  # debug
        print(merged_data.info())
        print(merged_data.dtypes)
        with col_charts[0]:
            fig_compare = go.Figure()
            fig_compare.add_trace(go.Scatter(x=merged_data[merged_data.columns[0]], y=merged_data['Actual Sales'], mode='lines', name='Actual Sales'))
            fig_compare.add_trace(go.Scatter(x=merged_data[merged_data.columns[0]], y=merged_data['Predicted Sales'], mode='lines', name='Predicted Sales', line=dict(color='#006400')))
            fig_compare.update_layout(title='Historical Sales Data', xaxis_title='Date', yaxis_title='Sales')
            st.plotly_chart(fig_compare, use_container_width=True)

        with col_charts[1]:
            fig_forecast = go.Figure()
            fig_forecast.add_trace(go.Scatter(x=merged_data[merged_data.columns[0]], y=merged_data['Actual Sales'], mode='lines', name='Actual Sales'))
            fig_forecast.add_trace(go.Scatter(x=merged_data[merged_data.columns[0]], y=merged_data['Forecasted Future Sales'], mode='lines', name='Future Forecasted Sales', line=dict(color=mape_color)))
            fig_forecast.update_layout(title='Forecasted Sales Data', xaxis_title='Date', yaxis_title='Sales')
            st.plotly_chart(fig_forecast, use_container_width=True)
        st.write(f"MAPE score: {mape}% - {interpretation}")

        df = dates_df(future_sales_growth)

        col_table = st.columns(2)
        with col_table[0]:
            col_table[0].subheader(f"Forecasted sales in the next {period} days")
            col_table[0].write(df)

        with col_table[1]:
            st.subheader("Question-Answering")
            with st.form("question_form"):
                question = st.text_input('Ask a Question about the Forecasted Data', placeholder="What is the total sales in the month of December?")
                query_button = st.form_submit_button(label='Generate Answer')
            if query_button or question:
                answer = get_converted_answer(df, question)
                if answer is not None:
                    st.write("The answer is:", answer)
                else:
                    st.write("Answer is not found in table")
        st.session_state.forecasted = True


# Hide Streamlit default style
hide_st_style = """
<style>
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_st_style, unsafe_allow_html=True)