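# NOTE: the lines "Spaces: / Running / Running" that stood here were status-banner
# residue from the hosting page this file was copied from; they are not code.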
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
# import plotly.graph_objects as go
import streamlit as st
import torch
from pmdarima import auto_arima
from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering
# Page-level Streamlit configuration.
# NOTE(review): the page_icon value looks like a mis-decoded emoji (only its
# first byte survived as "π") — confirm the intended glyph and restore it.
st.set_page_config(
    page_title="Sales Forecasting System",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Preprocessing
def merge(B, C, A):
    """Merge two date-ordered frames into ``A`` in place and return ``A``.

    ``B`` and ``C`` must each have ``Date`` and ``Sales`` columns and be
    ordered by ``Date``. ``A`` must have exactly ``len(B) + len(C)`` rows; its
    ``Date`` and ``Sales`` columns are overwritten with the merged result
    (dates stored as ``datetime.date`` objects). Ties take the row from ``B``
    first, so the merge is stable.

    Raises:
        ValueError: if ``A`` does not have ``len(B) + len(C)`` rows.
    """
    # Snapshot both inputs into plain lists BEFORE touching A. B and C are
    # typically slices of A, so writing into A while still reading them could
    # overwrite rows that have not been consumed yet.
    left_dates = pd.to_datetime(B['Date']).dt.date.tolist()
    right_dates = pd.to_datetime(C['Date']).dt.date.tolist()
    left_sales = B['Sales'].tolist()
    right_sales = C['Sales'].tolist()

    merged_dates = []
    merged_sales = []
    i = j = 0
    while i < len(left_dates) and j < len(right_dates):
        if left_dates[i] <= right_dates[j]:
            merged_dates.append(left_dates[i])
            merged_sales.append(left_sales[i])
            i += 1
        else:
            merged_dates.append(right_dates[j])
            merged_sales.append(right_sales[j])
            j += 1

    # At most one of the two tails is non-empty.
    merged_dates.extend(left_dates[i:])
    merged_sales.extend(left_sales[i:])
    merged_dates.extend(right_dates[j:])
    merged_sales.extend(right_sales[j:])

    if len(merged_dates) != len(A):
        raise ValueError(
            f"merge target has {len(A)} rows but the inputs hold {len(merged_dates)}"
        )

    # Whole-column assignment is positional (A's index is kept) and avoids the
    # chained ``A[col].iloc[k] = ...`` writes, which pandas does not guarantee
    # will reach A.
    A['Date'] = merged_dates
    A['Sales'] = merged_sales
    return A
def merge_sort(dataframe):
    """Sort ``dataframe`` by ``Date`` in place with a top-down merge sort.

    Frames with zero or one row are returned untouched. Larger frames are
    split in half, each half is sorted recursively, and the halves are merged
    back into ``dataframe`` by ``merge``. Returns the same ``dataframe``
    object that was passed in.
    """
    if len(dataframe) <= 1:
        return dataframe

    center = len(dataframe) // 2
    # Sort independent copies of each half and use the RETURNED frames.
    # Relying on ``iloc`` slices being live views of the parent is not
    # guaranteed by pandas, and the copies also keep ``merge`` from reading
    # rows of ``dataframe`` that it has already overwritten.
    left = merge_sort(dataframe.iloc[:center].copy())
    right = merge_sort(dataframe.iloc[center:].copy())
    return merge(left, right, dataframe)
def drop(dataframe):
    """Keep only date/sales columns and discard rows with missing values.

    A column is kept when its name contains ``"date"`` or ``"sale"``
    (case-insensitive). Returns a new frame; the input is not modified.
    """
    keywords = ("date", "sale")
    unwanted = [
        column
        for column in dataframe.columns
        if not any(keyword in column.lower() for keyword in keywords)
    ]
    return dataframe.drop(columns=unwanted).dropna()
def date_format(dataframe):
    """Parse the ``Date`` column from ``m/d/Y`` strings into ``datetime.date``.

    Surrounding whitespace is stripped from each value before parsing. The
    column is replaced on ``dataframe`` itself, and the same frame is returned.

    Raises:
        ValueError: if a value does not match the ``%m/%d/%Y`` format.
    """
    # Build the whole column and assign it once. The previous per-row
    # ``dataframe['Date'][i] = ...`` was a chained assignment (not guaranteed
    # to write through), and its ``itertuples`` unpacking only worked when the
    # frame had exactly two columns.
    dataframe['Date'] = [
        datetime.strptime(raw.strip(), "%m/%d/%Y").date()
        for raw in dataframe['Date']
    ]
    return dataframe
def group_to_three(dataframe):
    """Average ``Sales`` over consecutive 3-day bins.

    Converts ``dataframe['Date']`` to pandas datetimes (this modifies the
    frame passed in), groups on ``Date`` with a ``'3D'`` frequency, and takes
    the mean of ``Sales`` rounded to two decimals. Bins whose mean is 0 or
    missing are dropped. Returns a Series indexed by bin start date.
    """
    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
    three_day_bins = pd.Grouper(key='Date', freq='3D')
    averaged = dataframe.groupby([three_day_bins])['Sales'].mean().round(2)
    # A mean of exactly 0 is treated the same as "no data" for that bin.
    return averaged.replace(0, np.nan).dropna()
def series_to_df_exogenous(series):
    """Turn a ``Sales`` series indexed by ``Date`` into a model-ready frame.

    The result is indexed by ``Date`` and has three columns:

    * ``Sales`` - the original values;
    * ``Sales First Difference`` - each value minus the previous row;
    * ``Seasonal First Difference`` - each value minus the value 12 rows back.

    Rows with any missing value are removed, so the first 12 rows never
    appear in the output.
    """
    frame = series.to_frame().reset_index().set_index('Date').dropna()

    # Create the eXogenous values
    frame['Sales First Difference'] = frame['Sales'].diff(1)
    frame['Seasonal First Difference'] = frame['Sales'].diff(12)
    return frame.dropna()
def dates_df(dataframe):
    """Return an all-string copy of ``dataframe`` with ``Date`` as a column.

    The index is moved into the columns, ``Date`` is rendered like
    ``"January 05, 2021"``, and every column is cast to ``str``.
    """
    table = dataframe.reset_index()
    table['Date'] = table['Date'].dt.strftime('%B %d, %Y')
    return table.astype(str)
def get_forecast_period(period):
    """Convert a horizon in days into a number of 3-day steps.

    Uses Python's built-in ``round``, so exact halves round to the nearest
    even integer.
    """
    days_per_step = 3
    return round(period / days_per_step)
# SARIMAX Model
def train_test(dataframe):
    """Split ``dataframe`` into an 80/20 train/test partition by row order.

    Column 0 is the target; all remaining columns are exogenous features.

    Returns a 6-tuple:
        ``(training_y, test_y, test_y_series, training_X, test_X, future_X)``
    where ``test_y_series`` is a Series built from ``test_y`` with the same
    index, and ``future_X`` is the feature columns for every row.
    """
    n = round(len(dataframe) * 0.2)
    # Slice from an explicit boundary rather than with ``-n``: when n == 0,
    # ``iloc[:-0]`` is empty and ``iloc[-0:]`` is the whole frame, which would
    # silently swap the training and test sets for very short inputs.
    split = len(dataframe) - n

    training_y = dataframe.iloc[:split, 0]
    test_y = dataframe.iloc[split:, 0]
    test_y_series = pd.Series(test_y, index=test_y.index)
    training_X = dataframe.iloc[:split, 1:]
    test_X = dataframe.iloc[split:, 1:]
    future_X = dataframe.iloc[0:, 1:]
    return (training_y, test_y, test_y_series, training_X, test_X, future_X)
def test_fitting(dataframe, Exo, trainY):
    """Fit a seasonal ARIMA model with exogenous regressors via ``auto_arima``.

    Args:
        dataframe: accepted for call compatibility; not used by the search.
        Exo: exogenous feature frame aligned with ``trainY``.
        trainY: training target series.

    Returns:
        The model object returned by ``auto_arima``.
    """
    # NOTE(review): ``min_p`` / ``min_q`` do not appear to be among
    # auto_arima's documented search-bound parameters (``start_*`` / ``max_*``)
    # — confirm they have the intended effect.
    return auto_arima(
        X=Exo,
        y=trainY,
        start_p=1,
        start_q=1,
        test='adf',
        min_p=1,
        min_q=1,
        max_p=3,
        max_q=3,
        m=12,
        start_P=0,
        seasonal=True,
        d=None,
        D=1,
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True,
    )
def forecast_accuracy(forecast, actual):
    """Compute error metrics between two equal-length 1-D numeric sequences.

    Returns a dict with:
        ``'mape'``    - mean absolute percentage error as a fraction, 4 d.p.;
        ``'rmse'``    - root mean squared error, 2 d.p.;
        ``'corr'``    - Pearson correlation coefficient;
        ``'min-max'`` - one minus the mean of element-wise ``min / max``.
    """
    # Accept any array-like (lists, Series) rather than requiring ndarrays.
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    error = forecast - actual

    mape = np.mean(np.abs(error) / np.abs(actual)).round(4)  # MAPE
    rmse = (np.mean(error ** 2) ** .5).round(2)  # RMSE
    corr = np.corrcoef(forecast, actual)[0, 1]  # corr
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)  # minmax
    return {'mape': mape, 'rmse': rmse, 'corr': corr, 'min-max': minmax}
def sales_growth(dataframe, fittedValues):
    """Build a table of forecast sales with period-over-period change.

    Args:
        dataframe: historical frame with a ``Sales`` column of at least two
            rows; only its last two values are read.
        fittedValues: forecast Series; its index becomes the ``Date`` index.

    Returns:
        A frame indexed by ``Date`` with columns ``Sales``,
        ``Forecasted Sales First Difference`` and ``Forecasted Sales Growth``
        (percent), all rounded to two decimals. The first row's difference
        and growth are taken from the last two historical values.
    """
    # Local name differs from the function name to avoid shadowing it.
    growth = fittedValues.to_frame().reset_index()
    growth.columns = ["Date", "Sales"]
    growth = growth.set_index('Date')
    growth['Sales'] = growth['Sales'].round(2)

    # Calculate and create the column for sales difference and growth
    previous = growth['Sales'].shift(1)
    change = growth['Sales'] - previous
    growth['Forecasted Sales First Difference'] = change.round(2)
    growth['Forecasted Sales Growth'] = ((change / previous) * 100).round(2)

    # Calculate and create the first row for sales difference and growth
    last_actual = dataframe['Sales'].iloc[-1]
    prior_actual = dataframe['Sales'].iloc[-2]
    first_difference = np.round(last_actual - prior_actual, 2)
    # NOTE(review): this divides by the LATEST value, whereas the column above
    # divides by the PREVIOUS one — confirm which denominator is intended.
    first_growth = np.round(((last_actual - prior_actual) / last_actual) * 100, 2)

    # Write with a single positional indexer; the chained
    # ``frame[col].iloc[0] = ...`` form is not guaranteed to reach the frame.
    difference_pos = growth.columns.get_loc('Forecasted Sales First Difference')
    growth_pos = growth.columns.get_loc('Forecasted Sales Growth')
    growth.iloc[0, difference_pos] = first_difference
    growth.iloc[0, growth_pos] = first_growth
    return growth
def merge_forecast_data(actual, predicted, future):
    """Combine three Series into one chart-ready frame.

    The Series are aligned side by side on their indexes as the columns
    ``Actual Sales``, ``Predicted Sales`` and ``Future Forecasted Sales``.
    Positions where a Series has no value are filled with ``-1``, and the
    index is moved into the first column of the result.
    """
    actual_frame = actual.to_frame(name="Actual Sales")
    predicted_frame = predicted.to_frame(name="Predicted Sales")
    future_frame = future.to_frame(name="Future Forecasted Sales").rename_axis('Date')

    combined = pd.concat([actual_frame, predicted_frame, future_frame], axis=1)
    return combined.fillna(-1).reset_index()
def get_combined_date(actual, predicted, future):
    """Stack the date indexes of three Series into one ``Date`` column.

    ``actual`` and ``predicted`` must have an index named ``Date``; the index
    of ``future`` is renamed to ``Date``. Dates are concatenated in that order
    (duplicates kept) and the result is numbered 0..n-1.
    """
    actual_dates = actual.to_frame().reset_index()[['Date']]
    predicted_dates = predicted.to_frame().reset_index()[['Date']]
    future_dates = future.to_frame().rename_axis('Date').reset_index()[['Date']]
    return pd.concat(
        [actual_dates, predicted_dates, future_dates],
        ignore_index=True,
    )
# TAPAS Model
def _cache_model_loader(loader):
    """Wrap ``loader`` with ``st.cache_resource`` when Streamlit provides it."""
    cache_resource = getattr(st, "cache_resource", None)
    return cache_resource(loader) if cache_resource is not None else loader


@_cache_model_loader
def load_tapas_model():
    """Build the TAPAS table-question-answering pipeline.

    Streamlit re-executes this script on every widget interaction; caching
    the loader keeps the model from being reloaded on each rerun.
    """
    model_name = "google/tapas-large-finetuned-wtq"
    tokenizer = TapasTokenizer.from_pretrained(model_name)
    model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
    return pipeline("table-question-answering", model=model, tokenizer=tokenizer)


pipe = load_tapas_model()
def get_answer(table, query):
    """Run the module-level TAPAS ``pipe`` on ``table`` with ``query`` and return its result."""
    return pipe(table=table, query=query)
def convert_answer(answer):
    """Reduce a table-QA answer dict to a number according to its aggregator.

    Args:
        answer: dict with an ``'aggregator'`` entry and, for aggregated
            answers, a ``'cells'`` list of numeric strings (thousands
            separators allowed).

    Returns:
        * ``SUM``     - the sum of the selected cells;
        * ``AVERAGE`` - their mean, or ``None`` when no cell was selected;
        * ``COUNT``   - the number of selected cells;
        * otherwise   - ``answer`` unchanged.
    """
    aggregator = answer.get('aggregator')

    if aggregator == 'COUNT':
        # COUNT asks "how many cells", so the cell contents are irrelevant.
        # Summing them (as before) gave wrong results and failed outright on
        # non-integer cells.
        return len(answer['cells'])

    if aggregator not in ('SUM', 'AVERAGE'):
        return answer

    values = [float(cell.replace(',', '')) for cell in answer['cells']]
    if aggregator == 'SUM':
        return sum(values)
    if not values:
        # Nothing to average: report "no answer" instead of dividing by zero.
        return None
    return sum(values) / len(values)
def get_converted_answer(table, query):
    """Ask the TAPAS pipeline ``query`` about ``table`` and post-process the result with ``convert_answer``."""
    raw_answer = get_answer(table, query)
    return convert_answer(raw_answer)
# Session States
# Flags persist across Streamlit reruns: whether a valid file has been
# uploaded, and whether a forecast has already been requested.
if 'uploaded' not in st.session_state:
    st.session_state.uploaded = False
if 'forecasted' not in st.session_state:
    st.session_state.forecasted = False

# Web Application
# NOTE(review): the trailing "π" in the title looks like a mis-decoded emoji —
# confirm the intended glyph.
st.title("Forecasting Dashboard π")
if not st.session_state.uploaded:
    st.subheader("Welcome User, get started forecasting by uploading your file in the sidebar!")
# Sidebar Menu
# NOTE(review): block nesting was reconstructed from a copy that had lost its
# indentation — confirm against the deployed app.
with st.sidebar:
    # TODO Name for product
    st.title("MLCast v1.0")
    st.subheader("An intelligent sales forecasting system")
    uploaded_file = st.file_uploader("Upload your store data here to proceed (must atleast contain Date and Sales)", type=["csv"])

    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file, parse_dates=True)

        # The upload is only usable when some column name mentions each of
        # "Date" and "Sales".
        # NOTE(review): the preprocessing helpers index the columns as exactly
        # 'Date' and 'Sales'; a file with e.g. "Order Date" passes this check
        # but will fail later — consider normalising the column names here.
        date_found = any('Date' in column for column in df.columns)
        sales_found = any('Sales' in column for column in df.columns)
        if not (date_found and sales_found):
            st.error('Please upload a csv containing both Date and Sales...')
            st.stop()

        st.success("File uploaded successfully!")
        st.write("Your uploaded data:")
        st.write(df)

        df = drop(df)
        df = date_format(df)
        merge_sort(df)  # sorts df by Date in place
        series = group_to_three(df)
        st.session_state.uploaded = True
    else:
        # No file on this rerun means ``series`` is not defined. Reset the
        # flags so the dashboard below does not try to use it after a
        # previously uploaded file has been removed.
        st.session_state.uploaded = False
        st.session_state.forecasted = False

    with open('sample.csv', 'rb') as f:
        st.download_button("Download our sample CSV", f, file_name='sample.csv')
# Main dashboard: history chart, forecast, and question answering.
# NOTE(review): block nesting was reconstructed from a copy that had lost its
# indentation — confirm against the deployed app.
if st.session_state.uploaded:
    st.subheader("Sales History")
    st.line_chart(series)

    MIN_DAYS = 30
    MAX_DAYS = 90
    period = st.slider('How many days would you like to forecast?', min_value=MIN_DAYS, max_value=MAX_DAYS)
    forecast_period = get_forecast_period(period)

    forecast_button = st.button(
        'Start Forecasting',
        key='forecast_button',
        type="primary",
    )

    if forecast_button or st.session_state.forecasted:
        df = series_to_df_exogenous(series)
        training_y, test_y, test_y_series, training_X, test_X, future_X = train_test(df)
        train_test_model = test_fitting(df, training_X, training_y)

        n_periods = round(len(df) * 0.2)
        future_n_periods = forecast_period + n_periods

        # Predictions over the held-out test window.
        fitted, confint = train_test_model.predict(X=test_X, n_periods=n_periods, return_conf_int=True)
        fitted_series = pd.Series(fitted)
        fitted_series.index = test_y_series.index

        # Future predictions
        frequency = '3D'
        future_fitted, confint = train_test_model.predict(
            X=df.iloc[-future_n_periods:, 1:],
            n_periods=future_n_periods,
            return_conf_int=True,
            freq=frequency,
        )
        # NOTE(review): the model was fitted on the training rows only, so its
        # first n_periods steps cover the test window, yet this index starts
        # at the LAST observed date — confirm the dates line up as intended.
        future_index_of_fc = pd.date_range(df['Sales'].index[-1], periods=future_n_periods, freq=frequency)
        future_fitted_series = pd.Series(future_fitted)
        future_fitted_series.index = future_index_of_fc

        future_sales_growth = sales_growth(df, future_fitted_series)
        future_sales_growth = future_sales_growth.iloc[n_periods:]
        # Keep the display/QA table in its own variable. Reassigning ``df``
        # here made the chart below plot this string table as "Actual Sales".
        forecast_table = dates_df(future_sales_growth)

        test_y, predictions = np.array(test_y), np.array(fitted)
        acc = forecast_accuracy(predictions, test_y)

        col = st.columns(2)
        with col[0]:
            col[0].header("Sales Forecast")
            merged_data = merge_forecast_data(df['Sales'], fitted_series, future_fitted_series)
            # The first column is the former index; its name depends on the
            # index names of the inputs, so look it up instead of hard-coding.
            col[0].line_chart(
                merged_data,
                x=merged_data.columns[0],
                y=["Actual Sales", "Predicted Sales", "Future Forecasted Sales"],
            )
            col[0].write(f"MAPE score: {acc['mape']} (lower is better)")
        with col[1]:
            col[1].subheader(f"Forecasted sales in the next {period} days")
            col[1].write(forecast_table)
        st.session_state.forecasted = True

        with st.form("question_form"):
            question = st.text_input('Ask a Question about the Forecasted Data', placeholder="What is the total sales in the month of December?")
            query_button = st.form_submit_button(label='Generate Answer')
            if question:
                answer = get_converted_answer(forecast_table, question)
                # Non-aggregated answers come back as the raw result dict;
                # show its text rather than the dict itself.
                if isinstance(answer, dict):
                    answer = answer.get('answer')
                if answer is not None and answer != "":
                    # st.subheader's second positional argument is the anchor,
                    # not extra text, so the answer goes into the body string.
                    st.subheader(f"The answer is: {answer}")
                else:
                    st.subheader("Answer is not found in table")
            elif query_button:
                # An empty query should not be sent to the model.
                st.warning("Please enter a question first.")
# Hide Streamlit default style
# Injects raw CSS that hides the page's <footer> element.
hide_st_style = """
<style>
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_st_style, unsafe_allow_html=True)