import sys
import inspect
import math
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib
import utils
from matplotlib import pyplot as plt
import sklearn
import gradio as gr
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from utils import create_seasons
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from bertopic import BERTopic
import html
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from utils import find_variable_data, build_temporal_features, create_datetime, map_vals
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import folium
import gc
import json
from utils import MyNaiveImputer

matplotlib.use('agg')
dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""
# Imputation Variables
wd_full_local = pd.read_csv("data/weather_aggregated_2010-2018.csv", index_col=0)
wd_full_local = wd_full_local.reset_index()
wd_full_local["Datetime"] = pd.to_datetime(wd_full_local["Datetime"], format="%Y-%m-%d")
wd_full_local = build_temporal_features(wd_full_local, "Datetime")
impute_cols = ['MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
               'Percipitation', 'WindSpeed', 'MaxSustainedWind',
               'Gust', 'Rain', 'SnowDepth', 'SnowIce']
my_imputer = MyNaiveImputer(wd_full_local, time_steps=49+7)
imputers = {
    "Mean": my_imputer.impute_all(impute_cols, strategy="mean"),
    "Median": my_imputer.impute_all(impute_cols, strategy="median"),
    "Max": my_imputer.impute_all(impute_cols, strategy="max"),
    "Min": my_imputer.impute_all(impute_cols, strategy="min"),
}
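# utils.MyNaiveImputer is not shown in this file. As a rough illustration (not the
# actual implementation), a "naive" imputer of this kind can be sketched as a
# day-of-year aggregate: for each missing or future day, take the mean/median/min/max
# of that calendar day across previous years. The helper below mirrors the usage above
# but is otherwise an assumption and is never called by the app.
def naive_day_of_year_impute(df, col, horizon_days, strategy="mean"):
    """Illustrative sketch: fill `horizon_days` future values of `col` using a
    day-of-year aggregate. Assumes `df` has a parsed "Datetime" column."""
    agg = df.groupby(df["Datetime"].dt.dayofyear)[col].agg(strategy)
    future_dates = pd.date_range(df["Datetime"].max() + pd.Timedelta(days=1),
                                 periods=horizon_days, freq="D")
    return pd.DataFrame({
        "Datetime": future_dates,
        col: agg.reindex(future_dates.dayofyear).to_numpy(),
    })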
# Merged Data Variables
data_merged = pd.read_csv("data/data_merged_full.csv", index_col=0)
data_merged = create_datetime(data_merged, "Datetime", format="%Y-%m-%d")
data_merged["Day Of Week"] = data_merged["Datetime"].dt.day_name()
data_merged["Year String"] = data_merged["Year"].astype(str)
data_merged["Month String"] = data_merged["Datetime"].dt.month_name()
data_merged["Rain Bool"] = data_merged["Rain"].astype(bool)
data_merged["SnowIce Bool"] = data_merged["SnowIce"].astype(bool)
data_merged = data_merged.set_index("Datetime")
weather_full_df = data_merged.loc[data_merged["Year"] <= 2018].copy()
data_merged_eda = data_merged.loc[(data_merged["Year"] <= 2018) & (data_merged["Year"] >= 2016)]

# Feature Preprocessing
data_preprocess = data_merged.loc[(data_merged["Year"] >= 2016)].copy()
data_preprocess["Gust_lin"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["Gust_spline3"] = data_preprocess["Gust"].interpolate(method="spline", order=3)
data_preprocess["Gust_spline5"] = data_preprocess["Gust"].interpolate(method="spline", order=5)
data_preprocess["Gust_quad"] = data_preprocess["Gust"].interpolate(method="quadratic")
data_preprocess["Gust"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["DewPoint_old"] = data_preprocess["DewPoint"]
data_preprocess["DewPoint_diff7d"] = data_preprocess["DewPoint"] - data_preprocess["DewPoint"].shift(7)
data_preprocess["DewPoint"] = data_preprocess["DewPoint_diff7d"]
data_preprocess["MinTemp_old"] = data_preprocess["MinTemp"]
data_preprocess["MinTemp_log"] = data_preprocess["MinTemp"].apply(np.log1p)
data_preprocess["MinTemp_log_diff7d"] = data_preprocess["MinTemp_log"] - data_preprocess["MinTemp_log"].shift(7)
data_preprocess["MinTemp"] = data_preprocess["MinTemp_log_diff7d"]
# Final Preprocessed Variables
data_final = pd.read_csv("data/data_final.csv")
data_final = create_datetime(data_final, "Datetime", format="%Y-%m-%d")
data_final = data_final.set_index("Datetime")
test = data_final[-7:]
dataset = data_final[:-7]
split_point = int(len(dataset) * 0.75)
train, val = dataset[:split_point], dataset[split_point:]
X_train, y_train = train.drop(columns="Target"), train["Target"]
X_val, y_val = val.drop(columns="Target"), val["Target"]
X_test, y_test = test.drop(columns="Target"), test["Target"]
forecast_model = xgb.XGBRegressor()
forecast_model.load_model("models/final_model.json")
# Current Predictions
r2_train = 0.8691238468740025
mape_train = 0.04889510400934162
r2_val = 0.6072642783665692
mape_val = 0.6072642783665692
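# The hard-coded scores above would be reproduced by evaluating the loaded model on the
# train/validation splits built earlier; a minimal sketch (not re-run here, to keep app
# start-up fast) using the sklearn metrics already imported above:
def compute_scores(model, X, y):
    """Illustrative: R^2 and MAPE for a fitted regressor on one split."""
    preds = model.predict(X)
    return r2_score(y, preds), mean_absolute_percentage_error(y, preds)
# e.g. r2_train, mape_train = compute_scores(forecast_model, X_train, y_train)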
# Initial Variables
reports = {
    "weather_2011-2018": BeautifulSoup(open("reports/weather_data_ts.html"), "html.parser"),
    "weather_2016-2018": BeautifulSoup(open("reports/weather_data_after2016_ts.html"), "html.parser"),
    "service_full": BeautifulSoup(open("reports/311_data_1.html"), "html.parser"),
}
iframe_dp_weather, _ = find_variable_data(reports["weather_2011-2018"], "MeanTemp")
iframe_dp_service, _ = find_variable_data(reports["service_full"], "Created Date")
# Code Variables to show in app
load_code = """
# Load Weather Data in pandas
# No need for polars because data is sufficiently small
weather_data = pd.read_csv("data/weather_NY_2010_2018Nov.csv")

# Load Service data in polars for speed optimization
# Loading directly with polars leads to errors
# Load in pandas then convert to polars
service_data_pd = pd.read_csv("data/311-2016-2018.csv")
assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)

# This casting is done just because of some errors when loading pl from pandas
service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
service_data = pl.DataFrame(service_data_pd)

# Clear some RAM
del service_data_pd
gc.collect()"""
map_code = """
lat_min = service_data["Latitude"].min()
lat_max = service_data["Latitude"].max()
long_min = service_data["Longitude"].min()
long_max = service_data["Longitude"].max()

mincon_lat = weather_data["Latitude"] >= lat_min
maxcon_lat = weather_data["Latitude"] <= lat_max
mincon_long = weather_data["Longitude"] >= long_min
maxcon_long = weather_data["Longitude"] <= long_max

wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
"""
Closed_Ticket_Code = """
# Fill nulls and typos with the mean time diff (13 days)
service_data = service_data.with_columns(
    Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1))
        .then(pl.col("Created Date") + pl.duration(days=mean_diff))
        .otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))
)
# Check for no null values
assert service_data["Closed_Date_New"].is_null().sum() == 0

# Pairwise GroupBy and Filter
closed_tickets = (
    service_data.group_by(["Closed_Date_New", "Created Date"])
    # Keep only rows where Created Date <= Closed Date
    .agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count"))
    # Sort by the new Closed_Date_New column
    .sort("Closed_Date_New")
    # Keep only Closed Dates inside the 2016-2018 window
    .filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019))
    # Final GroupBy on Closed Date after filtering
    .group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets"))
)
ct_df = closed_tickets.with_columns(
    pl.col("num_closed_tickets")
)
"""
topic_model = BERTopic.load("models/BERTopic")

def plot_imputations(var, data, imputers=imputers):
    plt.close('all')
    fig = plt.figure(figsize=(15, 5))
    plt.plot(data["Datetime"][-800:], data[var][-800:], label="Actual")
    plt.title(f"{var} Imputation")
    for method in imputers:
        plt.plot(imputers[method]["Datetime"], imputers[method][var], label=method)
    plt.legend()
    return gr.update(value=fig)
def plot_timeseries(data, var, data_name="My", all_vars=[], height=800, width=600):
    plt.close('all')
    if var == "":
        return gr.update()
    from utils import plot_timeseries
    fig = plot_timeseries(data, var, data_name, all_vars, height, width)
    return gr.update(value=fig)

def plot_bivariate(data, x, y, subset=None, trendline=True):
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
        "": None,
    }
    subset = map_var[subset]
    from utils import plot_bivariate
    fig = plot_bivariate(data, x, y, subset, trendline)
    return gr.update(value=fig)

def plot_seasonality(data, x, y, show_box=True, show_outliers=False):
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
    }
    x = map_var[x]
    from utils import plot_seasonality
    fig = plot_seasonality(data, x, y, show_box, show_outliers)
    return gr.update(value=fig)
def plot_correlations(data, covar, target="Target", lags=[0,1,2,3,4,5,6,7,8,13,14,15,21], method="pearson"):
    plt.close('all')
    from utils import plot_correlations
    fig = plot_correlations(data, covar, target, lags, method)
    return gr.update(value=fig)

def plot_autocorr(data, var, apply=None):
    plt.close('all')
    from utils import plot_acf, plot_pacf
    time_series = data.loc[:, var].to_frame().copy()
    if apply:
        time_series[var] = time_series[var].apply(apply)
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    _ = plot_acf(time_series[var], lags=30, ax=ax[0])
    _ = plot_pacf(time_series[var], lags=30, method="ols-adjusted", ax=ax[1])
    _ = plt.suptitle(f"{var}", y=0.95)
    return gr.update(value=fig)

def plot_all_correlations(data, data_name="weather", method="pearson"):
    plt.close('all')
    from utils import plot_all_correlations
    fig = plot_all_correlations(data, data_name, method)
    return fig

def run_report(report_base, variable_name, report_category="full"):
    report_name = report_base + "_" + report_category
    iframe, _ = find_variable_data(reports[report_name], variable_name)
    return gr.update(value=iframe)

def test_stationary(data, var):
    from utils import test_stationary
    df = test_stationary(data, var)
    return df
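# utils.test_stationary is not shown in this file. The Feature Preprocessing tab describes
# it as running both ADF and KPSS; a minimal sketch of that idea with statsmodels (function
# name and output layout here are assumptions, not the actual utils implementation):
def adf_kpss_sketch(series):
    """Illustrative stationarity check: ADF (H0: unit root) + KPSS (H0: stationary)."""
    from statsmodels.tsa.stattools import adfuller, kpss
    series = series.dropna()
    adf_stat, adf_p, *_ = adfuller(series)
    kpss_stat, kpss_p, *_ = kpss(series, regression="c", nlags="auto")
    return pd.DataFrame({
        "Test": ["ADF", "KPSS"],
        "Statistic": [adf_stat, kpss_stat],
        "p-value": [adf_p, kpss_p],
        "Stationary at 5%": [adf_p < 0.05, kpss_p > 0.05],
    })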
def plot_interpolation(data):
    plt.close('all')
    from utils import plot_gust_interpolation
    fig = plot_gust_interpolation(data)
    return fig

def plot_model_feature_importance():
    plt.close('all')
    from utils import plot_final_feature_importance
    fig = plot_final_feature_importance(forecast_model)
    return fig
def plot_final_predictions():
    plt.close('all')
    from utils import predict_recurse
    next_7_day_prediction = predict_recurse(dataset, test, forecast_model)
    fig, ax = plt.subplots(figsize=(15, 5))
    data_final.loc[data_final.index[-7:], "Target"] = next_7_day_prediction
    ax = data_final.loc[data_final.index[-96:-6], "Target"].plot(ax=ax, label="Real", title="311 Service Volume: 7 Day Prediction")
    data_final.loc[data_final.index[-7:], "Target"].plot(label="Forecast", ax=ax)
    ax.legend()
    curr_fig = plt.gcf()
    plt.close()
    return curr_fig
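# utils.predict_recurse is only shown to app users via code/recurse_predict.py. As a rough
# sketch of the recursive idea described in the "Model Prediction" tab: predict one day,
# write that prediction back into the lag features for the next day, then repeat. The lag
# feature names ("Target_lag_1", "Target_lag_7") are placeholders, not the real columns,
# and this helper is never called by the app.
def recursive_forecast_sketch(history, future_X, model, lag_cols=("Target_lag_1", "Target_lag_7")):
    """Illustrative recursive multi-step forecast; assumes lag features exist in future_X."""
    preds = []
    targets = list(history["Target"].values)
    for i in range(len(future_X)):
        row = future_X.iloc[[i]].copy()
        # refresh lag features from the most recent (possibly predicted) targets
        for col in lag_cols:
            lag = int(col.split("_")[-1])
            row[col] = targets[-lag]
        pred = float(model.predict(row.drop(columns="Target", errors="ignore"))[0])
        preds.append(pred)
        targets.append(pred)
    return preds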
def plot_train_split():
    plt.close('all')
    from utils import plot_train_split
    fig = plot_train_split(train, val)
    return fig

def plot_val_predictions():
    data = val.copy()
    data["Prediction"] = preds_val
    from utils import plot_predictions
    fig = plot_predictions(train, val, preds_val)
    return fig
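# `preds_val` is used in plot_val_predictions above but is never defined in this file; it
# presumably came from the training notebook. Recreating it here from the objects defined
# above is an assumption about how it was originally produced, so it is guarded to avoid
# breaking app start-up on a feature mismatch.
try:
    preds_val = forecast_model.predict(X_val)
except Exception:
    preds_val = None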
curr_theme = gr.themes.Default(
    text_size=gr.themes.sizes.text_lg
)
with gr.Blocks(theme=curr_theme, js=dark_mode, css=open("custom.css", "r").read()) as app:
title = gr.HTML("""<h1 align="center">Point72 Case Study</h1>""")
with gr.Tabs() as pages:
with gr.Tab("Overview") as toc_page:
gr.Markdown("# My Point72 Case Study Results")
gr.Markdown("""
* Please follow the tabs sequentially, left to right, to get the full story of my work
* There are many interactive sections where you can test and view different parameters
* This app may also be built and run locally
* This app is hosted and served from a cloud VM instance
* Any questions, please email me: [email protected]
""")
with gr.Tab("Data Preprocessing") as data_preprocessing_page: | |
with gr.Tab("Data Loading") as dp_overview: | |
gr.HTML("<h1 style=\"text-align: center;\">Loading the Data</h1>") | |
gr.Markdown("## Goal: Load the Data as efficiently as possible") | |
gr.Markdown("""
* Using pandas alone for everything is **slow and inefficient**.
* For small datasets, pandas is great because its API is robust.
* For medium datasets, a library like Polars (a Rust-based DataFrame library, roughly 10x faster than pandas) is much faster.
* As data gets even larger, distributed frameworks like Spark are required.
* For this dataset, I use pandas for the weather data and Polars for the 311 data. After the aggregation and merge, I revert back to pandas for API compatibility.
""")
with gr.Accordion("Code", open=False): | |
gr.Code(load_code, language="python") | |
with gr.Tab("Location Mapping") as dp_overview: | |
src_doc = html.escape(open("figures/map1.html","r").read()) | |
iframe1 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>' | |
src_doc = html.escape(open("figures/map2.html","r").read()) | |
iframe2 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>' | |
src_doc = html.escape(open("figures/bounded_map.html","r").read()) | |
iframe3 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>' | |
src_doc = html.escape(open("figures/final_map.html","r").read()) | |
iframe4 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>' | |
gr.HTML("<h1 style=\"text-align: center;\">Location Mapping for Both Datasets</h1>") | |
with gr.Row(elem_classes="map-legend"): | |
gr.Markdown(""" | |
**Legend:** | |
* <span style=\"color: red\">Red:</span> Weather records | |
* <span style=\"color: #5989ff\">Blue:</span> 311 Service records | |
""", elem_classes="map-legend-text") | |
with gr.Row(): | |
with gr.Column(): | |
gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York State</h1>") | |
map1 = gr.HTML(iframe1, elem_classes="map") | |
with gr.Column(): | |
gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York City</h1>") | |
map2 = gr.HTML(iframe2, elem_classes="map") | |
with gr.Row(): | |
gr.Markdown("""
Juxtaposing these two maps and their approximate distributions of observations,
it's easy to see the problem: the weather dataset covers a much larger area than the 311 Service call dataset.
Once this was diagnosed, the solution was simple. First, find the coordinate (Lat, Long) bounds
of the 311 Service dataset. Then filter the weather dataset to only include points that fall within
these bounds. This was one of my initial discoveries when analyzing the datasets and was crucial to ensure
congruity between the two. **Below you can see the bounding box I created and how the filtered weather
observations fit inside it.**
""")
with gr.Row(): | |
with gr.Column(): | |
map3 = gr.HTML(iframe3, elem_classes="map") | |
with gr.Column(): | |
map4 = gr.HTML(iframe4, elem_classes="map") | |
with gr.Accordion("Code", open=False): | |
gr.Code(map_code, language="python") | |
with gr.Tab("Variable Pruning") as var_pruning: | |
gr.HTML("<h1 style=\"text-align: center;\">How I pruned the datasets</h1>") | |
gr.Markdown("## Goal: Remove as many useless features as possible") | |
gr.HTML("<h3 style=\"color: darkorange;\">Key Factors for Feature Removal</h3>") | |
gr.Markdown(""" | |
* Percentage of missing data points | |
* Distribution Imbalance | |
* Irrelevance | |
* Number of distinct categories | |
* Another variable was chosen as replacement <br/><br/> | |
NOTE: Look in the appendix for visualizations of individual variables | |
""") | |
dropped_var_df = pd.read_excel("data/drop_vars.xlsx")
gr.Dataframe(
dropped_var_df,
wrap=True,
label="Dropped Variables & Justification (Weather on Bottom)"
)
with gr.Tab("Time Aggregation") as time_agg: | |
gr.HTML("<h1 style=\"text-align: center;\">Aggregate Data by Date</h1>") | |
gr.Markdown("## Goal: Aggregate data by Date") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: 311 Service data is not inherently formatted to provide Created Ticket Counts</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Data must be aggregated by day to find ticket counts</li> | |
<li>Covariate features need a special transformation</li> | |
<li>Final Aggregations Mapping</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>Created Date ==> groupby.count ==> Target (Created ticket count)</li> | |
<li>Closed Date ==> Agg* ==> Number of closed tickets (Agg* explained in next tabs)</li> | |
<li>Agency ==> Agg* ==> Number of tickets by Agency (Agg* explained in next tabs)</li> | |
<li>Borough ==> Agg* ==> Number of tickets by Borough (Agg* explained in next tabs)</li>
<li>Descriptor ==> Agg* ==> Number of tickets by Descriptor Group/Category (Agg* explained in next tabs)</li> | |
</ul> | |
</ul>""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Weather data is not aggregated by day</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>To merge with 311 Service data, both datasets must be aggregated</li> | |
<li>Additional transformations may be applied only after time aggregation</li> | |
<li>Aggregation function needs to be handled feature by feature</li> | |
<li>Final Aggregation Mapping</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>MaxTemp, MaxSustainedWind ==> groupby.max ==> Variables have an inherent max feature</li> | |
<li>MinTemp ==> groupby.min ==> Variable has an inherent min feature</li> | |
<li>Rain, SnowIce ==> groupby.mean.round ==> Binary variables are first aggregated, then rounded back to binary</li>
<li>All Other Variables ==> groupby.mean ==> Mean used by default as it is the least lossy pooling method</li>
</ul> | |
</ul>""") | |
with gr.Tab("Weather Data: Imputation") as wd_impute: | |
gr.HTML("<h1 style=\"text-align: center;\">Data Imputation</h1>") | |
gr.Markdown("## Goal: Impute missing values in Weather Data") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue: Weather data is incomplete, 49 days are missing in 2018</h3>") | |
gr.Markdown("#### Proposed Solution: Use a simple imputer to fill these missing days + 7 more days into the \"future\"") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Use a simple imputer rather than a robust imputation method to reduce model complexity</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>Using a robust imputer = conducting a multivariate forecast: very complex & can be slow</li>
<li>Using a simple imputer = Low complexity, low latency</li> | |
</ul> | |
<li>Simple imputer applies an aggregate function using Day Of Year (1-366) as the interval</li> | |
<li>4 different Imputation Methods: Mean, Median, Min, Max</li> | |
<li>7 additional days are imputed so the weather data can be used as a future covariate in our model</li> | |
<li>Final Aggregation Mapping</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>WindSpeed, MaxSustainedWind, Gust, SnowDepth => Use Mean => Noisy Variables, Non-Mean/Median methods are too biased, curve best fit with Mean</li> | |
<li>Rain => Use Max => Binary Variables with noise, min/mean/median imputes 0, which does not follow the trend</li> | |
<li>SnowIce => Use Min (impute 0) => Binary variables but mostly 0's, any other imputation is visually inaccurate</li> | |
<li>MeanTemp, MinTemp, MaxTemp, DewPoint, Percipitation => Use Min => Perhaps helping to remove non-stationarity (global warming), Winter is colder now than before, Curve best fits with min</li> | |
</ul> | |
</ul>""") | |
gr.Markdown("Use plots below to view the plots used to help justify above reasoning") | |
with gr.Accordion("Show Plots", open=False): | |
impute_data = gr.State(wd_full_local) | |
impute_choices = ["None"] | |
impute_choices.extend(impute_cols) | |
wd_impute_col = gr.Dropdown( | |
choices=impute_choices, | |
value="None", | |
label="Choose a Variable to plot all imputation methods" | |
) | |
wd_impute_plot = gr.Plot() | |
wd_impute_col.change( | |
plot_imputations, | |
[wd_impute_col, impute_data], | |
[wd_impute_plot] | |
) | |
with gr.Tab("311: Closed Ticket Counting") as ct_date: | |
gr.HTML("<h1 style=\"text-align: center;\">Closed Ticket Feature</h1>") | |
gr.Markdown("## The Closed Ticket Feature is built from the Closed Date column similarly to how Created Date was used to generate new 311 Call Volume") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Data Error, Typos, and/or Null valuess</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Number of Null Values: </li> | |
<li>Number of Closed Dates where Closed Date < Created Date: </li>
<ul style="padding-inline-start: 40px;"> | |
<li>These values were most likely typos/data recording errors</li> | |
<li>For instance, some of these values dated to 1900</li> | |
</ul> | |
<li>SOLUTION: For every data error, impute with the mean difference (recompute Closed Date based off Created)</li> | |
<li>Mean is calculated as the mean time differential between all valid Closed & Created Dates</li> | |
<li>Mean Time Differential: 13 Days</li> | |
</ul>""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Data Leakage - Future into Past</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>On average, Closed Dates fall about 13 days after their Created Dates</li>
<li>GroupBy Closed Date only will lead to some closed ticket counts leaking into future created dates</li> | |
<li>SOLUTION: GroupBy [Closed Date, Created Date] pairwise, filter so Created Date < Closed Date</li> | |
</ul>""") | |
with gr.Accordion("Code", open=False): | |
gr.Code(Closed_Ticket_Code, language="python") | |
with gr.Tab("311: Categorical Grouping") as cat_groups: | |
bertopic_state = gr.State(topic_model)
gr.HTML("<h1 style=\"text-align: center;\">Categorical Features</h1>") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Categorical Features have too many categories</h3>") | |
gr.Markdown("#### Create a mapping of categories into groups to reduce total number (Viewable at the bottom of the page)") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Borough:</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>Only 9 Categories without grouping</li> | |
<li>Four Categories are either typos or just null => Group all into OTHER</li> | |
</ul> | |
<li>Agency:</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>30 Agencies in total are listed</li> | |
<li>Manual Research to group each Agency by Category of what they typically do</li> | |
<li>30 Agencies down to 7 Agency Groupings, based on frequency and research</li> | |
</ul> | |
<li>Complaint Type: Removed because analysis showed complaints were too related to the agency</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>299 unique pairs out of 271 unique complaints => only ~10% difference in distribution</li> | |
</ul> | |
<li>Descriptor: Over 1000+ unique categories. Only way to realistically group is to use NLP</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>Pretrained a BERTopic model to extract topics from the text</li> | |
<li>BERTopic uses TF-IDF & Transformers to extract topics from text</li> | |
<li>BERTopic reduced 1000 categories into 8 groups</li> | |
</ul> | |
</ul>""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: How do we aggregate by day these features when there are multiple repeated categories per day</h3>") | |
gr.Markdown("#### One Hot Encode and Sum per category") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Step 1: One hot encode all the features before aggregation</li> | |
<li>Step 2: GroupBy date and Sum for each encoding</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>Example: A categorical group with 4 categories</li> | |
<li>One Sum column per category representing the frequency of that category per day</li> | |
</ul> | |
<li>Main Downside: Highly correlated with Created Ticket data; aggregation method was essentially the same</li> | |
<ul style="padding-inline-start: 40px;"> | |
<li>Summing across the four feature categories in the example above would just equal the ticket count</li> | |
</ul> | |
<li>Solution: Leave some categories out of final vector to reduce bias (Shown in feature engineering stage)</li> | |
</ul>""") | |
with gr.Accordion("View Feature Groups", open=False): | |
with gr.Accordion("Borough", open=False): | |
gr.JSON(json.loads(open("code/Borough.json", "r").read())) | |
with gr.Accordion("Agency", open=False): | |
gr.JSON(open("code/Agency.json", "r").read()) | |
with gr.Accordion("Descriptor", open=False): | |
gr.Dataframe(topic_model.get_topic_info().loc[:, ["Count", "Name", "Representation"]]) | |
gr.Plot(topic_model.visualize_barchart(list(range(-1,6,1)))) | |
with gr.Tab("All Code") as code_preprocess: | |
gr.Markdown("# View Full Code for building Weather Data") | |
with gr.Accordion(open=False): | |
gr.Code(open("code/build_weather.py", "r").read()) | |
gr.Markdown("# View Full Code for building 311 Service Data") | |
with gr.Accordion(open=False): | |
gr.Code(open("code/build_service.py", "r").read()) | |
with gr.Tab("Exploratory Data Analysis", id="eda_page") as eda_page: | |
bivar_data = gr.State(data_merged_eda) | |
with gr.Tab("Overview", id="eda_overview") as eda_overview: | |
gr.Markdown("# The EDA Section is intended to be a set of interactive visualizations") | |
gr.Markdown("The tabs are interactive plots and tables that were used to generate the key insights below.") | |
gr.HTML("<h3 style=\"color: darkorange;\">Key Insights</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Missing Values:</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>Gust if used may need interpolation to fill missing values</li> | |
</ul> | |
<li>Stationarity</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li> | |
<ul style="padding-inline-start: 60px; font-size: 18px;"> | |
<li>Trends are clear for some like Temperature and DewPoint</li> | |
<li>Possible causes of constant non-stationarity are factors such as global warming</li>
</ul> | |
<li>311 Calls may exhibit some forms of weekly non-stationarity</li> | |
<ul style="padding-inline-start: 60px; font-size: 18px;"> | |
<li>Potentially weekly and monthly non-stationarity</li> | |
<li>Affected by Holidays and Weekends</li> | |
<li>More robust tests needed</li> | |
</ul> | |
<li>Action Item: Test for stationarity and remove</li> | |
</ul> | |
<li>Bivariate Interactions:</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>311 Calls have stronger relationships with certain Agency, Borough and Descriptor categories</li> | |
<li>311 calls exhibit weak overall linear relationships with weather</li>
<ul style="padding-inline-start: 60px; font-size: 18px;"> | |
<li>Monthly and Seasonal relationship is strongest in winter months</li> | |
<li>Month Of January: strongest linear relationship between MinTemp, DewPoint</li> | |
</ul> | |
</ul> | |
<li>Seasonality:</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>Weather variables exhibit a strong Yearly and Seasonal seasonality</li> | |
<li>311 Service Variables exhibit Weekly Seasonality</li> | |
<li>311 variables are strongly affected by holidays and weekends (fewer 311 calls on weekends and holidays)</li>
</ul> | |
<li>Correlation:</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>Heavy Collinearity among weather variables (especially Min, Mean, MaxTemp)</li> | |
<li>Varying degrees of correlation among 311 covariates and 311 volume</li> | |
</ul> | |
<li>Lags & Autocorrelation:</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>311 Service Calls have highest correlation with 7,14,21 weekly lags</li> | |
<li>6,8 day lag intervals second strongest relationship. 8 day exhibits some negative correlation</li> | |
<li>1 day lag exhibits similar correlation with 6,7 day lags</li> | |
</ul> | |
</ul>""") | |
with gr.Tab("Univariate", id="eda_univar") as eda_univar: | |
with gr.Tab("Weather Data") as eda_uni_weather: | |
eda_univar_weatherdf = gr.State(weather_full_df) | |
gr.Markdown("# Use the Interactive plot below") | |
eda_uni_weather_name = gr.State("Weather") | |
weather_vars = [ | |
"", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth', | |
'MinTemp', 'MaxTemp', 'MaxSustainedWind' | |
] | |
select_weather_var = gr.Dropdown( | |
choices=weather_vars, | |
value="", | |
label="Select a Variable to View" | |
) | |
weather_uniplot = gr.Plot() | |
select_weather_var.change( | |
plot_timeseries, | |
inputs=[ | |
eda_univar_weatherdf, | |
select_weather_var, | |
eda_uni_weather_name | |
], | |
outputs=[ | |
weather_uniplot | |
] | |
) | |
with gr.Tab("311 Service Data") as eda_uni_weather: | |
eda_univar_servicedf = gr.State(data_merged_eda) | |
gr.Markdown("# Use the Interactive plot below") | |
gr.Markdown("**NOTE: Target is the count of 311 service records**") | |
eda_uni_service_name = gr.State("Service")
service_vars = [ | |
"", 'Target', 'num_closed_tickets', | |
# Agency Group Counts | |
'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health', | |
'AG_Parks', 'AG_Security', 'AG_Transportation', | |
'AG_Other', | |
# Borough Counts | |
'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN', | |
'Borough_QUEENS', 'Borough_STATEN ISLAND', | |
'Borough_OTHER', | |
# Descriptor Group Counts | |
'DG_damaged_sign_sidewalk_missing', | |
'DG_english_emergency_spanish_chinese', | |
'DG_exemption_commercial_tax_business', | |
'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead', | |
'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition', | |
'DG_water_basin_litter_missed' | |
] | |
select_service_var = gr.Dropdown( | |
choices=service_vars, | |
value="", | |
label="Select a Variable to View" | |
) | |
service_uniplot = gr.Plot() | |
select_service_var.change( | |
plot_timeseries, | |
inputs=[ | |
eda_univar_servicedf, | |
select_service_var, | |
eda_uni_service_name | |
], | |
outputs=[ | |
service_uniplot | |
] | |
) | |
with gr.Tab("Bivariate", id="eda_bivar") as eda_bivar: | |
gr.Markdown("# Use the Interactive plot below") | |
gr.Markdown("Use this tab to view relationships between the Target variable (number of tickets created daily) and a Covariate") | |
with gr.Column(): | |
with gr.Row() as bivar_params: | |
bivar_dist_target = gr.Dropdown( | |
choices=["Target"], | |
value="Target", | |
label="Target Variable (One option)" | |
) | |
all_bivars = ['num_closed_tickets', "Agency", "Borough", "Descriptor"] | |
all_bivars.extend(weather_vars) | |
all_bivars = sorted(all_bivars) | |
all_bivars = all_bivars[1:] | |
bivar_dist_cov = gr.Dropdown( | |
choices=all_bivars, | |
value="MeanTemp", | |
label="Select Covariate" | |
) | |
bivar_trendline = gr.Dropdown( | |
choices=[True, False], | |
value=True, | |
label="Graph with OLS Trendline" | |
) | |
with gr.Accordion("Add Seasonality", open=False): | |
bivar_subset = gr.Dropdown( | |
choices=["None", "Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday"], | |
value="None", | |
label="Seasonality Options (Disabled for Agency, Borough and Descriptor)" | |
) | |
bivar_submit = gr.Button("Run") | |
bivar_plot = gr.Plot() | |
bivar_submit.click( | |
plot_bivariate, | |
[bivar_data, bivar_dist_cov, bivar_dist_target, bivar_subset, bivar_trendline], | |
bivar_plot | |
) | |
with gr.Tab("Seasonality") as bivar_season: | |
gr.Markdown("## Exploring the affect of Seasonality") | |
with gr.Row() as bivar_season_params: | |
bivar_season_var = gr.Dropdown( | |
choices=["Target", 'MeanTemp', 'DewPoint', | |
'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth', | |
'MinTemp', 'MaxTemp', 'MaxSustainedWind'], | |
value="Target", | |
label="Variable" | |
) | |
bivar_season_cov = gr.Dropdown( | |
choices=["Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday", "Rain", "SnowIce"], | |
value="Year", | |
label="Seasonality" | |
) | |
with gr.Column(): | |
season_boxplot = gr.Checkbox(value=True, label="Show Boxplot") | |
season_outlier = gr.Checkbox(value=False, label="Show Outliers") | |
bivar_season_btn = gr.Button("Run") | |
bivar_season_plot = gr.Plot() | |
bivar_season_btn.click( | |
plot_seasonality, | |
[bivar_data, bivar_season_cov, bivar_season_var, season_boxplot, season_outlier], | |
[bivar_season_plot] | |
) | |
with gr.Tab("Correlation") as corr: | |
with gr.Tab("Weather Correlations") as corr_weather: | |
gr.Plot(plot_all_correlations(data_merged_eda, "weather", method="pearson")) | |
with gr.Tab("311 Service Correlations") as corr_service: | |
gr.Plot(plot_all_correlations(data_merged_eda, "service", method="pearson")) | |
with gr.Tab("Lag Correlations") as corr_dynamic: | |
gr.Markdown("## Use this to dynamically view correlations based on Lag") | |
gr.Markdown("By Default, we will analyze lags of [0,1,2,3,4,5,6,7,8,13,14,15,21] days for chosen variable") | |
gr.Markdown("Scroll Down For AutoCorrelation Graphs") | |
with gr.Row(): | |
corr_vars = [ | |
"None", 'Target', 'num_closed_tickets', | |
# Weather Variables | |
'MeanTemp', 'DewPoint', 'Percipitation', | |
'WindSpeed', 'Gust', 'SnowDepth', | |
'MinTemp', 'MaxTemp', 'MaxSustainedWind', | |
# Agency Group Counts | |
'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health', | |
'AG_Parks', 'AG_Security', 'AG_Transportation', | |
'AG_Other', | |
# Borough Counts | |
'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN', | |
'Borough_QUEENS', 'Borough_STATEN ISLAND', | |
'Borough_OTHER', | |
# Descriptor Group Counts | |
'DG_damaged_sign_sidewalk_missing', | |
'DG_english_emergency_spanish_chinese', | |
'DG_exemption_commercial_tax_business', | |
'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead', | |
'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition', | |
'DG_water_basin_litter_missed' | |
] | |
corr_vars = gr.Dropdown( | |
choices=corr_vars, | |
value="Target", | |
label="Variable" | |
) | |
corr_btn = gr.Button("Run") | |
corr_plot = gr.Plot() | |
autocorr_plot = gr.Plot() | |
corr_btn.click( | |
plot_correlations, | |
[bivar_data, corr_vars], | |
[corr_plot] | |
) | |
corr_btn.click( | |
plot_autocorr, | |
[bivar_data, corr_vars], | |
[autocorr_plot] | |
) | |
with gr.Tab("Feature Engineering") as feature_engineer_page: | |
with gr.Tab("Feature Selection") as feature_select: | |
gr.HTML("<h1 style=\"text-align: center;\">Select Features Based on EDA</h1>") | |
gr.Markdown("### Below is the logic used in our model feature selection") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Weather Covariates</li> | |
<ul style="padding-inline-start: 30px; font-size: 18px;"> | |
<li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li> | |
<li>MeanTemp, MaxTemp: High collinearity with MinTemp. MinTemp has highest correlation of 3 => REMOVE</li> | |
<ul style="padding-inline-start: 50px; font-size: 18px;"> | |
<li>Possible Reason: In high temps people stay indoors and A/C rarely breaks, while lower temps lead to building/tech failures more often</li>
</ul> | |
<li>Percipitation: Bivariate plot shows weak relationship, outliers no effect on 311 => REMOVE</li> | |
<li>SnowDepth: High number missing values, low correlation => REMOVE</li> | |
<li>Rain, SnowIce: Binary, plots (look in Seasonality Tab) show weak relationship, SnowIce heavily imbalanced (99% 0's) => REMOVE</li> | |
</ul> | |
<li>311 Service Covariates:</li> | |
<ul style="padding-inline-start: 30px; font-size: 18px;"> | |
<li>LOO (Leave One - or many - Out) Encoding:</li> | |
<ul style="padding-inline-start: 50px; font-size: 18px;"> | |
<li>Remove weakest features from our categorical covariates</li> | |
<li>Reduces bias and removes multicollinearity inherent to One-Hot Encoding</li> | |
<li>Candidates For Removal:</li> | |
<ul style="padding-inline-start: 70px; font-size: 18px;"> | |
<li>AG_Health, AG_Other: Lowest Correlation, lowest counts => REMOVE</li> | |
<li>AG_Parks: Lowest Correlation, but low multi-collinearity => KEEP</li> | |
<li>Borough_OTHER: Weakest Correlation, lowest count => REMOVE</li> | |
<li>DG_english_emergency, DG_exemption_commercial: Weakest Correlation, lowest counts => REMOVE</li> | |
<li>DG_odor_food_air_smoke: Lowest Count, but high correlation => KEEP</li> | |
</ul> | |
</ul> | |
</ul> | |
</ul>""") | |
with gr.Accordion("Show Final Variable List", open=False): | |
gr.JSON(json.loads(open("code/all_vars.json","r").read())) | |
with gr.Tab("Feature Preprocessing") as feature_prep: | |
data_feature_prep = gr.State(data_preprocess) | |
gr.HTML("<h1 style=\"text-align: center;\">Preprocess Features</h1>") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Missing Values</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Only One value has missing values to impute: Gust</li> | |
<ul style="padding-inline-start: 30px; font-size: 18px;"> | |
<li>Various interpolation methods were tested</li> | |
<li>Methods like Spline and Polynomial over-estimated some values, breaking inherent data ranges</li> | |
<li>Turns out Simple Linear interpolation was best</li> | |
</ul> | |
<li>SOLUTION: Interpolate Gust with Linear method</li> | |
</ul>""") | |
with gr.Accordion("Show Interpolation Plots", open=False): | |
gr.Plot(plot_interpolation(data_preprocess)) | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Remove Non-Stationarity</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Variables that are non-stationary change over time, they have a trend</li> | |
<li>Ideal to transform non-stationary variables for modeling</li>
<li>Ignore Categorical Variables (simply to keep model complexity low)</li> | |
<li>Numerical Variables were tested for Non-Stationarity using two methods: ADF and KPSS</li> | |
<ul style="padding-inline-start: 30px; font-size: 18px;"> | |
<li>Using ADF and KPSS together can reveal what kind of trend exists in the data</li> | |
<li>Only 1 Case Met: Pass KPSS, Fail ADF = Trend Stationary (most likely by season)</li> | |
</ul> | |
<li>Only Two Variables failed the tests: DewPoint & MinTemp</li> | |
<li>SOLUTION: Use Differencing (7d lag) + Log for MinTemp and Differencing (7d lag) for DewPoint (Log caused many NaNs)</li> | |
</ul>""") | |
with gr.Accordion("View Results Below", open=False): | |
gr.Markdown("### MinTemp (Log) Tests Before and After Transformation") | |
with gr.Row(): | |
with gr.Column(): | |
gr.Dataframe(test_stationary(data_preprocess, "MinTemp_old"), label="MinTemp No Augments") | |
with gr.Column(): | |
gr.Dataframe(test_stationary(data_preprocess, "MinTemp"), label="Log + 7 Day Lag Differencing") | |
gr.Markdown("### DewPoint Tests Before and After Transformation") | |
with gr.Row(): | |
with gr.Column(): | |
gr.Dataframe(test_stationary(data_preprocess, "DewPoint_old"), label="DewPoint No Augments") | |
with gr.Column(): | |
gr.Dataframe(test_stationary(data_preprocess, "DewPoint"), label="7 Day Lag Differencing") | |
with gr.Tab("Feature Engineering") as feature_eng: | |
with gr.Tab("Past Covariates") as fe_past: | |
gr.HTML("<h1 style=\"text-align: center;\">Past Covariate Features</h1>") | |
gr.Markdown(""" | |
* Past Covariates are series for which only past values are available at prediction time
* For instance, using past sales of product B to predict future sales of product A
* There are two ways to use past covariates | |
* *Option 1:* Build a multi-variate forecast to predict these variables simultaneously | |
* *Option 2:* Use a sliding window and lags to provide past data (especially for multi-step forecasts) | |
""") | |
gr.Markdown("**I will use Option 2 to avoid building a very complex multi-variate model**") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Leaking Future Data into the past</h3>") | |
gr.Markdown(""" | |
* By using lags, I can shift my data in a way to avoid leaking past data into the future | |
* For predicting 7 days into the future, I must lag my data by at least 7 days | |
* Use a rolling window that will reset over time | |
""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Curse of Dimensionality</h3>") | |
gr.Markdown(""" | |
* Possible to use many variations of lags, rolling and differences to generate many features | |
* Too many features leads to the curse of dimensionality, i.e. Overfitting | |
* Thus, I keep my Feature Set as simple as possible | |
""") | |
gr.Markdown(""" | |
### Feature Set | |
* Lags: 7D, 14D, 21D | |
* Rolling (Shifted 7 Days forward): Mean of 14D (14D because the mean Closed - Created gap is ~13 days)
* Differencing (7D difference = 7D lag - 14D lag): 7D | |
""") | |
with gr.Accordion("Open to view implementation code", open=False): | |
gr.Code(open("code/past_features.py","r").read()) | |
with gr.Tab("Future Covariates") as fe_past: | |
gr.HTML("<h1 style=\"text-align: center;\">Past Covariate Features</h1>") | |
gr.Markdown(""" | |
* Future Covariates are data that I have about the future | |
* For Instance, I can use the projected revenue of Company A to predict daily sales | |
* For Future Covariates, I do not need to shift variables, though I also provide lags of up to 2 days
* I apply a rolling and expanding window as more features | |
* Also, I use mean and min to follow the logic learned in EDA. Minimum temp values seem to be more impactful on 311 volume | |
""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Curse of Dimensionality</h3>") | |
gr.Markdown(""" | |
* Similar to the Past Covariates, I keep my features as simple and as few as possible
* The more features, the more we may overfit | |
""") | |
gr.Markdown(""" | |
### Feature Set | |
* Lags: 0D, 1D, 2D | |
* Rolling: Mean & Min of last 14D | |
* Expanding Window: Max, Min (min-length of 14) | |
* Differencing already performed to remove trends | |
""") | |
with gr.Accordion("Open to view implementation code", open=False): | |
gr.Code(open("code/future_features.py","r").read()) | |
with gr.Tab("Target Variable") as fe_past: | |
gr.HTML("<h1 style=\"text-align: center;\">311 Service Calls Features</h1>") | |
gr.Markdown(""" | |
* For providing feature transformations of our Target, we can follow a similar process as above | |
* Main Difference: Lags of < prediction window need to be recomputed at each iteration | |
* So, for predicting at time (t+1) we need the predicted value at time (t) | |
* For a recursive prediction model, this means the model cannot make batch predictions without iterating | |
""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: More variables increase complexity for prediction</h3>") | |
gr.Markdown(""" | |
* The more features, the more overfitting & more computation | |
* As I will use a recursive model, these values must be recomputed at each step t+1 | |
* In favor of a less complex model, I choose as few features as possible (excluding rolling features, as they are prone to error when recalculated)
""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Leaking Future Data into the past</h3>") | |
gr.Markdown(""" | |
* Must be careful about how these features are computed | |
* For instance, for a rolling mean I would shift the data by 1 lag first and only then compute the rolling statistic
* For differencing, a 7D lag difference is really the 1D - 8D lag. (For t=8, 7D diff = t7-t1 not t8-t2) | |
""") | |
gr.Markdown(""" | |
### Feature Set | |
* Lags: 1D, 6D, 7D, 8D, 14D, 21D (based on highest correlations and weekly seasonality) | |
* Differencing: 7D, 14D | |
""") | |
with gr.Accordion("Open to view implementation code", open=False): | |
gr.Code(open("code/target_features.py","r").read()) | |
with gr.Tab("Forecast Model") as model_select_train_page: | |
with gr.Tab("Splitting the data") as model_data_split: | |
gr.HTML("<h1 style=\"text-align: center;\">Splitting Time-Series Data</h1>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Splitting Time-Series Data is different than splitting other data</li> | |
<li>Rather than splitting on random samples, you split the data by time, keeping the order intact</li>
<li>I took a 75% split approach: the split falls on the date that sits at 75% of the data's length</li>
</ul>""") | |
gr.Markdown("#### As an example, I provide a graph showing exactly how I split my data") | |
gr.Plot(plot_train_split()) | |
with gr.Tab("Model Selection") as model_data_split: | |
gr.HTML("<h1 style=\"text-align: center;\">Choosing the Right Model</h1>") | |
gr.Markdown("### Types of Forecast Models for Multi-Step Prediction") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Parallel Models: Train a model for each prediction (one for 1 day ahead, another for 2, etc.)</li> | |
<li>Recursive Models: Model makes a forecast, fills any values it needs for the next prediction, predicts again</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>One of the assumptions was to build a model that was reasonable for production</li> | |
<li>Parallel models are hard to maintain as the steps of prediction increase</li> | |
</ul> | |
<li>Decision: Recursive Model</li>
</ul>""") | |
gr.Markdown("### My Model Choice: XGBoost") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Reasons for choosing:</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>Industry standard for regression</li> | |
<li>Lightweight and relatively fast</li> | |
<li>Many parameters to tune, such as tree depth and regularization</li> | |
<li>Scale invariant - Data does not have to be scaled</li> | |
<li>Allows NaN values and categorical features without encodings (unused in my implementation)</li> | |
<li>Provides key explainability in its feature importance metrics</li> | |
</ul> | |
<li>Decision: Use XGBoost</li> | |
</ul>""") | |
with gr.Tab("Model Training") as model_data_split: | |
gr.HTML("<h1 style=\"text-align: center;\">Training the Model</h1>") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Overfitting</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Main Cause: High number of variables and XGBoost's tendency to overfit without tuning</li> | |
<li>While training, effort was made to watch the validation and training set's relative performance</li> | |
<li>Steps Taken to avoid Overfitting</li> | |
<ul style="padding-inline-start: 40px; font-size: 18px;"> | |
<li>Low Learning Rate</li> | |
<li>Low Tree Depth</li> | |
<li>Keeping Val score relatively close to Training score</li> | |
<li>Increased l2-lambda parameter, boosting regularization</li> | |
<li>Many trials to get best set of parameters</li> | |
<li>Implementing Early Stopping</li> | |
</ul> | |
</ul>""") | |
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Choosing a Metric</h3>") | |
gr.HTML(""" | |
<ul style="font-size: 18px"> | |
<li>Three metrics I considered: MAPE, MAE and MSE</li> | |
<li>MAPE seemed to show the most consistent and visually accurate results</li> | |
<li>Decision: MAPE</li> | |
<li>Justification: 311 Service volume is quite noisy and MAPE better estimates fit to a very noisy curve than the others</li> | |
</ul>""") | |
with gr.Tab("Model Prediction") as model_data_split: | |
gr.HTML("<h1 style=\"text-align: center;\">Recursive Model Prediction</h1>") | |
gr.Markdown(""" | |
* Below is the code I wrote to implement the Recursive prediction explained in previous tabs | |
* Predictions are made one step at a time, where the prediction t depends on prediction t-1 | |
* To view the final predictions made by the model see below | |
""") | |
gr.Code(open("code/recurse_predict.py","r").read()) | |
with gr.Accordion("View 7 Day Model Forecast", open=False): | |
gr.Plot(plot_final_predictions()) | |
with gr.Tab("Model Evaluation") as model_eval_page: | |
gr.HTML("<h1 style=\"text-align: center;\">Forecast Results</h1>") | |
gr.Markdown("Overall, the model seemed to have performed pretty well. The MAPE is also <10% for both Validation and Training sets.") | |
gr.Markdown("The model did suffer from a low validation R2, but this was difficult to resolve without compromising overall performance of the model.") | |
gr.Markdown("The predictions seem to visually pass most backtests, which can be viewed in the graph below.") | |
with gr.Accordion("Model Prediction Scores", open=False): | |
gr.JSON({"Train R2": r2_train, "Train MAPE": mape_train, "Validation R2": r2_val, "Validation MAPE": mape_val}) | |
gr.Image("figures/model_performance.png", show_download_button=False) | |
with gr.Tab("Feature Importance") as model_eval_page: | |
gr.HTML("<h1 style=\"text-align: center;\">Feature Importance</h1>") | |
gr.Markdown(""" | |
* Below you can view the feature importance metrics from the XGBoost model | |
* It seems the weather variables have a significant impact on 311 Service Call Volume
* Interestingly, it seems some categories were more impactful than others as well | |
""") | |
gr.Plot(plot_model_feature_importance()) | |
with gr.Tab("Future Work & Limitations") as future_limitations_page: | |
gr.Markdown("# Future Work") | |
gr.Markdown(""" | |
* **Multi-Variate Time Series Forecasting** rather than imputing values naively | |
* Testing more kinds of models such as LightGBM | |
* Robustly testing parameters of current model using GridSearchCV | |
* Comparing performance of my forecast model to others | |
* More Data! Having more 311 Call data may help find other indicators | |
""") | |
gr.Markdown("# Future Deployments") | |
gr.Markdown(""" | |
* Containerize the model and load onto an API for ingestion | |
* Containerize data preprocessing and load into a Spark Cluster | |
* Create triggers and view tables to verify data preprocessing | |
* Create functions to monitor model performance | |
""") | |
with gr.Tab("Appendix") as future_limitations_page: | |
with gr.Tab("Weather Data Analysis") as dp_weather: | |
dp_weather_state = gr.State("weather") | |
with gr.Column(): | |
with gr.Row(): | |
dp_weather_category = gr.Dropdown( | |
choices=["2011-2018", "2016-2018"], | |
value="2011-2018", | |
label="Time Range" | |
) | |
dp_weather_var = gr.Dropdown( | |
choices = ["MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation", "WindSpeed", "MaxSustainedWind", "Gust", "Rain", "SnowDepth", "SnowIce"], | |
value = "MeanTemp", | |
label = "Variable" | |
) | |
dp_weather_btn = gr.Button("Run") | |
dp_weather_report = gr.HTML(value=iframe_dp_weather) | |
dp_weather_btn.click( | |
run_report, | |
[dp_weather_state, dp_weather_var, dp_weather_category], | |
dp_weather_report, | |
) | |
with gr.Tab("Service Data Analysis") as dp_service: | |
dp_service_state = gr.State("service") | |
dp_service_category = gr.State("full") | |
with gr.Column(): | |
dp_service_var = gr.Dropdown( | |
choices = [ | |
"Created Date", "Closed Date", "Agency", "Agency Name", | |
"Complaint Type", "Descriptor", "Location Type", "Landmark", | |
"Facility Type", "Status", "Community Board", "Borough", | |
"Open Data Channel Type", "Park Facility Name", "Park Borough", | |
"Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location", | |
"Bridge Highway Name", "Bridge Highway Direction", "Road ramp", | |
"Bridge Highway Segment" | |
], | |
value = "Created Date", | |
label = "Select Variable and Run" | |
) | |
dp_service_btn = gr.Button("Run") | |
dp_service_report = gr.HTML(value=iframe_dp_service) | |
dp_service_btn.click( | |
run_report, | |
[dp_service_state, dp_service_var, dp_service_category], | |
dp_service_report, | |
) | |
def main():
    app.launch(share=False)
    return app

if __name__ == "__main__":
    main()