dna-casestudy / app.py
davidna22's picture
Upload folder using huggingface_hub
dad00c5 verified
raw
history blame
67.1 kB
import sys
import inspect
import math
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib
import utils
from matplotlib import pyplot as plt
import sklearn
import gradio as gr
from IPython.display import display
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from utils import create_seasons
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from bertopic import BERTopic
import html
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from utils import find_variable_data, build_temporal_features, create_datetime, map_vals
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import folium
import gc
import json
from utils import MyNaiveImputer
# Select matplotlib's 'agg' backend before any figure is created.
matplotlib.use('agg')
# JavaScript passed to gr.Blocks(js=...): when the page URL lacks
# `__theme=dark`, it sets that query parameter and navigates to the new URL.
dark_mode = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
# Imputation Variables
# Load the aggregated weather CSV; reset_index() turns the CSV's index column
# back into a regular column before the "Datetime" column is parsed.
wd_full_local = pd.read_csv(
    "data/weather_aggregated_2010-2018.csv", index_col=0
).reset_index()
wd_full_local["Datetime"] = pd.to_datetime(
    wd_full_local["Datetime"], format="%Y-%m-%d"
)
wd_full_local = build_temporal_features(wd_full_local, "Datetime")

# Weather columns that get an imputed series.
impute_cols = [
    'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint',
    'Percipitation', 'WindSpeed', 'MaxSustainedWind',
    'Gust', 'Rain', 'SnowDepth', 'SnowIce',
]

my_imputer = MyNaiveImputer(wd_full_local, time_steps=49 + 7)

# One imputation result per strategy, keyed by the capitalised strategy name
# (the strategy argument itself is the lower-case form of the key).
imputers = {
    label: my_imputer.impute_all(impute_cols, strategy=label.lower())
    for label in ("Mean", "Median", "Max", "Min")
}
# Merged Data Variables
data_merged = create_datetime(
    pd.read_csv("data/data_merged_full.csv", index_col=0),
    "Datetime",
    format="%Y-%m-%d",
)

# Label / boolean helper columns derived from existing columns; they are
# added in this order.
_derived_columns = {
    "Day Of Week": data_merged["Datetime"].dt.day_name(),
    "Year String": data_merged["Year"].astype(str),
    "Month String": data_merged["Datetime"].dt.month_name(),
    "Rain Bool": data_merged["Rain"].astype(bool),
    "SnowIce Bool": data_merged["SnowIce"].astype(bool),
}
for _col, _series in _derived_columns.items():
    data_merged[_col] = _series

data_merged = data_merged.set_index("Datetime")

# Year-based subsets: everything up to 2018 (copied), and the 2016-2018 window.
_year = data_merged["Year"]
weather_full_df = data_merged.loc[_year <= 2018].copy()
data_merged_eda = data_merged.loc[(_year <= 2018) & (_year >= 2016)]
# Feature Preprocessing
data_preprocess = data_merged.loc[data_merged["Year"] >= 2016].copy()

# Gust: several interpolation variants are stored side by side, each computed
# from the still-unfilled Gust column.
_gust_variants = {
    "Gust_lin": {"method": "linear"},
    "Gust_spline3": {"method": "spline", "order": 3},
    "Gust_spline5": {"method": "spline", "order": 5},
    "Gust_quad": {"method": "quadratic"},
}
for _gust_col, _interp_kwargs in _gust_variants.items():
    data_preprocess[_gust_col] = data_preprocess["Gust"].interpolate(**_interp_kwargs)
# The Gust column itself is then replaced by its linear interpolation.
data_preprocess["Gust"] = data_preprocess["Gust"].interpolate(method="linear")

# DewPoint: keep the original values, then overwrite the column with the
# difference to the value 7 rows earlier.
data_preprocess["DewPoint_old"] = data_preprocess["DewPoint"]
data_preprocess["DewPoint_diff7d"] = (
    data_preprocess["DewPoint_old"] - data_preprocess["DewPoint_old"].shift(7)
)
data_preprocess["DewPoint"] = data_preprocess["DewPoint_diff7d"]

# MinTemp: keep the original values, apply log1p, then the same 7-row
# differencing; the differenced log series replaces the column.
data_preprocess["MinTemp_old"] = data_preprocess["MinTemp"]
data_preprocess["MinTemp_log"] = data_preprocess["MinTemp_old"].apply(np.log1p)
data_preprocess["MinTemp_log_diff7d"] = (
    data_preprocess["MinTemp_log"] - data_preprocess["MinTemp_log"].shift(7)
)
data_preprocess["MinTemp"] = data_preprocess["MinTemp_log_diff7d"]
# Final Preprocessed Variables
data_final = create_datetime(
    pd.read_csv("data/data_final.csv"), "Datetime", format="%Y-%m-%d"
).set_index("Datetime")

# The last 7 rows form the test window; the rest is split 75/25 (by row
# position) into train and validation.
dataset, test = data_final[:-7], data_final[-7:]
split_point = int(len(dataset) * 0.75)
train, val = dataset[:split_point], dataset[split_point:]

# Feature matrix / "Target" column pairs for each split.
X_train, y_train = train.drop(columns="Target"), train["Target"]
X_val, y_val = val.drop(columns="Target"), val["Target"]
X_test, y_test = test.drop(columns="Target"), test["Target"]

# Regressor restored from the saved JSON model file.
forecast_model = xgb.XGBRegressor()
forecast_model.load_model("models/final_model.json")
# Current Predictions
# Hard-coded train/validation scores (presumably R^2 and MAPE of the saved
# model -- confirm). They are ordinary module-level names: the `global`
# statement that used to precede them has no effect at module scope.
r2_train = 0.8691238468740025
mape_train = 0.04889510400934162
r2_val = 0.6072642783665692
# NOTE(review): this is digit-for-digit the same number as r2_val, which looks
# like a copy/paste slip -- confirm the real validation MAPE before relying on it.
mape_val = 0.6072642783665692
# Initial Variables
def _load_report(path):
    """Parse the HTML report at *path* and close the file once it has been read."""
    with open(path) as handle:
        return BeautifulSoup(handle, "html.parser")


# Parsed HTML reports, keyed as "<base>_<category>" (run_report joins its two
# arguments with "_" to look entries up here).
reports = {
    "weather_2011-2018": _load_report("reports/weather_data_ts.html"),
    "weather_2016-2018": _load_report("reports/weather_data_after2016_ts.html"),
    "service_full": _load_report("reports/311_data_1.html"),
}
# Initial iframes for one weather variable and one service variable; the
# second element returned by find_variable_data is not used here.
iframe_dp_weather, _ = find_variable_data(reports["weather_2011-2018"], "MeanTemp")
iframe_dp_service, _ = find_variable_data(reports["service_full"], "Created Date")
# Code Variables to show in app
# These are display-only source snippets rendered with gr.Code; the text inside
# the triple quotes is never executed by this module.
# Snippet: loading the weather CSV with pandas and the 311 CSV via pandas -> polars.
load_code = """
# Load Weather Data in pandas
# No need for polars because data is sufficiently small
weather_data = pd.read_csv("data/weather_NY_2010_2018Nov.csv")
# Load Service data in polars for speed optimization
# Loading directly with polars leads to errors
# Load in pandas then convert to polars
service_data_pd = pd.read_csv("data/311-2016-2018.csv")
assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)
# This casting is done just because of some errors when loading pl from pandas
service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
service_data = pl.DataFrame(service_data_pd)
# Clear some ram
del service_data_pd
gc.collect()"""
# Snippet: restricting weather rows to the latitude/longitude bounds of the 311 data.
map_code = """
lat_min = service_data["Latitude"].min()
lat_max = service_data["Latitude"].max()
long_min = service_data["Longitude"].min()
long_max = service_data["Longitude"].max()
mincon_lat = weather_data["Latitude"] >= lat_min
maxcon_lat = weather_data["Latitude"] <= lat_max
mincon_long = weather_data["Longitude"] >= long_min
maxcon_long = weather_data["Longitude"] <= long_max
wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
"""
# Display-only snippet (rendered with gr.Code) describing how the closed-ticket
# feature is built. It is a raw string on purpose: the snippet contains
# backslashes, and in a normal string literal `\ ` is an invalid escape
# sequence while a backslash directly before a newline silently joins two
# lines of the displayed code.
Closed_Ticket_Code = r"""
# Fill null and Typos with mean time diff (13 days)
service_data = service_data.with_columns(
Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1))
.then(pl.col("Created Date") + pl.duration(days=mean_diff))
.otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))
)
# Check for no null values
assert service_data["Closed_Date_New"].is_null().sum() == 0
# Pair wise GroupBy and Filter
closed_tickets = service_data.group_by(["Closed_Date_New", "Created Date"]) \
.agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count")) \ # FILTER Created Date < Closed Date Here
.sort("Closed_Date_New") \ # Sort by new column Closed Date New
.filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019)) \ # Filter for only Closed Dates in time window
.group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets")) # Final Group By Closed date after filtering
ct_df = closed_tickets.with_columns(
pl.col("num_closed_tickets") # Rename Column
)
"""
# BERTopic model loaded once at import time from the saved model directory.
# (A `global` declaration is unnecessary here: assignments at module scope
# already create module-level names.)
topic_model = BERTopic.load("models/BERTopic")
def plot_imputations(var, data, imputers=imputers):
    """Plot the observed series for *var* together with every imputed series.

    Args:
        var: Column name to plot. ``"None"``, ``""`` and ``None`` are treated
            as "nothing selected".
        data: Frame providing a ``"Datetime"`` column and a *var* column; only
            its last 800 rows are drawn.
        imputers: Mapping of label -> imputed frame, each providing
            ``"Datetime"`` and *var* columns. Defaults to the module-level
            ``imputers`` as it exists when this function is defined.

    Returns:
        ``gr.update(value=fig)`` with the matplotlib figure, or a bare
        ``gr.update()`` when no variable is selected.
    """
    plt.close('all')
    # The dropdown wired to this callback lists "None" as a placeholder choice;
    # it is not one of the imputed columns, so indexing with it would fail.
    if var in (None, "", "None"):
        return gr.update()
    fig = plt.figure(figsize=(15, 5))
    plt.plot(data["Datetime"][-800:], data[var][-800:], label="Actual")
    plt.title(f"{var} Imputation")
    for method, imputed in imputers.items():
        plt.plot(imputed["Datetime"], imputed[var], label=method)
    plt.legend()
    return gr.update(value=fig)
def plot_timeseries(data, var, data_name="My", all_vars=None, height=800, width=600):
    """Gradio callback wrapping ``utils.plot_timeseries``.

    Args:
        data: Data passed through to the helper.
        var: Variable name to plot; the empty string means "nothing selected".
        data_name: Label passed through to the helper.
        all_vars: Optional list passed through to the helper. ``None`` (the
            default) is replaced by a fresh empty list on every call, so no
            list object is shared between calls.
        height: Passed through to the helper.
        width: Passed through to the helper.

    Returns:
        ``gr.update(value=fig)`` with the helper's figure, or a bare
        ``gr.update()`` when *var* is the empty string.
    """
    plt.close('all')
    if var == "":
        return gr.update()
    if all_vars is None:
        all_vars = []
    # Aliased because the helper has the same name as this wrapper.
    from utils import plot_timeseries as _plot_timeseries
    fig = _plot_timeseries(data, var, data_name, all_vars, height, width)
    return gr.update(value=fig)
def plot_bivariate(data, x, y, subset=None, trendline=True):
    """Gradio callback wrapping ``utils.plot_bivariate``.

    Args:
        data: Data passed through to the helper.
        x: Passed through to the helper.
        y: Passed through to the helper.
        subset: UI label of the grouping option (e.g. ``"Year"``, ``"Month"``),
            translated to a column name before the helper is called. ``None``,
            ``"None"`` and ``""`` all mean "no grouping".
        trendline: Passed through to the helper.

    Returns:
        ``gr.update(value=fig)`` with the helper's figure.

    Raises:
        KeyError: If *subset* is a string that is not one of the known labels.
    """
    plt.close('all')
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
        "": None,
    }
    # The parameter's own default (None) is not a key of map_var, so look the
    # label up only when one was actually supplied.
    subset = None if subset is None else map_var[subset]
    # Aliased because the helper has the same name as this wrapper.
    from utils import plot_bivariate as _plot_bivariate
    fig = _plot_bivariate(data, x, y, subset, trendline)
    return gr.update(value=fig)
def plot_seasonality(data, x, y, show_box=True, show_outliers=False):
    """Gradio callback wrapping ``utils.plot_seasonality``.

    *x* is a UI label that is translated to a column name (or ``None`` for
    ``"None"``) before being handed to the helper together with the other
    arguments. An unknown label raises ``KeyError``.

    Returns:
        ``gr.update(value=fig)`` with the helper's figure.
    """
    plt.close('all')
    # UI label -> column name used for grouping.
    grouping_columns = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
    }
    group_column = grouping_columns[x]
    from utils import plot_seasonality as draw_seasonality
    return gr.update(
        value=draw_seasonality(data, group_column, y, show_box, show_outliers)
    )
def plot_correlations(data, covar, target="Target", lags=None, method="pearson"):
    """Gradio callback wrapping ``utils.plot_correlations``.

    Args:
        data: Data passed through to the helper.
        covar: Passed through to the helper.
        target: Passed through to the helper.
        lags: List of lags passed to the helper. ``None`` (the default) is
            replaced by a fresh ``[0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 21]``
            on every call, so no list object is shared between calls.
        method: Passed through to the helper.

    Returns:
        ``gr.update(value=fig)`` with the helper's figure.
    """
    plt.close('all')
    if lags is None:
        lags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 21]
    # Aliased because the helper has the same name as this wrapper.
    from utils import plot_correlations as _plot_correlations
    fig = _plot_correlations(data, covar, target, lags, method)
    return gr.update(value=fig)
def plot_autocorr(data, var, apply=None):
    """Draw ACF and PACF plots (30 lags) for one column of *data*.

    Args:
        data: Frame containing the column *var*.
        var: Column name. ``"None"``, ``""`` and ``None`` are treated as
            "nothing selected".
        apply: Optional callable applied element-wise to the column before
            plotting.

    Returns:
        ``gr.update(value=fig)`` with a two-row matplotlib figure (ACF on top,
        PACF below), or a bare ``gr.update()`` when no variable is selected.
    """
    plt.close('all')
    # The dropdown wired to this callback lists "None" as a placeholder choice;
    # selecting a column with it would fail, so leave the plot untouched.
    if var in (None, "", "None"):
        return gr.update()
    from utils import plot_acf, plot_pacf
    # Work on a copy so the optional transform never touches the caller's data.
    series = data.loc[:, var].copy()
    if apply:
        series = series.apply(apply)
    fig, ax = plt.subplots(2, 1, figsize=(12, 8))
    plot_acf(series, lags=30, ax=ax[0])
    plot_pacf(series, lags=30, method="ols-adjusted", ax=ax[1])
    plt.suptitle(f"{var}", y=0.95)
    return gr.update(value=fig)
def plot_all_correlations(data, data_name="weather", method="pearson"):
    """Close open matplotlib figures, then return ``utils.plot_all_correlations(...)``.

    All three arguments are forwarded positionally and unchanged; the helper's
    result is returned as-is (not wrapped in ``gr.update``).
    """
    plt.close('all')
    from utils import plot_all_correlations as render_all_correlations
    return render_all_correlations(data, data_name, method)
def run_report(report_base, variable_name, report_category="full"):
    """Look up one variable's iframe in a parsed report.

    The report is selected from the module-level ``reports`` mapping under the
    key ``"<report_base>_<report_category>"``.

    Returns:
        ``gr.update(value=iframe)`` where ``iframe`` is the first of the two
        values returned by ``find_variable_data``.
    """
    report_key = "_".join((report_base, report_category))
    iframe, _unused = find_variable_data(reports[report_key], variable_name)
    return gr.update(value=iframe)
def test_stationary(data, var):
    """Forward *data* and *var* to ``utils.test_stationary`` and return its result."""
    # Aliased because the helper has the same name as this wrapper.
    from utils import test_stationary as run_stationarity_test
    return run_stationarity_test(data, var)
def plot_interpolation(data):
    """Close open matplotlib figures, then return ``utils.plot_gust_interpolation(data)``."""
    plt.close('all')
    from utils import plot_gust_interpolation as render_gust_interpolation
    return render_gust_interpolation(data)
def plot_model_feature_importance():
    """Close open matplotlib figures, then return the helper's plot for ``forecast_model``.

    Delegates to ``utils.plot_final_feature_importance`` with the module-level
    ``forecast_model`` as its only argument.
    """
    plt.close('all')
    from utils import plot_final_feature_importance as render_importance
    return render_importance(forecast_model)
def plot_final_predictions():
    """Plot the recent "Target" history together with the 7-step forecast.

    The forecast comes from ``utils.predict_recurse(dataset, test,
    forecast_model)`` and is written over the last 7 entries of a *copy* of
    ``data_final["Target"]``; the shared ``data_final`` frame is left
    untouched, so calling this function repeatedly never replaces the stored
    targets with predictions.

    Returns:
        The matplotlib figure (already closed in pyplot's registry) holding
        two lines: "Real" for positions ``[-96:-6]`` and "Forecast" for the
        last 7 positions.
    """
    plt.close('all')
    from utils import predict_recurse
    next_7_day_prediction = predict_recurse(dataset, test, forecast_model)

    # Private copy: the forecast must not leak into the module-level frame.
    target = data_final["Target"].copy()
    target.loc[target.index[-7:]] = next_7_day_prediction

    # plt.subplots returns a (figure, axes) pair; draw on that axes explicitly.
    fig, ax = plt.subplots(figsize=(15, 5))
    # The "Real" window ends one point inside the forecast window, so the two
    # lines share a point and join up in the plot.
    target.iloc[-96:-6].plot(
        ax=ax, label="Real", title="311 Service Volume: 7 Day Prediction"
    )
    target.iloc[-7:].plot(ax=ax, label="Forecast")
    ax.legend()
    plt.close(fig)
    return fig
def plot_train_split():
    """Close open matplotlib figures, then return ``utils.plot_train_split(train, val)``.

    ``train`` and ``val`` are the module-level splits.
    """
    plt.close('all')
    # Aliased because the helper has the same name as this wrapper.
    from utils import plot_train_split as render_train_split
    return render_train_split(train, val)
def plot_val_predicitons():
    """Return ``utils.plot_predictions(train, val, preds_val)``.

    ``train`` and ``val`` are the module-level splits. ``preds_val`` is read
    from module scope as well; it is not defined in the part of the file
    reviewed here, so it is presumably assigned elsewhere -- confirm.

    The previous version also built a copy of ``val`` with a "Prediction"
    column that was never used; that dead work has been removed.
    """
    from utils import plot_predictions
    return plot_predictions(train, val, preds_val)
# Gradio's default theme, configured with the `text_lg` text-size preset.
curr_theme = gr.themes.Default(text_size=gr.themes.sizes.text_lg)
with gr.Blocks(theme=curr_theme, js=dark_mode, css=open("custom.css", "r").read()) as app:
title = gr.HTML("""<h1 align="center">Point72 Case Study</h1>""")
with gr.Tabs() as pages:
with gr.Tab("Overview") as toc_page:
gr.Markdown("# My Point72 Case Study Results")
gr.Markdown("""
* Please follow the tabs sequentially left to right to get the full story of my work
* There will be many interactive parts where you will be able to test and view different parameters
* This app may also be built and ran locally
* This app is hosted and served from a cloud server VM Instance
* Any questions please email me: [email protected]
""")
with gr.Tab("Data Preprocessing") as data_preprocessing_page:
with gr.Tab("Data Loading") as dp_overview:
gr.HTML("<h1 style=\"text-align: center;\">Loading the Data</h1>")
gr.Markdown("## Goal: Load the Data as efficiently as possible")
gr.Markdown("""
* Using Pandas alone is **slow and inefficient**.
* With small datasets, pandas is great because the API is robust.
* With medium datasets, using a library like polars (a Rust based module with 10x pandas speed) is much faster.
* As data gets even larger, multi-processing languages like Spark are required.
* For this dataset, I use pandas for the weather data and polars for the 311 data. After the aggregation and merge, I revert back to pandas for API compatibility.
""")
with gr.Accordion("Code", open=False):
gr.Code(load_code, language="python")
with gr.Tab("Location Mapping") as dp_overview:
src_doc = html.escape(open("figures/map1.html","r").read())
iframe1 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
src_doc = html.escape(open("figures/map2.html","r").read())
iframe2 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
src_doc = html.escape(open("figures/bounded_map.html","r").read())
iframe3 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
src_doc = html.escape(open("figures/final_map.html","r").read())
iframe4 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
gr.HTML("<h1 style=\"text-align: center;\">Location Mapping for Both Datasets</h1>")
with gr.Row(elem_classes="map-legend"):
gr.Markdown("""
**Legend:**
* <span style=\"color: red\">Red:</span> Weather records
* <span style=\"color: #5989ff\">Blue:</span> 311 Service records
""", elem_classes="map-legend-text")
with gr.Row():
with gr.Column():
gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York State</h1>")
map1 = gr.HTML(iframe1, elem_classes="map")
with gr.Column():
gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York City</h1>")
map2 = gr.HTML(iframe2, elem_classes="map")
with gr.Row():
gr.Markdown("""
Juxtaposing these two maps and seeing the approximate distributions of data observations,
its easy to see the problem. The weather dataset encompasses a larger area than the 311 Service call dataset.
Once this problem was diagnosed the solution was simple. First you find the max coordinate (Lat, Long) bounds
from the 311 Service Dataset. Then, you just filter the weather dataset to only include points from within
these bounds. This was one of my initial discoveries when analyzing the dataset and crucial to ensure
congruity between the two. **Below you can see the bounding box I created and how the new weather data
observations fit in this bounding box.**
""")
with gr.Row():
with gr.Column():
map3 = gr.HTML(iframe3, elem_classes="map")
with gr.Column():
map4 = gr.HTML(iframe4, elem_classes="map")
with gr.Accordion("Code", open=False):
gr.Code(map_code, language="python")
with gr.Tab("Variable Pruning") as var_pruning:
gr.HTML("<h1 style=\"text-align: center;\">How I pruned the datasets</h1>")
gr.Markdown("## Goal: Remove as many useless features as possible")
gr.HTML("<h3 style=\"color: darkorange;\">Key Factors for Feature Removal</h3>")
gr.Markdown("""
* Percentage of missing data points
* Distribution Imbalance
* Irrelevance
* Number of distinct categories
* Another variable was chosen as replacement <br/><br/>
NOTE: Look in the appendix for visualizations of individual variables
""")
droped_var_df = pd.read_excel("data/drop_vars.xlsx")
gr.Dataframe(
droped_var_df,
wrap=True,
label="Dropped Variables & Justification (Weather on Bottom)"
)
with gr.Tab("Time Aggregation") as time_agg:
gr.HTML("<h1 style=\"text-align: center;\">Aggregate Data by Date</h1>")
gr.Markdown("## Goal: Aggregate data by Date")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: 311 Service data is not inherently formatted to provide Created Ticket Counts</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Data must be aggregated by day to find ticket counts</li>
<li>Covariate features need a special transformation</li>
<li>Final Aggregations Mapping</li>
<ul style="padding-inline-start: 40px;">
<li>Created Date ==> groupby.count ==> Target (Created ticket count)</li>
<li>Closed Date ==> Agg* ==> Number of closed tickets (Agg* explained in next tabs)</li>
<li>Agency ==> Agg* ==> Number of tickets by Agency (Agg* explained in next tabs)</li>
<li>Borough ==> Agg* ==> Number of tickets by Boroguh (Agg* explained in next tabs)</li>
<li>Descriptor ==> Agg* ==> Number of tickets by Descriptor Group/Category (Agg* explained in next tabs)</li>
</ul>
</ul>""")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Weather data is not aggregated by day</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>To merge with 311 Service data, both datasets must be aggregated</li>
<li>Additional transformations may be applied only after time aggregation</li>
<li>Aggregation function needs to be handled feature by feature</li>
<li>Final Aggregation Mapping</li>
<ul style="padding-inline-start: 40px;">
<li>MaxTemp, MaxSustainedWind ==> groupby.max ==> Variables have an inherent max feature</li>
<li>MinTemp ==> groupby.min ==> Variable has an inherent min feature</li>
<li>Rain, SnowIce ==> groupby.mean.round ==> Binary variables are first aggregated than rounded back to binary</li>
<li>All Other Variables ==> groupy.mean ==> Mean used by default as it is the least lossy pooling method</li>
</ul>
</ul>""")
with gr.Tab("Weather Data: Imputation") as wd_impute:
gr.HTML("<h1 style=\"text-align: center;\">Data Imputation</h1>")
gr.Markdown("## Goal: Impute missing values in Weather Data")
gr.HTML("<h3 style=\"color: darkorange;\">Issue: Weather data is incomplete, 49 days are missing in 2018</h3>")
gr.Markdown("#### Proposed Solution: Use a simple imputer to fill these missing days + 7 more days into the \"future\"")
gr.HTML("""
<ul style="font-size: 18px">
<li>Use a simple imputer rather than a robust imputation method to reduce model complexity</li>
<ul style="padding-inline-start: 40px;">
<li>Using a robust imputer = Conducting a multivariate forcast, Very complex & can be slow</li>
<li>Using a simple imputer = Low complexity, low latency</li>
</ul>
<li>Simple imputer applies an aggregate function using Day Of Year (1-366) as the interval</li>
<li>4 different Imputation Methods: Mean, Median, Min, Max</li>
<li>7 additional days are imputed so the weather data can be used as a future covariate in our model</li>
<li>Final Aggregation Mapping</li>
<ul style="padding-inline-start: 40px;">
<li>WindSpeed, MaxSustainedWind, Gust, SnowDepth => Use Mean => Noisy Variables, Non-Mean/Median methods are too biased, curve best fit with Mean</li>
<li>Rain => Use Max => Binary Variables with noise, min/mean/median imputes 0, which does not follow the trend</li>
<li>SnowIce => Use Min (impute 0) => Binary variables but mostly 0's, any other imputation is visually inaccurate</li>
<li>MeanTemp, MinTemp, MaxTemp, DewPoint, Percipitation => Use Min => Perhaps helping to remove non-stationarity (global warming), Winter is colder now than before, Curve best fits with min</li>
</ul>
</ul>""")
gr.Markdown("Use plots below to view the plots used to help justify above reasoning")
with gr.Accordion("Show Plots", open=False):
impute_data = gr.State(wd_full_local)
impute_choices = ["None"]
impute_choices.extend(impute_cols)
wd_impute_col = gr.Dropdown(
choices=impute_choices,
value="None",
label="Choose a Variable to plot all imputation methods"
)
wd_impute_plot = gr.Plot()
wd_impute_col.change(
plot_imputations,
[wd_impute_col, impute_data],
[wd_impute_plot]
)
with gr.Tab("311: Closed Ticket Counting") as ct_date:
gr.HTML("<h1 style=\"text-align: center;\">Closed Ticket Feature</h1>")
gr.Markdown("## The Closed Ticket Feature is built from the Closed Date column similarly to how Created Date was used to generate new 311 Call Volume")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Data Error, Typos, and/or Null valuess</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Number of Null Values: </li>
<li>Number of Closed Dates where Closed Date > Created Date: </li>
<ul style="padding-inline-start: 40px;">
<li>These values were most likely typos/data recording errors</li>
<li>For instance, some of these values dated to 1900</li>
</ul>
<li>SOLUTION: For every data error, impute with the mean difference (recompute Closed Date based off Created)</li>
<li>Mean is calculated as the mean time differential between all valid Closed & Created Dates</li>
<li>Mean Time Differential: 13 Days</li>
</ul>""")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Data Leakage - Future into Past</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Most of the Closed Date values are 13 days ahead relative to Created Date</li>
<li>GroupBy Closed Date only will lead to some closed ticket counts leaking into future created dates</li>
<li>SOLUTION: GroupBy [Closed Date, Created Date] pairwise, filter so Created Date < Closed Date</li>
</ul>""")
with gr.Accordion("Code", open=False):
gr.Code(Closed_Ticket_Code, language="python")
with gr.Tab("311: Categorical Grouping") as cat_groups:
BERTopic = gr.State(BERTopic.load("models/BERTopic"))
gr.HTML("<h1 style=\"text-align: center;\">Categorical Features</h1>")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Categorical Features have too many categories</h3>")
gr.Markdown("#### Create a mapping of categories into groups to reduce total number (Viewable at the bottom of the page)")
gr.HTML("""
<ul style="font-size: 18px">
<li>Borough:</li>
<ul style="padding-inline-start: 40px;">
<li>Only 9 Categories without grouping</li>
<li>Four Categories are either typos or just null => Group all into OTHER</li>
</ul>
<li>Agency:</li>
<ul style="padding-inline-start: 40px;">
<li>30 Agencies in total are listed</li>
<li>Manual Research to group each Agency by Category of what they typically do</li>
<li>30 Agencies down to 7 Agency Groupings, based on frequency and research</li>
</ul>
<li>Complaint Type: Removed because analysis showed complaints were too related to the agency</li>
<ul style="padding-inline-start: 40px;">
<li>299 unique pairs out of 271 unique complaints => only ~10% difference in distribution</li>
</ul>
<li>Descriptor: Over 1000+ unique categories. Only way to realistically group is to use NLP</li>
<ul style="padding-inline-start: 40px;">
<li>Pretrained a BERTopic model to extract topics from the text</li>
<li>BERTopic uses TF-IDF & Transformers to extract topics from text</li>
<li>BERTopic reduced 1000 categories into 8 groups</li>
</ul>
</ul>""")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: How do we aggregate by day these features when there are multiple repeated categories per day</h3>")
gr.Markdown("#### One Hot Encode and Sum per category")
gr.HTML("""
<ul style="font-size: 18px">
<li>Step 1: One hot encode all the features before aggregation</li>
<li>Step 2: GroupBy date and Sum for each encoding</li>
<ul style="padding-inline-start: 40px;">
<li>Example: A categorical group with 4 categories</li>
<li>One Sum column per category representing the frequency of that category per day</li>
</ul>
<li>Main Downside: Highly correlated with Created Ticket data; aggregation method was essentially the same</li>
<ul style="padding-inline-start: 40px;">
<li>Summing across the four feature categories in the example above would just equal the ticket count</li>
</ul>
<li>Solution: Leave some categories out of final vector to reduce bias (Shown in feature engineering stage)</li>
</ul>""")
with gr.Accordion("View Feature Groups", open=False):
with gr.Accordion("Borough", open=False):
gr.JSON(json.loads(open("code/Borough.json", "r").read()))
with gr.Accordion("Agency", open=False):
gr.JSON(open("code/Agency.json", "r").read())
with gr.Accordion("Descriptor", open=False):
gr.Dataframe(topic_model.get_topic_info().loc[:, ["Count", "Name", "Representation"]])
gr.Plot(topic_model.visualize_barchart(list(range(-1,6,1))))
with gr.Tab("All Code") as code_preprocess:
gr.Markdown("# View Full Code for building Weather Data")
with gr.Accordion(open=False):
gr.Code(open("code/build_weather.py", "r").read())
gr.Markdown("# View Full Code for building 311 Service Data")
with gr.Accordion(open=False):
gr.Code(open("code/build_service.py", "r").read())
with gr.Tab("Exploratory Data Analysis", id="eda_page") as eda_page:
bivar_data = gr.State(data_merged_eda)
with gr.Tab("Overview", id="eda_overview") as eda_overview:
gr.Markdown("# The EDA Section is intended to be a set of interactive visualizations")
gr.Markdown("The tabs are interactive plots and tables that were used to generate the key insights below.")
gr.HTML("<h3 style=\"color: darkorange;\">Key Insights</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Missing Values:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Gust if used may need interpolation to fill missing values</li>
</ul>
<li>Stationarity</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>
<ul style="padding-inline-start: 60px; font-size: 18px;">
<li>Trends are clear for some like Temperature and DewPoint</li>
<li>Possible cause of constant non-stationarity are factors such as global warming</li>
</ul>
<li>311 Calls may exhibit some forms of weekly non-stationarity</li>
<ul style="padding-inline-start: 60px; font-size: 18px;">
<li>Potentially weekly and monthly non-stationarity</li>
<li>Affected by Holidays and Weekends</li>
<li>More robust tests needed</li>
</ul>
<li>Action Item: Test for stationarity and remove</li>
</ul>
<li>Bivariate Interactions:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>311 Calls have stronger relationships with certain Agency, Borough and Descriptor categories</li>
<li>311 calls exhibit weak overal linear relationships with weather</li>
<ul style="padding-inline-start: 60px; font-size: 18px;">
<li>Monthly and Seasonal relationship is strongest in winter months</li>
<li>Month Of January: strongest linear relationship between MinTemp, DewPoint</li>
</ul>
</ul>
<li>Seasonality:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Weather variables exhibit a strong Yearly and Seasonal seasonality</li>
<li>311 Service Variables exhibit Weekly Seasonality</li>
<li>311 Variables affected strongly by holidays and weekends (less 311 calls on weekends and holidays)</li>
</ul>
<li>Correlation:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Heavy Collinearity among weather variables (especially Min, Mean, MaxTemp)</li>
<li>Varying degrees of correlation among 311 covariates and 311 volume</li>
</ul>
<li>Lags & Autocorrelation:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>311 Service Calls have highest correlation with 7,14,21 weekly lags</li>
<li>6,8 day lag intervals second strongest relationship. 8 day exhibits some negative correlation</li>
<li>1 day lag exhibits similar correlation with 6,7 day lags</li>
</ul>
</ul>""")
with gr.Tab("Univariate", id="eda_univar") as eda_univar:
with gr.Tab("Weather Data") as eda_uni_weather:
eda_univar_weatherdf = gr.State(weather_full_df)
gr.Markdown("# Use the Interactive plot below")
eda_uni_weather_name = gr.State("Weather")
weather_vars = [
"", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
'MinTemp', 'MaxTemp', 'MaxSustainedWind'
]
select_weather_var = gr.Dropdown(
choices=weather_vars,
value="",
label="Select a Variable to View"
)
weather_uniplot = gr.Plot()
select_weather_var.change(
plot_timeseries,
inputs=[
eda_univar_weatherdf,
select_weather_var,
eda_uni_weather_name
],
outputs=[
weather_uniplot
]
)
with gr.Tab("311 Service Data") as eda_uni_weather:
eda_univar_servicedf = gr.State(data_merged_eda)
gr.Markdown("# Use the Interactive plot below")
gr.Markdown("**NOTE: Target is the count of 311 service records**")
eda_uni_service_name = gr.State("Weather")
service_vars = [
"", 'Target', 'num_closed_tickets',
# Agency Group Counts
'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
'AG_Parks', 'AG_Security', 'AG_Transportation',
'AG_Other',
# Borough Counts
'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
'Borough_QUEENS', 'Borough_STATEN ISLAND',
'Borough_OTHER',
# Descriptor Group Counts
'DG_damaged_sign_sidewalk_missing',
'DG_english_emergency_spanish_chinese',
'DG_exemption_commercial_tax_business',
'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
'DG_water_basin_litter_missed'
]
select_service_var = gr.Dropdown(
choices=service_vars,
value="",
label="Select a Variable to View"
)
service_uniplot = gr.Plot()
select_service_var.change(
plot_timeseries,
inputs=[
eda_univar_servicedf,
select_service_var,
eda_uni_service_name
],
outputs=[
service_uniplot
]
)
with gr.Tab("Bivariate", id="eda_bivar") as eda_bivar:
gr.Markdown("# Use the Interactive plot below")
gr.Markdown("Use this tab to view relationships between the Target variable (number of tickets created daily) and a Covariate")
with gr.Column():
with gr.Row() as bivar_params:
bivar_dist_target = gr.Dropdown(
choices=["Target"],
value="Target",
label="Target Variable (One option)"
)
all_bivars = ['num_closed_tickets', "Agency", "Borough", "Descriptor"]
all_bivars.extend(weather_vars)
all_bivars = sorted(all_bivars)
all_bivars = all_bivars[1:]
bivar_dist_cov = gr.Dropdown(
choices=all_bivars,
value="MeanTemp",
label="Select Covariate"
)
bivar_trendline = gr.Dropdown(
choices=[True, False],
value=True,
label="Graph with OLS Trendline"
)
with gr.Accordion("Add Seasonality", open=False):
bivar_subset = gr.Dropdown(
choices=["None", "Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday"],
value="None",
label="Seasonality Options (Disabled for Agency, Borough and Descriptor)"
)
bivar_submit = gr.Button("Run")
bivar_plot = gr.Plot()
bivar_submit.click(
plot_bivariate,
[bivar_data, bivar_dist_cov, bivar_dist_target, bivar_subset, bivar_trendline],
bivar_plot
)
with gr.Tab("Seasonality") as bivar_season:
gr.Markdown("## Exploring the affect of Seasonality")
with gr.Row() as bivar_season_params:
bivar_season_var = gr.Dropdown(
choices=["Target", 'MeanTemp', 'DewPoint',
'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
'MinTemp', 'MaxTemp', 'MaxSustainedWind'],
value="Target",
label="Variable"
)
bivar_season_cov = gr.Dropdown(
choices=["Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday", "Rain", "SnowIce"],
value="Year",
label="Seasonality"
)
with gr.Column():
season_boxplot = gr.Checkbox(value=True, label="Show Boxplot")
season_outlier = gr.Checkbox(value=False, label="Show Outliers")
bivar_season_btn = gr.Button("Run")
bivar_season_plot = gr.Plot()
bivar_season_btn.click(
plot_seasonality,
[bivar_data, bivar_season_cov, bivar_season_var, season_boxplot, season_outlier],
[bivar_season_plot]
)
# "Correlation" tab with three sub-tabs: two plots built once while the layout
# is constructed, and one interactive lag-correlation view.
# NOTE(review): indentation is not visible in this view; confirm which of the
# components below belong to the gr.Row.
with gr.Tab("Correlation") as corr:
with gr.Tab("Weather Correlations") as corr_weather:
# plot_all_correlations is called immediately (not on a user event) with method="pearson".
gr.Plot(plot_all_correlations(data_merged_eda, "weather", method="pearson"))
with gr.Tab("311 Service Correlations") as corr_service:
gr.Plot(plot_all_correlations(data_merged_eda, "service", method="pearson"))
with gr.Tab("Lag Correlations") as corr_dynamic:
gr.Markdown("## Use this to dynamically view correlations based on Lag")
gr.Markdown("By Default, we will analyze lags of [0,1,2,3,4,5,6,7,8,13,14,15,21] days for chosen variable")
gr.Markdown("Scroll Down For AutoCorrelation Graphs")
with gr.Row():
# Column names offered in the dropdown below.
# NOTE(review): the name corr_vars is bound to this list first and then
# rebound to the Dropdown component a few lines later.
corr_vars = [
"None", 'Target', 'num_closed_tickets',
# Weather Variables
'MeanTemp', 'DewPoint', 'Percipitation',
'WindSpeed', 'Gust', 'SnowDepth',
'MinTemp', 'MaxTemp', 'MaxSustainedWind',
# Agency Group Counts
'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
'AG_Parks', 'AG_Security', 'AG_Transportation',
'AG_Other',
# Borough Counts
'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
'Borough_QUEENS', 'Borough_STATEN ISLAND',
'Borough_OTHER',
# Descriptor Group Counts
'DG_damaged_sign_sidewalk_missing',
'DG_english_emergency_spanish_chinese',
'DG_exemption_commercial_tax_business',
'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
'DG_water_basin_litter_missed'
]
# From here on corr_vars refers to the Dropdown, not the list of names.
corr_vars = gr.Dropdown(
choices=corr_vars,
value="Target",
label="Variable"
)
corr_btn = gr.Button("Run")
corr_plot = gr.Plot()
autocorr_plot = gr.Plot()
# Two handlers are registered on the same button: the first renders
# plot_correlations into corr_plot ...
corr_btn.click(
plot_correlations,
[bivar_data, corr_vars],
[corr_plot]
)
# ... and the second renders plot_autocorr into autocorr_plot from the same inputs.
corr_btn.click(
plot_autocorr,
[bivar_data, corr_vars],
[autocorr_plot]
)
with gr.Tab("Feature Engineering") as feature_engineer_page:
# "Feature Selection" tab: static write-up of which covariates were kept or
# removed after EDA, plus a collapsible view of the final variable list.
# The multi-line string bodies are kept flush-left so the rendered text is
# exactly what the original passed to Gradio.
with gr.Tab("Feature Selection") as feature_select:
    gr.HTML("<h1 style=\"text-align: center;\">Select Features Based on EDA</h1>")
    gr.Markdown("### Below is the logic used in our model feature selection")
    gr.HTML("""
<ul style="font-size: 18px">
<li>Weather Covariates</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>
<li>MeanTemp, MaxTemp: High collinearity with MinTemp. MinTemp has highest correlation of 3 => REMOVE</li>
<ul style="padding-inline-start: 50px; font-size: 18px;">
<li>Possible Reason: High temps, people stay indoors. A/C doesn't break nowadays. Lower Temps lead to building/tech failure more often</li>
</ul>
<li>Percipitation: Bivariate plot shows weak relationship, outliers no effect on 311 => REMOVE</li>
<li>SnowDepth: High number missing values, low correlation => REMOVE</li>
<li>Rain, SnowIce: Binary, plots (look in Seasonality Tab) show weak relationship, SnowIce heavily imbalanced (99% 0's) => REMOVE</li>
</ul>
<li>311 Service Covariates:</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>LOO (Leave One - or many - Out) Encoding:</li>
<ul style="padding-inline-start: 50px; font-size: 18px;">
<li>Remove weakest features from our categorical covariates</li>
<li>Reduces bias and removes multicollinearity inherent to One-Hot Encoding</li>
<li>Candidates For Removal:</li>
<ul style="padding-inline-start: 70px; font-size: 18px;">
<li>AG_Health, AG_Other: Lowest Correlation, lowest counts => REMOVE</li>
<li>AG_Parks: Lowest Correlation, but low multi-collinearity => KEEP</li>
<li>Borough_OTHER: Weakest Correlation, lowest count => REMOVE</li>
<li>DG_english_emergency, DG_exemption_commercial: Weakest Correlation, lowest counts => REMOVE</li>
<li>DG_odor_food_air_smoke: Lowest Count, but high correlation => KEEP</li>
</ul>
</ul>
</ul>
</ul>""")
    with gr.Accordion("Show Final Variable List", open=False):
        # Read the JSON through a context manager so the file handle is closed
        # as soon as parsing finishes instead of being left for the garbage
        # collector (the original used a bare open(...).read()).
        with open("code/all_vars.json", "r") as all_vars_file:
            gr.JSON(json.load(all_vars_file))
# "Feature Preprocessing" tab: explains the missing-value and stationarity
# handling, and shows plots/tables that are computed once while the layout is built.
# NOTE(review): indentation is not visible in this view; confirm how much of
# the content below sits inside each gr.Accordion.
with gr.Tab("Feature Preprocessing") as feature_prep:
# Holds data_preprocess as Gradio state.
# NOTE(review): data_feature_prep is not referenced again in the visible part of the file.
data_feature_prep = gr.State(data_preprocess)
gr.HTML("<h1 style=\"text-align: center;\">Preprocess Features</h1>")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Missing Values</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Only One value has missing values to impute: Gust</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>Various interpolation methods were tested</li>
<li>Methods like Spline and Polynomial over-estimated some values, breaking inherent data ranges</li>
<li>Turns out Simple Linear interpolation was best</li>
</ul>
<li>SOLUTION: Interpolate Gust with Linear method</li>
</ul>""")
with gr.Accordion("Show Interpolation Plots", open=False):
# plot_interpolation runs immediately, not when the accordion is opened.
gr.Plot(plot_interpolation(data_preprocess))
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Remove Non-Stationarity</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Variables that are non-stationary change over time, they have a trend</li>
<li>Ideal to transform non-stationarity variables for modeling</li>
<li>Ignore Categorical Variables (simply to keep model complexity low)</li>
<li>Numerical Variables were tested for Non-Stationarity using two methods: ADF and KPSS</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>Using ADF and KPSS together can reveal what kind of trend exists in the data</li>
<li>Only 1 Case Met: Pass KPSS, Fail ADF = Trend Stationary (most likely by season)</li>
</ul>
<li>Only Two Variables failed the tests: DewPoint & MinTemp</li>
<li>SOLUTION: Use Differencing (7d lag) + Log for MinTemp and Differencing (7d lag) for DewPoint (Log caused many NaNs)</li>
</ul>""")
with gr.Accordion("View Results Below", open=False):
# Each Dataframe shows the result of test_stationary for one column of
# data_preprocess: the "*_old" column next to the transformed column.
gr.Markdown("### MinTemp (Log) Tests Before and After Transformation")
with gr.Row():
with gr.Column():
gr.Dataframe(test_stationary(data_preprocess, "MinTemp_old"), label="MinTemp No Augments")
with gr.Column():
gr.Dataframe(test_stationary(data_preprocess, "MinTemp"), label="Log + 7 Day Lag Differencing")
gr.Markdown("### DewPoint Tests Before and After Transformation")
with gr.Row():
with gr.Column():
gr.Dataframe(test_stationary(data_preprocess, "DewPoint_old"), label="DewPoint No Augments")
with gr.Column():
gr.Dataframe(test_stationary(data_preprocess, "DewPoint"), label="7 Day Lag Differencing")
with gr.Tab("Feature Engineering") as feature_eng:
# "Past Covariates" tab: static explanation of the lag/rolling/differencing
# features built from past covariates, plus the implementation source.
# The multi-line string bodies are kept flush-left so the rendered text is
# exactly what the original passed to Gradio (apart from the fix noted below).
with gr.Tab("Past Covariates") as fe_past:
    gr.HTML("<h1 style=\"text-align: center;\">Past Covariate Features</h1>")
    gr.Markdown("""
* Past Covariates are datapoints that are implied to be only related to past information
* For Instance, using past sales of product B to predict futures sales of product A
* There are two ways to use past covariates
* *Option 1:* Build a multi-variate forecast to predict these variables simultaneously
* *Option 2:* Use a sliding window and lags to provide past data (especially for multi-step forecasts)
""")
    gr.Markdown("**I will use Option 2 to avoid building a very complex multi-variate model**")
    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Leaking Future Data into the past</h3>")
    # The first bullet previously read "leaking past data into the future",
    # which contradicted the heading directly above it; the direction is fixed.
    gr.Markdown("""
* By using lags, I can shift my data in a way to avoid leaking future data into the past
* For predicting 7 days into the future, I must lag my data by at least 7 days
* Use a rolling window that will reset over time
""")
    gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Curse of Dimensionality</h3>")
    gr.Markdown("""
* Possible to use many variations of lags, rolling and differences to generate many features
* Too many features leads to the curse of dimensionality, i.e. Overfitting
* Thus, I keep my Feature Set as simple as possible
""")
    gr.Markdown("""
### Feature Set
* Lags: 7D, 14D, 21D
* Rolling (Shifted 7 Days forward): Mean of 14D (14 because mean(Created - Closed Date) = 13 days)
* Differencing (7D difference = 7D lag - 14D lag): 7D
""")
    with gr.Accordion("Open to view implementation code", open=False):
        # Context manager closes the source file right after it is read
        # (the original left the handle from open(...).read() unclosed).
        with open("code/past_features.py", "r") as past_features_src:
            gr.Code(past_features_src.read())
# "Future Covariates" tab: static explanation of the features built from
# covariates known ahead of time, plus the implementation source.
# NOTE(review): the alias fe_past is shared with the neighbouring covariate
# tabs; it is kept unchanged here so no module-level name changes.
# The multi-line string bodies are kept flush-left so the rendered text is
# exactly what the original passed to Gradio (apart from the fixes noted below).
with gr.Tab("Future Covariates") as fe_past:
    # Heading previously said "Past Covariate Features" (copied from the
    # sibling tab); it now matches this tab's label.
    gr.HTML("<h1 style=\"text-align: center;\">Future Covariate Features</h1>")
    gr.Markdown("""
* Future Covariates are data that I have about the future
* For Instance, I can use the projected revenue of Company A to predict daily sales
* For Future Covariates, I do not need to shift variables. I will provide a shift up to 2 days.
* I apply a rolling and expanding window as more features
* Also, I use mean and min to follow the logic learned in EDA. Minimum temp values seem to be more impactful on 311 volume
""")
    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Curse of Dimensionality</h3>")
    # "Covaraiates" misspelling in the first bullet corrected.
    gr.Markdown("""
* Similar to the Past Covariates, I keep my features as simple as possible with as little as possible
* The more features, the more we may overfit
""")
    gr.Markdown("""
### Feature Set
* Lags: 0D, 1D, 2D
* Rolling: Mean & Min of last 14D
* Expanding Window: Max, Min (min-length of 14)
* Differencing already performed to remove trends
""")
    with gr.Accordion("Open to view implementation code", open=False):
        # Context manager closes the source file right after it is read
        # (the original left the handle from open(...).read() unclosed).
        with open("code/future_features.py", "r") as future_features_src:
            gr.Code(future_features_src.read())
# "Target Variable" tab: static explanation of the lag/difference features
# built from the target itself, plus the implementation source.
# NOTE(review): the alias fe_past is shared with the neighbouring covariate
# tabs; it is kept unchanged here so no module-level name changes.
# The multi-line string bodies are kept flush-left so the rendered text is
# exactly what the original passed to Gradio.
with gr.Tab("Target Variable") as fe_past:
    gr.HTML("<h1 style=\"text-align: center;\">311 Service Calls Features</h1>")
    gr.Markdown("""
* For providing feature transformations of our Target, we can follow a similar process as above
* Main Difference: Lags of < prediction window need to be recomputed at each iteration
* So, for predicting at time (t+1) we need the predicted value at time (t)
* For a recursive prediction model, this means the model cannot make batch predictions without iterating
""")
    gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: More variables increase complexity for prediction</h3>")
    gr.Markdown("""
* The more features, the more overfitting & more computation
* As I will use a recursive model, these values must be recomputed at each step t+1
* In favor of a less complex model, I will choose as minimal features as possible (excluding rolling features as its prone to error with recalculation)
""")
    # This is the second issue on the tab; it was previously also numbered "Issue 1".
    gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Leaking Future Data into the past</h3>")
    gr.Markdown("""
* Must be careful about how these features are computed
* For instance, for rolling mean, I would shift the data up by 1 lag first then compute the rolling sum
* For differencing, a 7D lag difference is really the 1D - 8D lag. (For t=8, 7D diff = t7-t1 not t8-t2)
""")
    gr.Markdown("""
### Feature Set
* Lags: 1D, 6D, 7D, 8D, 14D, 21D (based on highest correlations and weekly seasonality)
* Differencing: 7D, 14D
""")
    with gr.Accordion("Open to view implementation code", open=False):
        # Context manager closes the source file right after it is read
        # (the original left the handle from open(...).read() unclosed).
        with open("code/target_features.py", "r") as target_features_src:
            gr.Code(target_features_src.read())
with gr.Tab("Forecast Model") as model_select_train_page:
# "Splitting the data" tab: describes the time-ordered 75% split and shows the
# figure returned by plot_train_split(), which is evaluated while the layout
# is being built. Multi-line string bodies stay flush-left so the text passed
# to Gradio is unchanged.
with gr.Tab("Splitting the data") as model_data_split:
    gr.HTML(value='<h1 style="text-align: center;">Splitting Time-Series Data</h1>')
    gr.HTML(value="""
<ul style="font-size: 18px">
<li>Splitting Time-Series Data is different than splitting other data</li>
<li>Rather than splitting on random samples, you split the data by time with order consistent</li>
<li>I took a 75% splitting approach where I split my data at the date that sits on the 75% of data length</li>
</ul>""")
    gr.Markdown(value="#### As an example, I provide a graph showing exactly how I split my data")
    gr.Plot(value=plot_train_split())
# "Model Selection" tab: static rationale for a recursive forecaster and for
# choosing XGBoost.
# NOTE(review): the alias model_data_split is reused by several sibling tabs;
# it is kept unchanged here so no module-level name changes.
# The multi-line string bodies are kept flush-left so the rendered text is
# exactly what the original passed to Gradio (apart from the fix noted below).
with gr.Tab("Model Selection") as model_data_split:
    gr.HTML("<h1 style=\"text-align: center;\">Choosing the Right Model</h1>")
    gr.Markdown("### Types of Forecast Models for Multi-Step Prediction")
    # "Recursive Modele" misspelling in the decision line corrected.
    gr.HTML("""
<ul style="font-size: 18px">
<li>Parallel Models: Train a model for each prediction (one for 1 day ahead, another for 2, etc.)</li>
<li>Recursive Models: Model makes a forecast, fills any values it needs for the next prediction, predicts again</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>One of the assumptions was to build a model that was reasonable for production</li>
<li>Parallel models are hard to maintain as the steps of prediction increase</li>
</ul>
<li>Decision: Recursive Model</li>
</ul>""")
    gr.Markdown("### My Model Choice: XGBoost")
    gr.HTML("""
<ul style="font-size: 18px">
<li>Reasons for choosing:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Industry standard for regression</li>
<li>Lightweight and relatively fast</li>
<li>Many parameters to tune, such as tree depth and regularization</li>
<li>Scale invariant - Data does not have to be scaled</li>
<li>Allows NaN values and categorical features without encodings (unused in my implementation)</li>
<li>Provides key explainability in its feature importance metrics</li>
</ul>
<li>Decision: Use XGBoost</li>
</ul>""")
# "Model Training" tab: static notes on the overfitting countermeasures and on
# the choice of evaluation metric. Multi-line string bodies stay flush-left so
# the text passed to Gradio is unchanged.
with gr.Tab("Model Training") as model_data_split:
    gr.HTML(value='<h1 style="text-align: center;">Training the Model</h1>')

    # --- Issue 1 -----------------------------------------------------------
    gr.HTML(value='<h3 style="color: darkorange;">Issue 1: Overfitting</h3>')
    gr.HTML(value="""
<ul style="font-size: 18px">
<li>Main Cause: High number of variables and XGBoost's tendency to overfit without tuning</li>
<li>While training, effort was made to watch the validation and training set's relative performance</li>
<li>Steps Taken to avoid Overfitting</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Low Learning Rate</li>
<li>Low Tree Depth</li>
<li>Keeping Val score relatively close to Training score</li>
<li>Increased l2-lambda parameter, boosting regularization</li>
<li>Many trials to get best set of parameters</li>
<li>Implementing Early Stopping</li>
</ul>
</ul>""")

    # --- Issue 2 -----------------------------------------------------------
    gr.HTML(value='<h3 style="color: darkorange;">Issue 2: Choosing a Metric</h3>')
    gr.HTML(value="""
<ul style="font-size: 18px">
<li>Three metrics I considered: MAPE, MAE and MSE</li>
<li>MAPE seemed to show the most consistent and visually accurate results</li>
<li>Decision: MAPE</li>
<li>Justification: 311 Service volume is quite noisy and MAPE better estimates fit to a very noisy curve than the others</li>
</ul>""")
# "Model Prediction" tab: shows the recursive-prediction source code and, in a
# collapsed accordion, the figure returned by plot_final_predictions() (which
# is evaluated while the layout is being built).
# NOTE(review): the alias model_data_split is reused by several sibling tabs;
# it is kept unchanged here so no module-level name changes.
# The multi-line string body is kept flush-left so the rendered text is
# exactly what the original passed to Gradio.
with gr.Tab("Model Prediction") as model_data_split:
    gr.HTML("<h1 style=\"text-align: center;\">Recursive Model Prediction</h1>")
    gr.Markdown("""
* Below is the code I wrote to implement the Recursive prediction explained in previous tabs
* Predictions are made one step at a time, where the prediction t depends on prediction t-1
* To view the final predictions made by the model see below
""")
    # Context manager closes the source file right after it is read
    # (the original left the handle from open(...).read() unclosed).
    with open("code/recurse_predict.py", "r") as recurse_predict_src:
        gr.Code(recurse_predict_src.read())
    with gr.Accordion("View 7 Day Model Forecast", open=False):
        gr.Plot(plot_final_predictions())
# "Model Evaluation" tab: summary text, the train/validation scores, and a
# saved performance figure.
# NOTE(review): indentation is not visible in this view; confirm whether the
# gr.Image sits inside or after the accordion.
with gr.Tab("Model Evaluation") as model_eval_page:
gr.HTML("<h1 style=\"text-align: center;\">Forecast Results</h1>")
gr.Markdown("Overall, the model seemed to have performed pretty well. The MAPE is also <10% for both Validation and Training sets.")
gr.Markdown("The model did suffer from a low validation R2, but this was difficult to resolve without compromising overall performance of the model.")
gr.Markdown("The predictions seem to visually pass most backtests, which can be viewed in the graph below.")
with gr.Accordion("Model Prediction Scores", open=False):
# r2_train, mape_train, r2_val and mape_val are defined earlier in the file.
gr.JSON({"Train R2": r2_train, "Train MAPE": mape_train, "Validation R2": r2_val, "Validation MAPE": mape_val})
# Static image loaded from disk; the download button is turned off.
gr.Image("figures/model_performance.png", show_download_button=False)
# "Feature Importance" tab: short commentary followed by the figure returned
# by plot_model_feature_importance(), which is evaluated while the layout is
# being built. The multi-line string body stays flush-left so the text passed
# to Gradio is unchanged.
with gr.Tab("Feature Importance") as model_eval_page:
    gr.HTML(value='<h1 style="text-align: center;">Feature Importance</h1>')
    gr.Markdown(value="""
* Below you can view the feature importance metrics from the XGBoost model
* It seems there is significant impact of the weather variables on 311 Service Call Volume
* Interestingly, it seems some categories were more impactful than others as well
""")
    gr.Plot(value=plot_model_feature_importance())
# "Future Work & Limitations" tab: two static Markdown sections, each a
# heading followed by a bullet list. Multi-line string bodies stay flush-left
# so the text passed to Gradio is unchanged.
with gr.Tab("Future Work & Limitations") as future_limitations_page:
    # Section 1: modelling follow-ups.
    gr.Markdown(value="# Future Work")
    gr.Markdown(value="""
* **Multi-Variate Time Series Forecasting** rather than imputing values naively
* Testing more kinds of models such as LightGBM
* Robustly testing parameters of current model using GridSearchCV
* Comparing performance of my forecast model to others
* More Data! Having more 311 Call data may help find other indicators
""")

    # Section 2: deployment follow-ups.
    gr.Markdown(value="# Future Deployments")
    gr.Markdown(value="""
* Containerize the model and load onto an API for ingestion
* Containerize data preprocessing and load into a Spark Cluster
* Create triggers and view tables to verify data preprocessing
* Create functions to monitor model performance
""")
with gr.Tab("Appendix") as future_limitations_page:
# "Weather Data Analysis" tab: pick a time range and a weather variable, then
# render the report produced by run_report into an HTML component.
# NOTE(review): indentation is not visible in this view; confirm which
# components belong to the Column/Row containers.
with gr.Tab("Weather Data Analysis") as dp_weather:
# Constant "weather" value passed as the first input to run_report.
dp_weather_state = gr.State("weather")
with gr.Column():
with gr.Row():
dp_weather_category = gr.Dropdown(
choices=["2011-2018", "2016-2018"],
value="2011-2018",
label="Time Range"
)
dp_weather_var = gr.Dropdown(
choices = ["MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation", "WindSpeed", "MaxSustainedWind", "Gust", "Rain", "SnowDepth", "SnowIce"],
value = "MeanTemp",
label = "Variable"
)
dp_weather_btn = gr.Button("Run")
# Initial content is iframe_dp_weather, which is defined earlier in the file.
dp_weather_report = gr.HTML(value=iframe_dp_weather)
# On click, run_report receives (state, variable, time range) in that order
# and its return value replaces the content of dp_weather_report.
dp_weather_btn.click(
run_report,
[dp_weather_state, dp_weather_var, dp_weather_category],
dp_weather_report,
)
# "Service Data Analysis" tab: pick a 311 service column, then render the
# report produced by run_report into an HTML component.
# NOTE(review): indentation is not visible in this view; confirm which
# components belong to the Column container.
with gr.Tab("Service Data Analysis") as dp_service:
# Constant values passed to run_report as its first and third inputs.
dp_service_state = gr.State("service")
dp_service_category = gr.State("full")
with gr.Column():
dp_service_var = gr.Dropdown(
choices = [
"Created Date", "Closed Date", "Agency", "Agency Name",
"Complaint Type", "Descriptor", "Location Type", "Landmark",
"Facility Type", "Status", "Community Board", "Borough",
"Open Data Channel Type", "Park Facility Name", "Park Borough",
"Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location",
"Bridge Highway Name", "Bridge Highway Direction", "Road ramp",
"Bridge Highway Segment"
],
value = "Created Date",
label = "Select Variable and Run"
)
dp_service_btn = gr.Button("Run")
# Initial content is iframe_dp_service, which is defined earlier in the file.
dp_service_report = gr.HTML(value=iframe_dp_service)
# On click, run_report receives (state, variable, category) in that order
# and its return value replaces the content of dp_service_report.
dp_service_btn.click(
run_report,
[dp_service_state, dp_service_var, dp_service_category],
dp_service_report,
)
def main():
    """Launch the Gradio app without a public share link.

    Returns:
        The module-level ``app`` object, handed back once ``launch`` returns.
    """
    launch_options = {"share": False}
    app.launch(**launch_options)
    return app
# Script entry point: start the app only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()