import sys
import inspect
import math
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib
import utils
from matplotlib import pyplot as plt
import sklearn
import gradio as gr
from IPython.display import display, HTML
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from utils import create_seasons
from bs4 import BeautifulSoup
from bertopic import BERTopic
import html
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from utils import find_variable_data, build_temporal_features, create_datetime, map_vals
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import folium
import gc
import json
from utils import MyNaiveImputer

matplotlib.use('agg')

dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Imputation Variables
wd_full_local = pd.read_csv("data/weather_aggregated_2010-2018.csv", index_col=0)
wd_full_local = wd_full_local.reset_index()
wd_full_local["Datetime"] = pd.to_datetime(wd_full_local["Datetime"], format="%Y-%m-%d")
wd_full_local = build_temporal_features(wd_full_local, "Datetime")
impute_cols = ['MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint', 'Percipitation', 'WindSpeed',
               'MaxSustainedWind', 'Gust', 'Rain', 'SnowDepth', 'SnowIce']
my_imputer = MyNaiveImputer(wd_full_local, time_steps=49+7)
imputers = {
    "Mean": my_imputer.impute_all(impute_cols, strategy="mean"),
    "Median": my_imputer.impute_all(impute_cols, strategy="median"),
    "Max": my_imputer.impute_all(impute_cols, strategy="max"),
    "Min": my_imputer.impute_all(impute_cols, strategy="min")
}

# Merged Data Variables
data_merged = pd.read_csv("data/data_merged_full.csv", index_col=0)
data_merged = create_datetime(data_merged, "Datetime", format="%Y-%m-%d")
data_merged["Day Of Week"] = data_merged["Datetime"].dt.day_name()
data_merged["Year String"] = data_merged["Year"].astype(str)
data_merged["Month String"] = data_merged["Datetime"].dt.month_name()
data_merged["Rain Bool"] = data_merged["Rain"].astype(bool)
data_merged["SnowIce Bool"] = data_merged["SnowIce"].astype(bool)
data_merged = data_merged.set_index("Datetime")
weather_full_df = data_merged.loc[data_merged["Year"] <= 2018].copy()
data_merged_eda = data_merged.loc[(data_merged["Year"] <= 2018) & (data_merged["Year"] >= 2016)]

# Feature Preprocessing
data_preprocess = data_merged.loc[(data_merged["Year"] >= 2016)].copy()
data_preprocess["Gust_lin"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["Gust_spline3"] = data_preprocess["Gust"].interpolate(method="spline", order=3)
data_preprocess["Gust_spline5"] = data_preprocess["Gust"].interpolate(method="spline", order=5)
data_preprocess["Gust_quad"] = data_preprocess["Gust"].interpolate(method="quadratic")
data_preprocess["Gust"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["DewPoint_old"] = data_preprocess["DewPoint"]
data_preprocess["DewPoint_diff7d"] = data_preprocess["DewPoint"] - data_preprocess["DewPoint"].shift(7)
data_preprocess["DewPoint"] = data_preprocess["DewPoint_diff7d"]
data_preprocess["MinTemp_old"] = data_preprocess["MinTemp"]
data_preprocess["MinTemp_log"] = data_preprocess["MinTemp"].apply(np.log1p)
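# 7-day differencing of the log-transformed MinTemp (mirroring the DewPoint transform above)
# to remove trend and weekly seasonality; the before/after stationarity tests are shown in the
# "Feature Preprocessing" tab.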
data_preprocess["MinTemp_log_diff7d"] = data_preprocess["MinTemp_log"] - data_preprocess["MinTemp_log"].shift(7) data_preprocess["MinTemp"] = data_preprocess["MinTemp_log_diff7d"] # Final Preprocessed Variables data_final = pd.read_csv("data/data_final.csv") data_final = create_datetime(data_final, "Datetime", format="%Y-%m-%d") data_final = data_final.set_index("Datetime") test = data_final[-7:] dataset = data_final[:-7] split_point = int(len(data_final[:-7])*0.75) train, val = dataset[:split_point], dataset[split_point:] X_train, y_train = train.drop(columns="Target"), train["Target"] X_val, y_val = val.drop(columns="Target"), val["Target"] X_test, y_test = test.drop(columns="Target"), test["Target"] forecast_model = xgb.XGBRegressor() forecast_model.load_model("models/final_model.json") # Current Predictions global r2_val, r2_train, mape_train, mape_val r2_train = 0.8691238468740025 mape_train = 0.04889510400934162 r2_val = 0.6072642783665692 mape_val = 0.6072642783665692 # Initial Variables reports = { "weather_2011-2018": BeautifulSoup(open("reports/weather_data_ts.html"), "html.parser"), "weather_2016-2018": BeautifulSoup(open("reports/weather_data_after2016_ts.html"), "html.parser"), "service_full": BeautifulSoup(open("reports/311_data_1.html"), "html.parser") } iframe_dp_weather, _ = find_variable_data(reports["weather_2011-2018"], "MeanTemp") iframe_dp_service, _ = find_variable_data(reports["service_full"], "Created Date") # Code Variables to show in app load_code = """ # Load Weather Data in pandas # No need for polars because data is sufficiently small weather_data = pd.read_csv("data/weather_NY_2010_2018Nov.csv") # Load Service data in polars for speed optimization # Loading directly with polars leads to errors # Load in pandas then convert to polars service_data_pd = pd.read_csv("data/311-2016-2018.csv") assert service_data_pd["Unique Key"].nunique() == len(service_data_pd) # This casting is done just because of some errors when loading pl from pandas service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string") service_data_pd["BBL"] = service_data_pd["BBL"].astype("string") service_data = pl.DataFrame(service_data_pd) # Clear some ram del service_data_pd gc.collect()""" map_code = """ lat_min = service_data["Latitude"].min() lat_max = service_data["Latitude"].max() long_min = service_data["Longitude"].min() long_max = service_data["Longitude"].max() mincon_lat = weather_data["Latitude"] >= lat_min maxcon_lat = weather_data["Latitude"] <= lat_max mincon_long = weather_data["Longitude"] >= long_min maxcon_long = weather_data["Longitude"] <= long_max wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long] """ Closed_Ticket_Code = """ # Fill null and Typos with mean time diff (13 days) service_data = service_data.with_columns( Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1)) .then(pl.col("Created Date") + pl.duration(days=mean_diff)) .otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff)) ) # Check for no null values assert service_data["Closed_Date_New"].is_null().sum() == 0 # Pair wise GroupBy and Filter closed_tickets = service_data.group_by(["Closed_Date_New", "Created Date"]) \ .agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count")) \ # FILTER Created Date < Closed Date Here .sort("Closed_Date_New") \ # Sort by new column Closed Date New .filter((pl.col("Closed_Date_New").dt.year() 
>= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019)) \ # Filter for only Closed Dates in time window .group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets")) # Final Group By Closed date after filtering ct_df = closed_tickets.with_columns( pl.col("num_closed_tickets") # Rename Column ) """ global topic_model topic_model = BERTopic.load("models/BERTopic") def plot_imputations(var, data, imputers=imputers): plt.close('all') fig = plt.figure(figsize=(15,5)) plt.plot(data["Datetime"][-800:], data[var][-800:], label="Actual") plt.title(f"{var} Imputation") for method in imputers: plt.plot(imputers[method]["Datetime"], imputers[method][var], label=method) plt.legend() return gr.update(value=fig) def plot_timeseries(data, var, data_name="My", all_vars=[], height=800, width=600): plt.close('all') if var == "": return gr.update() from utils import plot_timeseries fig = plot_timeseries(data, var, data_name, all_vars, height, width) return gr.update(value=fig) def plot_bivariate(data, x, y, subset=None, trendline=True): plt.close('all') map_var = { "Year": "Year String", "Season": "Season", "Month": "Month String", "Day Of Week": "Day Of Week", "Weekend": "is_weekend", "Holiday": "is_holiday", "Rain": "Rain Bool", "SnowIce": "SnowIce Bool", "None": None, "": None, } subset = map_var[subset] from utils import plot_bivariate fig = plot_bivariate(data, x, y, subset, trendline) return gr.update(value=fig) def plot_seasonality(data, x, y, show_box=True, show_outliers=False): plt.close('all') map_var = { "Year": "Year String", "Season": "Season", "Month": "Month String", "Day Of Week": "Day Of Week", "Weekend": "is_weekend", "Holiday": "is_holiday", "Rain": "Rain Bool", "SnowIce": "SnowIce Bool", "None": None, } x = map_var[x] from utils import plot_seasonality fig = plot_seasonality(data, x, y, show_box, show_outliers) return gr.update(value=fig) def plot_correlations(data, covar, target="Target", lags=[0,1,2,3,4,5,6,7,8,13,14,15,21], method="pearson"): plt.close('all') from utils import plot_correlations fig = plot_correlations(data, covar, target, lags, method) return gr.update(value=fig) def plot_autocorr(data, var, apply=None): plt.close('all') from utils import plot_acf, plot_pacf time_series = data.loc[:, var].to_frame().copy() if apply: time_series[var] = time_series[var].apply(apply) fig, ax = plt.subplots(2, 1, figsize=(12, 8)) _ = plot_acf(time_series[var], lags=30, ax=ax[0]) _ = plot_pacf(time_series[var], lags=30, method="ols-adjusted", ax=ax[1]) _ = plt.suptitle(f"{var}", y=0.95) return gr.update(value=fig) def plot_all_correlations(data, data_name="weather", method="pearson"): plt.close('all') from utils import plot_all_correlations fig = plot_all_correlations(data, data_name, method) return fig def run_report(report_base, variable_name, report_category="full"): report_name = report_base + "_" + report_category iframe, _ = find_variable_data(reports[report_name], variable_name) return gr.update(value=iframe) def test_stationary(data, var): from utils import test_stationary df = test_stationary(data, var) return df def plot_interpolation(data): plt.close('all') from utils import plot_gust_interpolation fig = plot_gust_interpolation(data) return fig def plot_model_feature_importance(): plt.close('all') from utils import plot_final_feature_importance fig = plot_final_feature_importance(forecast_model) return fig def plot_final_predictions(): plt.close('all') from utils import predict_recurse next_7_day_prediction = predict_recurse(dataset, test, forecast_model) 
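    # utils.predict_recurse (not shown in this file) produces the 7-day forecast recursively:
    # per the Model Prediction tab, each day t is predicted one step at a time and fed back in
    # as a lag feature when predicting day t+1.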
    fig, ax = plt.subplots(figsize=(15, 5))
    data_final.loc[data_final.index[-7:], "Target"] = next_7_day_prediction
    ax = data_final.loc[data_final.index[-96:-6], "Target"].plot(label="Real", title="311 Service Volume: 7 Day Prediction")
    data_final.loc[data_final.index[-7:], "Target"].plot(label="Forecast", ax=ax)
    ax.legend()
    curr_fig = plt.gcf()
    plt.close()
    return curr_fig


def plot_train_split():
    plt.close('all')
    from utils import plot_train_split
    fig = plot_train_split(train, val)
    return fig


def plot_val_predictions():
    # NOTE: relies on a module-level preds_val (validation predictions) being defined elsewhere
    data = val.copy()
    data["Prediction"] = preds_val
    from utils import plot_predictions
    fig = plot_predictions(train, val, preds_val)
    return fig


curr_theme = gr.themes.Default(
    text_size=gr.themes.sizes.text_lg
)

with gr.Blocks(theme=curr_theme, js=dark_mode, css=open("custom.css", "r").read()) as app:
    title = gr.HTML("""

Point72 Case Study

""") with gr.Tabs() as pages: with gr.Tab("Overview") as toc_page: gr.Markdown("# My Point72 Case Study Results") gr.Markdown(""" * Please follow the tabs sequentially left to right to get the full story of my work * There will be many interactive parts where you will be able to test and view different parameters * This app may also be built and ran locally * This app is hosted and served from a cloud server VM Instance * Any questions please email me: davidna22@gmail.com """) with gr.Tab("Data Preprocessing") as data_preprocessing_page: with gr.Tab("Data Loading") as dp_overview: gr.HTML("

Loading the Data

") gr.Markdown("## Goal: Load the Data as efficiently as possible") gr.Markdown(""" * Using Pandas alone is **slow and inefficient**. * With small datasets, pandas is great because the API is robust. * With medium datasets, using a library like polars (a Rust based module with 10x pandas speed) is much faster. * As data gets even larger, multi-processing languages like Spark are required. * For this dataset, I use pandas for the weather data and polars for the 311 data. After the aggregation and merge, I revert back to pandas for API compatibility. """) with gr.Accordion("Code", open=False): gr.Code(load_code, language="python") with gr.Tab("Location Mapping") as dp_overview: src_doc = html.escape(open("figures/map1.html","r").read()) iframe1 = f'' src_doc = html.escape(open("figures/map2.html","r").read()) iframe2 = f'' src_doc = html.escape(open("figures/bounded_map.html","r").read()) iframe3 = f'' src_doc = html.escape(open("figures/final_map.html","r").read()) iframe4 = f'' gr.HTML("

Location Mapping for Both Datasets

") with gr.Row(elem_classes="map-legend"): gr.Markdown(""" **Legend:** * Red: Weather records * Blue: 311 Service records """, elem_classes="map-legend-text") with gr.Row(): with gr.Column(): gr.HTML("

Map of New York State

") map1 = gr.HTML(iframe1, elem_classes="map") with gr.Column(): gr.HTML("

Map of New York City

") map2 = gr.HTML(iframe2, elem_classes="map") with gr.Row(): gr.Markdown(""" Juxtaposing these two maps and seeing the approximate distributions of data observations, its easy to see the problem. The weather dataset encompasses a larger area than the 311 Service call dataset. Once this problem was diagnosed the solution was simple. First you find the max coordinate (Lat, Long) bounds from the 311 Service Dataset. Then, you just filter the weather dataset to only include points from within these bounds. This was one of my initial discoveries when analyzing the dataset and crucial to ensure congruity between the two. **Below you can see the bounding box I created and how the new weather data observations fit in this bounding box.** """) with gr.Row(): with gr.Column(): map3 = gr.HTML(iframe3, elem_classes="map") with gr.Column(): map4 = gr.HTML(iframe4, elem_classes="map") with gr.Accordion("Code", open=False): gr.Code(map_code, language="python") with gr.Tab("Variable Pruning") as var_pruning: gr.HTML("

How I pruned the datasets

") gr.Markdown("## Goal: Remove as many useless features as possible") gr.HTML("

Key Factors for Feature Removal

") gr.Markdown(""" * Percentage of missing data points * Distribution Imbalance * Irrelevance * Number of distinct categories * Another variable was chosen as replacement

NOTE: Look in the appendix for visualizations of individual variables """) droped_var_df = pd.read_excel("data/drop_vars.xlsx") gr.Dataframe( droped_var_df, wrap=True, label="Dropped Variables & Justification (Weather on Bottom)" ) with gr.Tab("Time Aggregation") as time_agg: gr.HTML("

Aggregate Data by Date

") gr.Markdown("## Goal: Aggregate data by Date") gr.HTML("

Issue 1: 311 Service data is not inherently formatted to provide Created Ticket Counts

") gr.HTML(""" """) gr.HTML("

Issue 2: Weather data is not aggregated by day

") gr.HTML(""" """) with gr.Tab("Weather Data: Imputation") as wd_impute: gr.HTML("

Data Imputation

") gr.Markdown("## Goal: Impute missing values in Weather Data") gr.HTML("

Issue: Weather data is incomplete, 49 days are missing in 2018

") gr.Markdown("#### Proposed Solution: Use a simple imputer to fill these missing days + 7 more days into the \"future\"") gr.HTML(""" """) gr.Markdown("Use plots below to view the plots used to help justify above reasoning") with gr.Accordion("Show Plots", open=False): impute_data = gr.State(wd_full_local) impute_choices = ["None"] impute_choices.extend(impute_cols) wd_impute_col = gr.Dropdown( choices=impute_choices, value="None", label="Choose a Variable to plot all imputation methods" ) wd_impute_plot = gr.Plot() wd_impute_col.change( plot_imputations, [wd_impute_col, impute_data], [wd_impute_plot] ) with gr.Tab("311: Closed Ticket Counting") as ct_date: gr.HTML("

Closed Ticket Feature

") gr.Markdown("## The Closed Ticket Feature is built from the Closed Date column similarly to how Created Date was used to generate new 311 Call Volume") gr.HTML("

Issue 1: Data Errors, Typos, and/or Null Values

") gr.HTML(""" """) gr.HTML("

Issue 2: Data Leakage - Future into Past

") gr.HTML(""" """) with gr.Accordion("Code", open=False): gr.Code(Closed_Ticket_Code, language="python") with gr.Tab("311: Categorical Grouping") as cat_groups: BERTopic = gr.State(BERTopic.load("models/BERTopic")) gr.HTML("

Categorical Features

") gr.HTML("

Issue 1: Categorical Features have too many categories

") gr.Markdown("#### Create a mapping of categories into groups to reduce total number (Viewable at the bottom of the page)") gr.HTML(""" """) gr.HTML("

Issue 2: How do we aggregate these features by day when each day contains multiple repeated categories?

") gr.Markdown("#### One Hot Encode and Sum per category") gr.HTML(""" """) with gr.Accordion("View Feature Groups", open=False): with gr.Accordion("Borough", open=False): gr.JSON(json.loads(open("code/Borough.json", "r").read())) with gr.Accordion("Agency", open=False): gr.JSON(open("code/Agency.json", "r").read()) with gr.Accordion("Descriptor", open=False): gr.Dataframe(topic_model.get_topic_info().loc[:, ["Count", "Name", "Representation"]]) gr.Plot(topic_model.visualize_barchart(list(range(-1,6,1)))) with gr.Tab("All Code") as code_preprocess: gr.Markdown("# View Full Code for building Weather Data") with gr.Accordion(open=False): gr.Code(open("code/build_weather.py", "r").read()) gr.Markdown("# View Full Code for building 311 Service Data") with gr.Accordion(open=False): gr.Code(open("code/build_service.py", "r").read()) with gr.Tab("Exploratory Data Analysis", id="eda_page") as eda_page: bivar_data = gr.State(data_merged_eda) with gr.Tab("Overview", id="eda_overview") as eda_overview: gr.Markdown("# The EDA Section is intended to be a set of interactive visualizations") gr.Markdown("The tabs are interactive plots and tables that were used to generate the key insights below.") gr.HTML("

Key Insights

") gr.HTML(""" """) with gr.Tab("Univariate", id="eda_univar") as eda_univar: with gr.Tab("Weather Data") as eda_uni_weather: eda_univar_weatherdf = gr.State(weather_full_df) gr.Markdown("# Use the Interactive plot below") eda_uni_weather_name = gr.State("Weather") weather_vars = [ "", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth', 'MinTemp', 'MaxTemp', 'MaxSustainedWind' ] select_weather_var = gr.Dropdown( choices=weather_vars, value="", label="Select a Variable to View" ) weather_uniplot = gr.Plot() select_weather_var.change( plot_timeseries, inputs=[ eda_univar_weatherdf, select_weather_var, eda_uni_weather_name ], outputs=[ weather_uniplot ] ) with gr.Tab("311 Service Data") as eda_uni_weather: eda_univar_servicedf = gr.State(data_merged_eda) gr.Markdown("# Use the Interactive plot below") gr.Markdown("**NOTE: Target is the count of 311 service records**") eda_uni_service_name = gr.State("Weather") service_vars = [ "", 'Target', 'num_closed_tickets', # Agency Group Counts 'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health', 'AG_Parks', 'AG_Security', 'AG_Transportation', 'AG_Other', # Borough Counts 'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN', 'Borough_QUEENS', 'Borough_STATEN ISLAND', 'Borough_OTHER', # Descriptor Group Counts 'DG_damaged_sign_sidewalk_missing', 'DG_english_emergency_spanish_chinese', 'DG_exemption_commercial_tax_business', 'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead', 'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition', 'DG_water_basin_litter_missed' ] select_service_var = gr.Dropdown( choices=service_vars, value="", label="Select a Variable to View" ) service_uniplot = gr.Plot() select_service_var.change( plot_timeseries, inputs=[ eda_univar_servicedf, select_service_var, eda_uni_service_name ], outputs=[ service_uniplot ] ) with gr.Tab("Bivariate", id="eda_bivar") as eda_bivar: gr.Markdown("# Use the Interactive plot below") gr.Markdown("Use this tab to view relationships between the Target variable (number of tickets created daily) and a Covariate") with gr.Column(): with gr.Row() as bivar_params: bivar_dist_target = gr.Dropdown( choices=["Target"], value="Target", label="Target Variable (One option)" ) all_bivars = ['num_closed_tickets', "Agency", "Borough", "Descriptor"] all_bivars.extend(weather_vars) all_bivars = sorted(all_bivars) all_bivars = all_bivars[1:] bivar_dist_cov = gr.Dropdown( choices=all_bivars, value="MeanTemp", label="Select Covariate" ) bivar_trendline = gr.Dropdown( choices=[True, False], value=True, label="Graph with OLS Trendline" ) with gr.Accordion("Add Seasonality", open=False): bivar_subset = gr.Dropdown( choices=["None", "Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday"], value="None", label="Seasonality Options (Disabled for Agency, Borough and Descriptor)" ) bivar_submit = gr.Button("Run") bivar_plot = gr.Plot() bivar_submit.click( plot_bivariate, [bivar_data, bivar_dist_cov, bivar_dist_target, bivar_subset, bivar_trendline], bivar_plot ) with gr.Tab("Seasonality") as bivar_season: gr.Markdown("## Exploring the affect of Seasonality") with gr.Row() as bivar_season_params: bivar_season_var = gr.Dropdown( choices=["Target", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth', 'MinTemp', 'MaxTemp', 'MaxSustainedWind'], value="Target", label="Variable" ) bivar_season_cov = gr.Dropdown( choices=["Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday", "Rain", "SnowIce"], value="Year", label="Seasonality" ) with 
gr.Column(): season_boxplot = gr.Checkbox(value=True, label="Show Boxplot") season_outlier = gr.Checkbox(value=False, label="Show Outliers") bivar_season_btn = gr.Button("Run") bivar_season_plot = gr.Plot() bivar_season_btn.click( plot_seasonality, [bivar_data, bivar_season_cov, bivar_season_var, season_boxplot, season_outlier], [bivar_season_plot] ) with gr.Tab("Correlation") as corr: with gr.Tab("Weather Correlations") as corr_weather: gr.Plot(plot_all_correlations(data_merged_eda, "weather", method="pearson")) with gr.Tab("311 Service Correlations") as corr_service: gr.Plot(plot_all_correlations(data_merged_eda, "service", method="pearson")) with gr.Tab("Lag Correlations") as corr_dynamic: gr.Markdown("## Use this to dynamically view correlations based on Lag") gr.Markdown("By Default, we will analyze lags of [0,1,2,3,4,5,6,7,8,13,14,15,21] days for chosen variable") gr.Markdown("Scroll Down For AutoCorrelation Graphs") with gr.Row(): corr_vars = [ "None", 'Target', 'num_closed_tickets', # Weather Variables 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth', 'MinTemp', 'MaxTemp', 'MaxSustainedWind', # Agency Group Counts 'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health', 'AG_Parks', 'AG_Security', 'AG_Transportation', 'AG_Other', # Borough Counts 'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN', 'Borough_QUEENS', 'Borough_STATEN ISLAND', 'Borough_OTHER', # Descriptor Group Counts 'DG_damaged_sign_sidewalk_missing', 'DG_english_emergency_spanish_chinese', 'DG_exemption_commercial_tax_business', 'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead', 'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition', 'DG_water_basin_litter_missed' ] corr_vars = gr.Dropdown( choices=corr_vars, value="Target", label="Variable" ) corr_btn = gr.Button("Run") corr_plot = gr.Plot() autocorr_plot = gr.Plot() corr_btn.click( plot_correlations, [bivar_data, corr_vars], [corr_plot] ) corr_btn.click( plot_autocorr, [bivar_data, corr_vars], [autocorr_plot] ) with gr.Tab("Feature Engineering") as feature_engineer_page: with gr.Tab("Feature Selection") as feature_select: gr.HTML("

Select Features Based on EDA

") gr.Markdown("### Below is the logic used in our model feature selection") gr.HTML(""" """) with gr.Accordion("Show Final Variable List", open=False): gr.JSON(json.loads(open("code/all_vars.json","r").read())) with gr.Tab("Feature Preprocessing") as feature_prep: data_feature_prep = gr.State(data_preprocess) gr.HTML("

Preprocess Features

") gr.HTML("

Issue 1: Missing Values

") gr.HTML(""" """) with gr.Accordion("Show Interpolation Plots", open=False): gr.Plot(plot_interpolation(data_preprocess)) gr.HTML("

Issue 2: Remove Non-Stationarity

") gr.HTML(""" """) with gr.Accordion("View Results Below", open=False): gr.Markdown("### MinTemp (Log) Tests Before and After Transformation") with gr.Row(): with gr.Column(): gr.Dataframe(test_stationary(data_preprocess, "MinTemp_old"), label="MinTemp No Augments") with gr.Column(): gr.Dataframe(test_stationary(data_preprocess, "MinTemp"), label="Log + 7 Day Lag Differencing") gr.Markdown("### DewPoint Tests Before and After Transformation") with gr.Row(): with gr.Column(): gr.Dataframe(test_stationary(data_preprocess, "DewPoint_old"), label="DewPoint No Augments") with gr.Column(): gr.Dataframe(test_stationary(data_preprocess, "DewPoint"), label="7 Day Lag Differencing") with gr.Tab("Feature Engineering") as feature_eng: with gr.Tab("Past Covariates") as fe_past: gr.HTML("

Past Covariate Features

") gr.Markdown(""" * Past Covariates are datapoints that are implied to be only related to past information * For Instance, using past sales of product B to predict futures sales of product A * There are two ways to use past covariates * *Option 1:* Build a multi-variate forecast to predict these variables simultaneously * *Option 2:* Use a sliding window and lags to provide past data (especially for multi-step forecasts) """) gr.Markdown("**I will use Option 2 to avoid building a very complex multi-variate model**") gr.HTML("

Issue 1: Leaking Future Data into the past

") gr.Markdown(""" * By using lags, I can shift my data in a way to avoid leaking past data into the future * For predicting 7 days into the future, I must lag my data by at least 7 days * Use a rolling window that will reset over time """) gr.HTML("

Issue 2: Curse of Dimensionality

") gr.Markdown(""" * Possible to use many variations of lags, rolling and differences to generate many features * Too many features leads to the curse of dimensionality, i.e. Overfitting * Thus, I keep my Feature Set as simple as possible """) gr.Markdown(""" ### Feature Set * Lags: 7D, 14D, 21D * Rolling (Shifted 7 Days forward): Mean of 14D (14 because mean(Created - Closed Date) = 13 days) * Differencing (7D difference = 7D lag - 14D lag): 7D """) with gr.Accordion("Open to view implementation code", open=False): gr.Code(open("code/past_features.py","r").read()) with gr.Tab("Future Covariates") as fe_past: gr.HTML("

Future Covariate Features

") gr.Markdown(""" * Future Covariates are data that I have about the future * For Instance, I can use the projected revenue of Company A to predict daily sales * For Future Covariates, I do not need to shift variables. I will provide a shift up to 2 days. * I apply a rolling and expanding window as more features * Also, I use mean and min to follow the logic learned in EDA. Minimum temp values seem to be more impactful on 311 volume """) gr.HTML("

Issue 1: Curse of Dimensionality

") gr.Markdown(""" * Similar to the Past Covaraiates, I keep my features as simple as possible with as little as possible * The more features, the more we may overfit """) gr.Markdown(""" ### Feature Set * Lags: 0D, 1D, 2D * Rolling: Mean & Min of last 14D * Expanding Window: Max, Min (min-length of 14) * Differencing already performed to remove trends """) with gr.Accordion("Open to view implementation code", open=False): gr.Code(open("code/future_features.py","r").read()) with gr.Tab("Target Variable") as fe_past: gr.HTML("

311 Service Calls Features

") gr.Markdown(""" * For providing feature transformations of our Target, we can follow a similar process as above * Main Difference: Lags of < prediction window need to be recomputed at each iteration * So, for predicting at time (t+1) we need the predicted value at time (t) * For a recursive prediction model, this means the model cannot make batch predictions without iterating """) gr.HTML("

Issue 1: More variables increase complexity for prediction

") gr.Markdown(""" * The more features, the more overfitting & more computation * As I will use a recursive model, these values must be recomputed at each step t+1 * In favor of a less complex model, I will choose as minimal features as possible (excluding rolling features as its prone to error with recalculation) """) gr.HTML("

Issue 2: Leaking Future Data into the Past

") gr.Markdown(""" * Must be careful about how these features are computed * For instance, for rolling mean, I would shift the data up by 1 lag first then compute the rolling sum * For differencing, a 7D lag difference is really the 1D - 8D lag. (For t=8, 7D diff = t7-t1 not t8-t2) """) gr.Markdown(""" ### Feature Set * Lags: 1D, 6D, 7D, 8D, 14D, 21D (based on highest correlations and weekly seasonality) * Differencing: 7D, 14D """) with gr.Accordion("Open to view implementation code", open=False): gr.Code(open("code/target_features.py","r").read()) with gr.Tab("Forecast Model") as model_select_train_page: with gr.Tab("Splitting the data") as model_data_split: gr.HTML("

Splitting Time-Series Data

") gr.HTML(""" """) gr.Markdown("#### As an example, I provide a graph showing exactly how I split my data") gr.Plot(plot_train_split()) with gr.Tab("Model Selection") as model_data_split: gr.HTML("

Choosing the Right Model

") gr.Markdown("### Types of Forecast Models for Multi-Step Prediction") gr.HTML(""" """) gr.Markdown("### My Model Choice: XGBoost") gr.HTML(""" """) with gr.Tab("Model Training") as model_data_split: gr.HTML("

Training the Model

") gr.HTML("

Issue 1: Overfitting

") gr.HTML(""" """) gr.HTML("

Issue 2: Choosing a Metric

") gr.HTML(""" """) with gr.Tab("Model Prediction") as model_data_split: gr.HTML("

Recursive Model Prediction

") gr.Markdown(""" * Below is the code I wrote to implement the Recursive prediction explained in previous tabs * Predictions are made one step at a time, where the prediction t depends on prediction t-1 * To view the final predictions made by the model see below """) gr.Code(open("code/recurse_predict.py","r").read()) with gr.Accordion("View 7 Day Model Forecast", open=False): gr.Plot(plot_final_predictions()) with gr.Tab("Model Evaluation") as model_eval_page: gr.HTML("

Forecast Results

") gr.Markdown("Overall, the model seemed to have performed pretty well. The MAPE is also <10% for both Validation and Training sets.") gr.Markdown("The model did suffer from a low validation R2, but this was difficult to resolve without compromising overall performance of the model.") gr.Markdown("The predictions seem to visually pass most backtests, which can be viewed in the graph below.") with gr.Accordion("Model Prediction Scores", open=False): gr.JSON({"Train R2": r2_train, "Train MAPE": mape_train, "Validation R2": r2_val, "Validation MAPE": mape_val}) gr.Image("figures/model_performance.png", show_download_button=False) with gr.Tab("Feature Importance") as model_eval_page: gr.HTML("

Feature Importance

") gr.Markdown(""" * Below you can view the feature importance metrics from the XGBoost model * It seems there is significant impact of the weather variables on 311 Service Call Volume * Interestingly, it seems some categories were more impactful than others as well """) gr.Plot(plot_model_feature_importance()) with gr.Tab("Future Work & Limitations") as future_limitations_page: gr.Markdown("# Future Work") gr.Markdown(""" * **Multi-Variate Time Series Forecasting** rather than imputing values naively * Testing more kinds of models such as LightGBM * Robustly testing parameters of current model using GridSearchCV * Comparing performance of my forecast model to others * More Data! Having more 311 Call data may help find other indicators """) gr.Markdown("# Future Deployments") gr.Markdown(""" * Containerize the model and load onto an API for ingestion * Containerize data preprocessing and load into a Spark Cluster * Create triggers and view tables to verify data preprocessing * Create functions to monitor model performance """) with gr.Tab("Appendix") as future_limitations_page: with gr.Tab("Weather Data Analysis") as dp_weather: dp_weather_state = gr.State("weather") with gr.Column(): with gr.Row(): dp_weather_category = gr.Dropdown( choices=["2011-2018", "2016-2018"], value="2011-2018", label="Time Range" ) dp_weather_var = gr.Dropdown( choices = ["MeanTemp", "MinTemp", "MaxTemp", "DewPoint", "Percipitation", "WindSpeed", "MaxSustainedWind", "Gust", "Rain", "SnowDepth", "SnowIce"], value = "MeanTemp", label = "Variable" ) dp_weather_btn = gr.Button("Run") dp_weather_report = gr.HTML(value=iframe_dp_weather) dp_weather_btn.click( run_report, [dp_weather_state, dp_weather_var, dp_weather_category], dp_weather_report, ) with gr.Tab("Service Data Analysis") as dp_service: dp_service_state = gr.State("service") dp_service_category = gr.State("full") with gr.Column(): dp_service_var = gr.Dropdown( choices = [ "Created Date", "Closed Date", "Agency", "Agency Name", "Complaint Type", "Descriptor", "Location Type", "Landmark", "Facility Type", "Status", "Community Board", "Borough", "Open Data Channel Type", "Park Facility Name", "Park Borough", "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location", "Bridge Highway Name", "Bridge Highway Direction", "Road ramp", "Bridge Highway Segment" ], value = "Created Date", label = "Select Variable and Run" ) dp_service_btn = gr.Button("Run") dp_service_report = gr.HTML(value=iframe_dp_service) dp_service_btn.click( run_report, [dp_service_state, dp_service_var, dp_service_category], dp_service_report, ) def main(): app.launch(share=False) return app if __name__=="__main__": main()