Spaces:
Running
Running
File size: 67,052 Bytes
dad00c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 |
import sys
import inspect
import math
import pandas as pd
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib
import utils
from matplotlib import pyplot as plt
import sklearn
import gradio as gr
from IPython.display import display
import plotly.figure_factory as ff
from sklearn.impute import SimpleImputer
from utils import create_seasons
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from bertopic import BERTopic
import html
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from utils import find_variable_data, build_temporal_features, create_datetime, map_vals
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio
import folium
import gc
import json
from utils import MyNaiveImputer
# Select matplotlib's 'agg' backend; figures are returned to Gradio components
# rather than shown in a GUI window.
matplotlib.use('agg')
# JavaScript source handed to gr.Blocks(js=...). It reads the page URL and, if
# the `__theme` query parameter is not 'dark', sets it to 'dark' and navigates
# to the updated URL.
dark_mode = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
# Imputation Variables
# Read the aggregated weather table; reset_index() turns the index loaded from
# column 0 back into an ordinary column before "Datetime" is parsed.
wd_full_local = pd.read_csv(
    "data/weather_aggregated_2010-2018.csv", index_col=0
).reset_index()
wd_full_local["Datetime"] = pd.to_datetime(
    wd_full_local["Datetime"], format="%Y-%m-%d"
)
wd_full_local = build_temporal_features(wd_full_local, "Datetime")

# Weather columns that receive imputed values.
impute_cols = [
    'MeanTemp', 'MinTemp', 'MaxTemp', 'DewPoint', 'Percipitation',
    'WindSpeed', 'MaxSustainedWind', 'Gust', 'Rain', 'SnowDepth', 'SnowIce',
]

# 49 + 7 time steps: the app text describes 49 missing days in 2018 plus
# 7 extra days imputed "into the future".
my_imputer = MyNaiveImputer(wd_full_local, time_steps=49 + 7)

# One imputed frame per strategy, keyed by the label used in plot legends.
imputers = {
    label: my_imputer.impute_all(impute_cols, strategy=strategy)
    for label, strategy in (
        ("Mean", "mean"),
        ("Median", "median"),
        ("Max", "max"),
        ("Min", "min"),
    )
}
# Merged Data Variables
data_merged = create_datetime(
    pd.read_csv("data/data_merged_full.csv", index_col=0),
    "Datetime",
    format="%Y-%m-%d",
)
# Add string/boolean helper columns (same order as before), then index the
# frame by its "Datetime" column.
data_merged = data_merged.assign(**{
    "Day Of Week": data_merged["Datetime"].dt.day_name(),
    "Year String": data_merged["Year"].astype(str),
    "Month String": data_merged["Datetime"].dt.month_name(),
    "Rain Bool": data_merged["Rain"].astype(bool),
    "SnowIce Bool": data_merged["SnowIce"].astype(bool),
}).set_index("Datetime")

# Everything up to and including 2018 (independent copy).
weather_full_df = data_merged.loc[data_merged["Year"] <= 2018].copy()
# 2016 through 2018 inclusive; note this one is a selection, not a .copy().
data_merged_eda = data_merged.loc[data_merged["Year"].between(2016, 2018)]
# Feature Preprocessing
# Work on an independent copy of the rows with Year >= 2016 so the derived
# columns below are not written into data_merged.
data_preprocess = data_merged.loc[(data_merged["Year"] >= 2016)].copy()
# Gust: store four interpolated variants side by side (linear, order-3 spline,
# order-5 spline, quadratic). These must be computed before "Gust" itself is
# overwritten, because each one reads the original column with its gaps.
data_preprocess["Gust_lin"] = data_preprocess["Gust"].interpolate(method="linear")
data_preprocess["Gust_spline3"] = data_preprocess["Gust"].interpolate(method="spline", order=3)
data_preprocess["Gust_spline5"] = data_preprocess["Gust"].interpolate(method="spline", order=5)
data_preprocess["Gust_quad"] = data_preprocess["Gust"].interpolate(method="quadratic")
# The linear variant replaces the original "Gust" column.
data_preprocess["Gust"] = data_preprocess["Gust"].interpolate(method="linear")
# DewPoint: keep the raw values in "DewPoint_old", then replace the column with
# its difference against the value 7 rows earlier (the first 7 rows become NaN).
data_preprocess["DewPoint_old"] = data_preprocess["DewPoint"]
data_preprocess["DewPoint_diff7d"] = data_preprocess["DewPoint"] - data_preprocess["DewPoint"].shift(7)
data_preprocess["DewPoint"] = data_preprocess["DewPoint_diff7d"]
# MinTemp: keep the raw values in "MinTemp_old", apply log1p, then take the same
# 7-row difference on the logged series and store it back as "MinTemp".
# NOTE(review): log1p yields NaN/-inf for values <= -1 — confirm MinTemp's range.
data_preprocess["MinTemp_old"] = data_preprocess["MinTemp"]
data_preprocess["MinTemp_log"] = data_preprocess["MinTemp"].apply(np.log1p)
data_preprocess["MinTemp_log_diff7d"] = data_preprocess["MinTemp_log"] - data_preprocess["MinTemp_log"].shift(7)
data_preprocess["MinTemp"] = data_preprocess["MinTemp_log_diff7d"]
# Final Preprocessed Variables
data_final = create_datetime(
    pd.read_csv("data/data_final.csv"), "Datetime", format="%Y-%m-%d"
).set_index("Datetime")

# The last 7 rows are held out as the test window; the remaining rows are cut
# in row order at the 75% mark into train and validation parts.
dataset, test = data_final[:-7], data_final[-7:]
split_point = int(len(dataset) * 0.75)
train, val = dataset[:split_point], dataset[split_point:]

# Separate the feature columns from the "Target" column for each part.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (part.drop(columns="Target"), part["Target"]) for part in (train, val, test)
)

# Load the saved XGBoost regressor from disk.
forecast_model = xgb.XGBRegressor()
forecast_model.load_model("models/final_model.json")
# Current Predictions
# Hard-coded R^2 / MAPE figures for the train and validation parts.
# The former `global r2_val, ...` statement was dropped: at module level every
# assignment already creates a module global, so the statement had no effect.
r2_train = 0.8691238468740025
mape_train = 0.04889510400934162
r2_val = 0.6072642783665692
# NOTE(review): this value is digit-for-digit identical to r2_val above, which
# looks like a copy/paste slip. Left unchanged because the real validation MAPE
# is not available here — confirm and correct.
mape_val = 0.6072642783665692
# Initial Variables
def _load_report(path):
    """Parse the HTML report at *path* and return the BeautifulSoup tree.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the document has been parsed (previously the handles were never closed).
    """
    with open(path) as report_file:
        return BeautifulSoup(report_file, "html.parser")


# Parsed HTML reports, keyed by "<report_base>_<report_category>" as looked up
# in run_report().
reports = {
    "weather_2011-2018": _load_report("reports/weather_data_ts.html"),
    "weather_2016-2018": _load_report("reports/weather_data_after2016_ts.html"),
    "service_full": _load_report("reports/311_data_1.html"),
}
# Initial iframe content for one variable of each report; the second value
# returned by find_variable_data is not used here.
iframe_dp_weather, _ = find_variable_data(reports["weather_2011-2018"], "MeanTemp")
iframe_dp_service, _ = find_variable_data(reports["service_full"], "Created Date")
# Code Variables to show in app
# These are plain strings rendered with gr.Code(..., language="python") inside
# "Code" accordions; they are displayed, never executed by this module.
# load_code: snippet shown on the "Data Loading" tab.
load_code = """
# Load Weather Data in pandas
# No need for polars because data is sufficiently small
weather_data = pd.read_csv("data/weather_NY_2010_2018Nov.csv")
# Load Service data in polars for speed optimization
# Loading directly with polars leads to errors
# Load in pandas then convert to polars
service_data_pd = pd.read_csv("data/311-2016-2018.csv")
assert service_data_pd["Unique Key"].nunique() == len(service_data_pd)
# This casting is done just because of some errors when loading pl from pandas
service_data_pd["Incident Zip"] = service_data_pd["Incident Zip"].astype("string")
service_data_pd["BBL"] = service_data_pd["BBL"].astype("string")
service_data = pl.DataFrame(service_data_pd)
# Clear some ram
del service_data_pd
gc.collect()"""
# map_code: display-only snippet shown on the "Location Mapping" tab via
# gr.Code. It filters the weather rows to the latitude/longitude bounds of the
# 311 service data.
map_code = """
lat_min = service_data["Latitude"].min()
lat_max = service_data["Latitude"].max()
long_min = service_data["Longitude"].min()
long_max = service_data["Longitude"].max()
mincon_lat = weather_data["Latitude"] >= lat_min
maxcon_lat = weather_data["Latitude"] <= lat_max
mincon_long = weather_data["Longitude"] >= long_min
maxcon_long = weather_data["Longitude"] <= long_max
wd_localized = weather_data.loc[mincon_lat & maxcon_lat & mincon_long & maxcon_long]
"""
# Closed_Ticket_Code: display-only snippet shown on the
# "311: Closed Ticket Counting" tab via gr.Code.
# The literal is a raw string on purpose: in a normal string the trailing
# backslash-newline after `group_by([...])` is swallowed (gluing two snippet
# lines together in the rendered code) and each "\ " is an invalid escape
# sequence that makes Python emit a warning. As a raw string every backslash is
# shown exactly as written.
Closed_Ticket_Code = r"""
# Fill null and Typos with mean time diff (13 days)
service_data = service_data.with_columns(
Closed_Date_New = pl.when(pl.col("Created Date") - pl.col("Closed Date") > pl.duration(days=1))
.then(pl.col("Created Date") + pl.duration(days=mean_diff))
.otherwise(pl.col("Closed Date")).fill_null(pl.col("Created Date") + pl.duration(days=mean_diff))
)
# Check for no null values
assert service_data["Closed_Date_New"].is_null().sum() == 0
# Pair wise GroupBy and Filter
closed_tickets = service_data.group_by(["Closed_Date_New", "Created Date"]) \
.agg((pl.when(pl.col("Created Date") <= pl.col("Closed_Date_New")).then(1).otherwise(0)).sum().alias("count")) \ # FILTER Created Date < Closed Date Here
.sort("Closed_Date_New") \ # Sort by new column Closed Date New
.filter((pl.col("Closed_Date_New").dt.year() >= 2016) & (pl.col("Closed_Date_New").dt.year() < 2019)) \ # Filter for only Closed Dates in time window
.group_by("Closed_Date_New").agg(pl.col("count").sum().alias("num_closed_tickets")) # Final Group By Closed date after filtering
ct_df = closed_tickets.with_columns(
pl.col("num_closed_tickets") # Rename Column
)
"""
# BERTopic model loaded from disk once at import time; used further down to
# show the topic table and bar chart for the Descriptor groups.
# (The former module-level `global topic_model` statement had no effect and
# was removed.)
topic_model = BERTopic.load("models/BERTopic")
def plot_imputations(var, data, imputers=imputers):
    """Plot the observed series for *var* together with every imputed variant.

    Args:
        var: Column name to plot. ``None``, ``""`` and the dropdown placeholder
            ``"None"`` all mean "nothing selected".
        data: DataFrame with a "Datetime" column and a *var* column; only the
            last 800 rows are drawn as the "Actual" line.
        imputers: Mapping of method label -> DataFrame holding "Datetime" and
            *var* columns. Defaults to the module-level ``imputers`` dict as it
            exists when this function is defined.

    Returns:
        ``gr.update(value=fig)`` with the matplotlib figure, or a no-op
        ``gr.update()`` when no variable is selected.
    """
    plt.close('all')
    # The dropdown wired to this callback offers "None" as a choice, which is
    # not a column name; without this guard data["None"] raises KeyError.
    if var in (None, "", "None"):
        return gr.update()
    fig = plt.figure(figsize=(15, 5))
    plt.plot(data["Datetime"][-800:], data[var][-800:], label="Actual")
    plt.title(f"{var} Imputation")
    for method, imputed in imputers.items():
        plt.plot(imputed["Datetime"], imputed[var], label=method)
    plt.legend()
    return gr.update(value=fig)
def plot_timeseries(data, var, data_name="My", all_vars=None, height=800, width=600):
    """Gradio callback: draw the time-series figure for one variable.

    Args:
        data: Data passed through to ``utils.plot_timeseries``.
        var: Variable name; the empty string means "nothing selected".
        data_name: Label passed through to the plotting helper.
        all_vars: Optional list passed through to the helper. ``None`` (the
            default) is replaced by a fresh empty list on every call, so calls
            never share one mutable default object.
        height: Passed through to the helper.
        width: Passed through to the helper.

    Returns:
        ``gr.update(value=fig)``, or a no-op ``gr.update()`` when *var* is "".
    """
    plt.close('all')
    if var == "":
        return gr.update()
    # Imported under an alias so the helper is not confused with this wrapper,
    # which has the same name.
    from utils import plot_timeseries as _plot_timeseries
    if all_vars is None:
        all_vars = []
    fig = _plot_timeseries(data, var, data_name, all_vars, height, width)
    return gr.update(value=fig)
def plot_bivariate(data, x, y, subset=None, trendline=True):
    """Gradio callback: draw the bivariate figure of *y* against *x*.

    Args:
        data: Data passed through to ``utils.plot_bivariate``.
        x: Passed through to the helper.
        y: Passed through to the helper.
        subset: UI label of the grouping variable ("Year", "Season", ...).
            ``None``, ``"None"`` and ``""`` all mean "no grouping".
        trendline: Passed through to the helper.

    Returns:
        ``gr.update(value=fig)`` with the figure built by the helper.

    Raises:
        KeyError: If *subset* is a label that is not in the mapping below.
    """
    plt.close('all')
    # UI label -> column name in the merged data.
    map_var = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
        "": None,
    }
    # The declared default (None) is not a key of map_var, so translate it
    # explicitly instead of failing with KeyError when subset is omitted.
    subset = None if subset is None else map_var[subset]
    from utils import plot_bivariate as _plot_bivariate
    fig = _plot_bivariate(data, x, y, subset, trendline)
    return gr.update(value=fig)
def plot_seasonality(data, x, y, show_box=True, show_outliers=False):
    """Gradio callback: draw the seasonality figure of *y* grouped by *x*.

    *x* is a UI label that is translated to a column name (or ``None`` for
    "None") before the call is forwarded to ``utils.plot_seasonality``.
    An unknown label raises ``KeyError``.

    Returns:
        ``gr.update(value=fig)`` with the figure built by the helper.
    """
    plt.close('all')
    label_to_column = {
        "Year": "Year String",
        "Season": "Season",
        "Month": "Month String",
        "Day Of Week": "Day Of Week",
        "Weekend": "is_weekend",
        "Holiday": "is_holiday",
        "Rain": "Rain Bool",
        "SnowIce": "SnowIce Bool",
        "None": None,
    }
    group_column = label_to_column[x]
    from utils import plot_seasonality as _draw_seasonality
    return gr.update(
        value=_draw_seasonality(data, group_column, y, show_box, show_outliers)
    )
def plot_correlations(data, covar, target="Target", lags=None, method="pearson"):
    """Gradio callback: draw the lagged-correlation figure for one covariate.

    Args:
        data: Data passed through to ``utils.plot_correlations``.
        covar: Covariate name passed through to the helper.
        target: Target column name passed through to the helper.
        lags: Lags passed through to the helper. ``None`` (the default) means
            ``[0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 21]``; a fresh list is
            built per call so no mutable default is shared between calls.
        method: Correlation method name passed through to the helper.

    Returns:
        ``gr.update(value=fig)`` with the figure built by the helper.
    """
    plt.close('all')
    if lags is None:
        lags = [0, 1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 21]
    from utils import plot_correlations as _plot_correlations
    fig = _plot_correlations(data, covar, target, lags, method)
    return gr.update(value=fig)
def plot_autocorr(data, var, apply=None):
    """Gradio callback: ACF and PACF panels (30 lags each) for column *var*.

    Args:
        data: DataFrame containing the column *var*.
        var: Column to analyse; also used as the figure title.
        apply: Optional callable applied element-wise to the column before
            plotting (skipped when falsy).

    Returns:
        ``gr.update(value=fig)`` with a two-row figure: ``plot_acf`` on the
        first axes and ``plot_pacf`` (method "ols-adjusted") on the second.
    """
    plt.close('all')
    from utils import plot_acf, plot_pacf
    # Single-column copy so the optional transform never touches the input.
    series_frame = data.loc[:, var].to_frame().copy()
    if apply:
        series_frame[var] = series_frame[var].apply(apply)
    fig, (acf_axis, pacf_axis) = plt.subplots(2, 1, figsize=(12, 8))
    plot_acf(series_frame[var], lags=30, ax=acf_axis)
    plot_pacf(series_frame[var], lags=30, method="ols-adjusted", ax=pacf_axis)
    plt.suptitle(f"{var}", y=0.95)
    return gr.update(value=fig)
def plot_all_correlations(data, data_name="weather", method="pearson"):
    """Close open figures and return ``utils.plot_all_correlations(...)``.

    All three arguments are forwarded unchanged; the helper's figure is
    returned directly (not wrapped in ``gr.update``).
    """
    plt.close('all')
    from utils import plot_all_correlations as _render_all
    return _render_all(data, data_name, method)
def run_report(report_base, variable_name, report_category="full"):
    """Gradio callback: show one variable's section of a parsed report.

    The report is looked up in the module-level ``reports`` dict under the key
    ``"<report_base>_<report_category>"``; an unknown key raises ``KeyError``.

    Returns:
        ``gr.update(value=iframe)`` where ``iframe`` is the first value
        returned by ``find_variable_data``.
    """
    report_key = "_".join((report_base, report_category))
    section = find_variable_data(reports[report_key], variable_name)
    return gr.update(value=section[0])
def test_stationary(data, var):
    """Return the result of ``utils.test_stationary(data, var)`` unchanged."""
    from utils import test_stationary as _run_stationarity_test
    return _run_stationarity_test(data, var)
def plot_interpolation(data):
    """Close open figures and return ``utils.plot_gust_interpolation(data)``."""
    plt.close('all')
    from utils import plot_gust_interpolation as _render_interpolation
    return _render_interpolation(data)
def plot_model_feature_importance():
    """Return the feature-importance figure for the loaded forecast model.

    Closes open figures, then forwards the module-level ``forecast_model`` to
    ``utils.plot_final_feature_importance`` and returns its result.
    """
    plt.close('all')
    from utils import plot_final_feature_importance as _render_importance
    return _render_importance(forecast_model)
def plot_final_predictions():
    """Plot the 7-step forecast next to the recent history of "Target".

    The forecast comes from ``utils.predict_recurse(dataset, test,
    forecast_model)`` and replaces the last 7 values of a *copy* of
    ``data_final["Target"]``. Earlier versions wrote the forecast into
    ``data_final`` itself, so every call overwrote the real target values in
    the shared frame (and in anything sharing memory with its last rows).

    The "Real" line covers positions -96..-7 of that series and the "Forecast"
    line positions -7..-1, so both share the point at -7 exactly as before and
    the two lines join up.

    Returns:
        The matplotlib figure, already closed in pyplot's registry.
    """
    plt.close('all')
    from utils import predict_recurse
    next_7_day_prediction = predict_recurse(dataset, test, forecast_model)
    fig, ax = plt.subplots(figsize=(15, 5))
    # Work on a private copy so the module-level data stays untouched.
    target = data_final["Target"].copy()
    target.loc[target.index[-7:]] = next_7_day_prediction
    target.loc[target.index[-96:-6]].plot(
        label="Real", title="311 Service Volume: 7 Day Prediction", ax=ax
    )
    target.loc[target.index[-7:]].plot(label="Forecast", ax=ax)
    ax.legend()
    # Drop the figure from pyplot's registry; the object itself is returned.
    plt.close(fig)
    return fig
def plot_train_split():
    """Return ``utils.plot_train_split(train, val)`` after closing open figures.

    ``train`` and ``val`` are the module-level train/validation frames.
    """
    plt.close('all')
    from utils import plot_train_split as _render_split
    return _render_split(train, val)
def plot_val_predicitons():
    """Return ``utils.plot_predictions(train, val, preds_val)``.

    The previous body also built a copy of ``val`` with a "Prediction" column
    that was never used; that dead work has been removed.

    NOTE(review): ``preds_val`` is not defined in the part of the file reviewed
    here — presumably it is assigned at module level elsewhere; confirm, since
    otherwise this raises NameError when called.
    """
    from utils import plot_predictions
    return plot_predictions(train, val, preds_val)
# Default Gradio theme using the `text_lg` text-size preset; passed to
# gr.Blocks(theme=...) below.
curr_theme = gr.themes.Default(text_size=gr.themes.sizes.text_lg)
with gr.Blocks(theme=curr_theme, js=dark_mode, css=open("custom.css", "r").read()) as app:
title = gr.HTML("""<h1 align="center">Point72 Case Study</h1>""")
with gr.Tabs() as pages:
with gr.Tab("Overview") as toc_page:
gr.Markdown("# My Point72 Case Study Results")
gr.Markdown("""
* Please follow the tabs sequentially left to right to get the full story of my work
* There will be many interactive parts where you will be able to test and view different parameters
* This app may also be built and ran locally
* This app is hosted and served from a cloud server VM Instance
* Any questions please email me: [email protected]
""")
with gr.Tab("Data Preprocessing") as data_preprocessing_page:
with gr.Tab("Data Loading") as dp_overview:
gr.HTML("<h1 style=\"text-align: center;\">Loading the Data</h1>")
gr.Markdown("## Goal: Load the Data as efficiently as possible")
gr.Markdown("""
* Using Pandas alone is **slow and inefficient**.
* With small datasets, pandas is great because the API is robust.
* With medium datasets, using a library like polars (a Rust based module with 10x pandas speed) is much faster.
* As data gets even larger, multi-processing languages like Spark are required.
* For this dataset, I use pandas for the weather data and polars for the 311 data. After the aggregation and merge, I revert back to pandas for API compatibility.
""")
with gr.Accordion("Code", open=False):
gr.Code(load_code, language="python")
with gr.Tab("Location Mapping") as dp_overview:
src_doc = html.escape(open("figures/map1.html","r").read())
iframe1 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
src_doc = html.escape(open("figures/map2.html","r").read())
iframe2 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
src_doc = html.escape(open("figures/bounded_map.html","r").read())
iframe3 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
src_doc = html.escape(open("figures/final_map.html","r").read())
iframe4 = f'<iframe width="500px" height="500px" srcdoc="{src_doc}" frameborder="0"></iframe>'
gr.HTML("<h1 style=\"text-align: center;\">Location Mapping for Both Datasets</h1>")
with gr.Row(elem_classes="map-legend"):
gr.Markdown("""
**Legend:**
* <span style=\"color: red\">Red:</span> Weather records
* <span style=\"color: #5989ff\">Blue:</span> 311 Service records
""", elem_classes="map-legend-text")
with gr.Row():
with gr.Column():
gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York State</h1>")
map1 = gr.HTML(iframe1, elem_classes="map")
with gr.Column():
gr.HTML("<h1 style=\"text-align: center; margin: 0px;\">Map of New York City</h1>")
map2 = gr.HTML(iframe2, elem_classes="map")
with gr.Row():
gr.Markdown("""
Juxtaposing these two maps and seeing the approximate distributions of data observations,
its easy to see the problem. The weather dataset encompasses a larger area than the 311 Service call dataset.
Once this problem was diagnosed the solution was simple. First you find the max coordinate (Lat, Long) bounds
from the 311 Service Dataset. Then, you just filter the weather dataset to only include points from within
these bounds. This was one of my initial discoveries when analyzing the dataset and crucial to ensure
congruity between the two. **Below you can see the bounding box I created and how the new weather data
observations fit in this bounding box.**
""")
with gr.Row():
with gr.Column():
map3 = gr.HTML(iframe3, elem_classes="map")
with gr.Column():
map4 = gr.HTML(iframe4, elem_classes="map")
with gr.Accordion("Code", open=False):
gr.Code(map_code, language="python")
with gr.Tab("Variable Pruning") as var_pruning:
gr.HTML("<h1 style=\"text-align: center;\">How I pruned the datasets</h1>")
gr.Markdown("## Goal: Remove as many useless features as possible")
gr.HTML("<h3 style=\"color: darkorange;\">Key Factors for Feature Removal</h3>")
gr.Markdown("""
* Percentage of missing data points
* Distribution Imbalance
* Irrelevance
* Number of distinct categories
* Another variable was chosen as replacement <br/><br/>
NOTE: Look in the appendix for visualizations of individual variables
""")
droped_var_df = pd.read_excel("data/drop_vars.xlsx")
gr.Dataframe(
droped_var_df,
wrap=True,
label="Dropped Variables & Justification (Weather on Bottom)"
)
with gr.Tab("Time Aggregation") as time_agg:
gr.HTML("<h1 style=\"text-align: center;\">Aggregate Data by Date</h1>")
gr.Markdown("## Goal: Aggregate data by Date")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: 311 Service data is not inherently formatted to provide Created Ticket Counts</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Data must be aggregated by day to find ticket counts</li>
<li>Covariate features need a special transformation</li>
<li>Final Aggregations Mapping</li>
<ul style="padding-inline-start: 40px;">
<li>Created Date ==> groupby.count ==> Target (Created ticket count)</li>
<li>Closed Date ==> Agg* ==> Number of closed tickets (Agg* explained in next tabs)</li>
<li>Agency ==> Agg* ==> Number of tickets by Agency (Agg* explained in next tabs)</li>
<li>Borough ==> Agg* ==> Number of tickets by Boroguh (Agg* explained in next tabs)</li>
<li>Descriptor ==> Agg* ==> Number of tickets by Descriptor Group/Category (Agg* explained in next tabs)</li>
</ul>
</ul>""")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Weather data is not aggregated by day</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>To merge with 311 Service data, both datasets must be aggregated</li>
<li>Additional transformations may be applied only after time aggregation</li>
<li>Aggregation function needs to be handled feature by feature</li>
<li>Final Aggregation Mapping</li>
<ul style="padding-inline-start: 40px;">
<li>MaxTemp, MaxSustainedWind ==> groupby.max ==> Variables have an inherent max feature</li>
<li>MinTemp ==> groupby.min ==> Variable has an inherent min feature</li>
<li>Rain, SnowIce ==> groupby.mean.round ==> Binary variables are first aggregated than rounded back to binary</li>
<li>All Other Variables ==> groupy.mean ==> Mean used by default as it is the least lossy pooling method</li>
</ul>
</ul>""")
with gr.Tab("Weather Data: Imputation") as wd_impute:
gr.HTML("<h1 style=\"text-align: center;\">Data Imputation</h1>")
gr.Markdown("## Goal: Impute missing values in Weather Data")
gr.HTML("<h3 style=\"color: darkorange;\">Issue: Weather data is incomplete, 49 days are missing in 2018</h3>")
gr.Markdown("#### Proposed Solution: Use a simple imputer to fill these missing days + 7 more days into the \"future\"")
gr.HTML("""
<ul style="font-size: 18px">
<li>Use a simple imputer rather than a robust imputation method to reduce model complexity</li>
<ul style="padding-inline-start: 40px;">
<li>Using a robust imputer = Conducting a multivariate forcast, Very complex & can be slow</li>
<li>Using a simple imputer = Low complexity, low latency</li>
</ul>
<li>Simple imputer applies an aggregate function using Day Of Year (1-366) as the interval</li>
<li>4 different Imputation Methods: Mean, Median, Min, Max</li>
<li>7 additional days are imputed so the weather data can be used as a future covariate in our model</li>
<li>Final Aggregation Mapping</li>
<ul style="padding-inline-start: 40px;">
<li>WindSpeed, MaxSustainedWind, Gust, SnowDepth => Use Mean => Noisy Variables, Non-Mean/Median methods are too biased, curve best fit with Mean</li>
<li>Rain => Use Max => Binary Variables with noise, min/mean/median imputes 0, which does not follow the trend</li>
<li>SnowIce => Use Min (impute 0) => Binary variables but mostly 0's, any other imputation is visually inaccurate</li>
<li>MeanTemp, MinTemp, MaxTemp, DewPoint, Percipitation => Use Min => Perhaps helping to remove non-stationarity (global warming), Winter is colder now than before, Curve best fits with min</li>
</ul>
</ul>""")
gr.Markdown("Use plots below to view the plots used to help justify above reasoning")
with gr.Accordion("Show Plots", open=False):
impute_data = gr.State(wd_full_local)
impute_choices = ["None"]
impute_choices.extend(impute_cols)
wd_impute_col = gr.Dropdown(
choices=impute_choices,
value="None",
label="Choose a Variable to plot all imputation methods"
)
wd_impute_plot = gr.Plot()
wd_impute_col.change(
plot_imputations,
[wd_impute_col, impute_data],
[wd_impute_plot]
)
with gr.Tab("311: Closed Ticket Counting") as ct_date:
gr.HTML("<h1 style=\"text-align: center;\">Closed Ticket Feature</h1>")
gr.Markdown("## The Closed Ticket Feature is built from the Closed Date column similarly to how Created Date was used to generate new 311 Call Volume")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Data Error, Typos, and/or Null valuess</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Number of Null Values: </li>
<li>Number of Closed Dates where Closed Date > Created Date: </li>
<ul style="padding-inline-start: 40px;">
<li>These values were most likely typos/data recording errors</li>
<li>For instance, some of these values dated to 1900</li>
</ul>
<li>SOLUTION: For every data error, impute with the mean difference (recompute Closed Date based off Created)</li>
<li>Mean is calculated as the mean time differential between all valid Closed & Created Dates</li>
<li>Mean Time Differential: 13 Days</li>
</ul>""")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: Data Leakage - Future into Past</h3>")
gr.HTML("""
<ul style="font-size: 18px">
<li>Most of the Closed Date values are 13 days ahead relative to Created Date</li>
<li>GroupBy Closed Date only will lead to some closed ticket counts leaking into future created dates</li>
<li>SOLUTION: GroupBy [Closed Date, Created Date] pairwise, filter so Created Date < Closed Date</li>
</ul>""")
with gr.Accordion("Code", open=False):
gr.Code(Closed_Ticket_Code, language="python")
with gr.Tab("311: Categorical Grouping") as cat_groups:
BERTopic = gr.State(BERTopic.load("models/BERTopic"))
gr.HTML("<h1 style=\"text-align: center;\">Categorical Features</h1>")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 1: Categorical Features have too many categories</h3>")
gr.Markdown("#### Create a mapping of categories into groups to reduce total number (Viewable at the bottom of the page)")
gr.HTML("""
<ul style="font-size: 18px">
<li>Borough:</li>
<ul style="padding-inline-start: 40px;">
<li>Only 9 Categories without grouping</li>
<li>Four Categories are either typos or just null => Group all into OTHER</li>
</ul>
<li>Agency:</li>
<ul style="padding-inline-start: 40px;">
<li>30 Agencies in total are listed</li>
<li>Manual Research to group each Agency by Category of what they typically do</li>
<li>30 Agencies down to 7 Agency Groupings, based on frequency and research</li>
</ul>
<li>Complaint Type: Removed because analysis showed complaints were too related to the agency</li>
<ul style="padding-inline-start: 40px;">
<li>299 unique pairs out of 271 unique complaints => only ~10% difference in distribution</li>
</ul>
<li>Descriptor: Over 1000+ unique categories. Only way to realistically group is to use NLP</li>
<ul style="padding-inline-start: 40px;">
<li>Pretrained a BERTopic model to extract topics from the text</li>
<li>BERTopic uses TF-IDF & Transformers to extract topics from text</li>
<li>BERTopic reduced 1000 categories into 8 groups</li>
</ul>
</ul>""")
gr.HTML("<h3 style=\"color: darkorange;\">Issue 2: How do we aggregate by day these features when there are multiple repeated categories per day</h3>")
gr.Markdown("#### One Hot Encode and Sum per category")
gr.HTML("""
<ul style="font-size: 18px">
<li>Step 1: One hot encode all the features before aggregation</li>
<li>Step 2: GroupBy date and Sum for each encoding</li>
<ul style="padding-inline-start: 40px;">
<li>Example: A categorical group with 4 categories</li>
<li>One Sum column per category representing the frequency of that category per day</li>
</ul>
<li>Main Downside: Highly correlated with Created Ticket data; aggregation method was essentially the same</li>
<ul style="padding-inline-start: 40px;">
<li>Summing across the four feature categories in the example above would just equal the ticket count</li>
</ul>
<li>Solution: Leave some categories out of final vector to reduce bias (Shown in feature engineering stage)</li>
</ul>""")
with gr.Accordion("View Feature Groups", open=False):
with gr.Accordion("Borough", open=False):
gr.JSON(json.loads(open("code/Borough.json", "r").read()))
with gr.Accordion("Agency", open=False):
gr.JSON(open("code/Agency.json", "r").read())
with gr.Accordion("Descriptor", open=False):
gr.Dataframe(topic_model.get_topic_info().loc[:, ["Count", "Name", "Representation"]])
gr.Plot(topic_model.visualize_barchart(list(range(-1,6,1))))
with gr.Tab("All Code") as code_preprocess:
gr.Markdown("# View Full Code for building Weather Data")
with gr.Accordion(open=False):
gr.Code(open("code/build_weather.py", "r").read())
gr.Markdown("# View Full Code for building 311 Service Data")
with gr.Accordion(open=False):
gr.Code(open("code/build_service.py", "r").read())
with gr.Tab("Exploratory Data Analysis", id="eda_page") as eda_page:
bivar_data = gr.State(data_merged_eda)
with gr.Tab("Overview", id="eda_overview") as eda_overview:
gr.Markdown("# The EDA Section is intended to be a set of interactive visualizations")
gr.Markdown("The tabs are interactive plots and tables that were used to generate the key insights below.")
# Headline followed by a nested HTML bullet list summarising the EDA findings.
gr.HTML('<h3 style="color: darkorange;">Key Insights</h3>')
gr.HTML("""
<ul style="font-size: 18px">
<li>Missing Values:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Gust if used may need interpolation to fill missing values</li>
</ul>
<li>Stationarity</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>
<ul style="padding-inline-start: 60px; font-size: 18px;">
<li>Trends are clear for some like Temperature and DewPoint</li>
<li>Possible cause of constant non-stationarity are factors such as global warming</li>
</ul>
<li>311 Calls may exhibit some forms of weekly non-stationarity</li>
<ul style="padding-inline-start: 60px; font-size: 18px;">
<li>Potentially weekly and monthly non-stationarity</li>
<li>Affected by Holidays and Weekends</li>
<li>More robust tests needed</li>
</ul>
<li>Action Item: Test for stationarity and remove</li>
</ul>
<li>Bivariate Interactions:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>311 Calls have stronger relationships with certain Agency, Borough and Descriptor categories</li>
<li>311 calls exhibit weak overall linear relationships with weather</li>
<ul style="padding-inline-start: 60px; font-size: 18px;">
<li>Monthly and Seasonal relationship is strongest in winter months</li>
<li>Month Of January: strongest linear relationship between MinTemp, DewPoint</li>
</ul>
</ul>
<li>Seasonality:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Weather variables exhibit a strong Yearly and Seasonal seasonality</li>
<li>311 Service Variables exhibit Weekly Seasonality</li>
<li>311 Variables affected strongly by holidays and weekends (less 311 calls on weekends and holidays)</li>
</ul>
<li>Correlation:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Heavy Collinearity among weather variables (especially Min, Mean, MaxTemp)</li>
<li>Varying degrees of correlation among 311 covariates and 311 volume</li>
</ul>
<li>Lags & Autocorrelation:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>311 Service Calls have highest correlation with 7,14,21 weekly lags</li>
<li>6,8 day lag intervals second strongest relationship. 8 day exhibits some negative correlation</li>
<li>1 day lag exhibits similar correlation with 6,7 day lags</li>
</ul>
</ul>""")
with gr.Tab("Univariate", id="eda_univar") as eda_univar:
    # Each sub-tab pairs a variable dropdown with a plot; changing the
    # dropdown calls plot_timeseries(<dataframe state>, <variable>, <name state>).
    with gr.Tab("Weather Data") as eda_uni_weather:
        eda_univar_weatherdf = gr.State(weather_full_df)
        gr.Markdown("# Use the Interactive plot below")
        eda_uni_weather_name = gr.State("Weather")
        # The leading "" entry is the dropdown's initial (nothing selected) value.
        weather_vars = [
            "", 'MeanTemp', 'DewPoint', 'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
            'MinTemp', 'MaxTemp', 'MaxSustainedWind',
        ]
        select_weather_var = gr.Dropdown(
            choices=weather_vars,
            value="",
            label="Select a Variable to View",
        )
        weather_uniplot = gr.Plot()
        select_weather_var.change(
            fn=plot_timeseries,
            inputs=[eda_univar_weatherdf, select_weather_var, eda_uni_weather_name],
            outputs=[weather_uniplot],
        )

    # Bound to its own name so it no longer overwrites the weather tab handle.
    with gr.Tab("311 Service Data") as eda_uni_service:
        eda_univar_servicedf = gr.State(data_merged_eda)
        gr.Markdown("# Use the Interactive plot below")
        gr.Markdown("**NOTE: Target is the count of 311 service records**")
        # NOTE(review): the name passed for the service data is still "Weather";
        # looks like a copy-paste leftover -- confirm against plot_timeseries.
        eda_uni_service_name = gr.State("Weather")
        service_vars = [
            "", 'Target', 'num_closed_tickets',
            # Agency Group Counts
            'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
            'AG_Parks', 'AG_Security', 'AG_Transportation',
            'AG_Other',
            # Borough Counts
            'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
            'Borough_QUEENS', 'Borough_STATEN ISLAND',
            'Borough_OTHER',
            # Descriptor Group Counts
            'DG_damaged_sign_sidewalk_missing',
            'DG_english_emergency_spanish_chinese',
            'DG_exemption_commercial_tax_business',
            'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
            'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
            'DG_water_basin_litter_missed',
        ]
        select_service_var = gr.Dropdown(
            choices=service_vars,
            value="",
            label="Select a Variable to View",
        )
        service_uniplot = gr.Plot()
        select_service_var.change(
            fn=plot_timeseries,
            inputs=[eda_univar_servicedf, select_service_var, eda_uni_service_name],
            outputs=[service_uniplot],
        )
with gr.Tab("Bivariate", id="eda_bivar") as eda_bivar:
    gr.Markdown("# Use the Interactive plot below")
    gr.Markdown("Use this tab to view relationships between the Target variable (number of tickets created daily) and a Covariate")
    with gr.Column():
        with gr.Row() as bivar_params:
            bivar_dist_target = gr.Dropdown(
                choices=["Target"],
                value="Target",
                label="Target Variable (One option)",
            )
            # Covariate choices: the categorical/service names plus every
            # weather variable, alphabetised. Empty names (the "" placeholder
            # in weather_vars) are filtered out explicitly instead of relying
            # on the placeholder sorting first and slicing it off.
            all_bivars = sorted(
                name
                for name in ['num_closed_tickets', "Agency", "Borough", "Descriptor", *weather_vars]
                if name
            )
            bivar_dist_cov = gr.Dropdown(
                choices=all_bivars,
                value="MeanTemp",
                label="Select Covariate",
            )
            bivar_trendline = gr.Dropdown(
                choices=[True, False],
                value=True,
                label="Graph with OLS Trendline",
            )
        with gr.Accordion("Add Seasonality", open=False):
            bivar_subset = gr.Dropdown(
                choices=["None", "Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday"],
                value="None",
                label="Seasonality Options (Disabled for Agency, Borough and Descriptor)",
            )
        bivar_submit = gr.Button("Run")
    bivar_plot = gr.Plot()
    # "Run" redraws the plot from the current dropdown selections.
    bivar_submit.click(
        plot_bivariate,
        [bivar_data, bivar_dist_cov, bivar_dist_target, bivar_subset, bivar_trendline],
        bivar_plot,
    )
with gr.Tab("Seasonality") as bivar_season:
    gr.Markdown("## Exploring the effect of Seasonality")
    with gr.Row() as bivar_season_params:
        # Variable to plot: the target or one of the weather columns.
        bivar_season_var = gr.Dropdown(
            choices=[
                "Target", 'MeanTemp', 'DewPoint',
                'Percipitation', 'WindSpeed', 'Gust', 'SnowDepth',
                'MinTemp', 'MaxTemp', 'MaxSustainedWind',
            ],
            value="Target",
            label="Variable",
        )
        # Grouping used to break the variable down.
        bivar_season_cov = gr.Dropdown(
            choices=["Year", "Season", "Month", "Day Of Week", "Weekend", "Holiday", "Rain", "SnowIce"],
            value="Year",
            label="Seasonality",
        )
        with gr.Column():
            season_boxplot = gr.Checkbox(value=True, label="Show Boxplot")
            season_outlier = gr.Checkbox(value=False, label="Show Outliers")
    bivar_season_btn = gr.Button("Run")
    bivar_season_plot = gr.Plot()
    # Note the argument order: the grouping is passed before the variable.
    bivar_season_btn.click(
        plot_seasonality,
        [bivar_data, bivar_season_cov, bivar_season_var, season_boxplot, season_outlier],
        [bivar_season_plot],
    )
with gr.Tab("Correlation") as corr:
    # The two static correlation plots are computed once, while the UI is built.
    with gr.Tab("Weather Correlations") as corr_weather:
        gr.Plot(plot_all_correlations(data_merged_eda, "weather", method="pearson"))
    with gr.Tab("311 Service Correlations") as corr_service:
        gr.Plot(plot_all_correlations(data_merged_eda, "service", method="pearson"))

    with gr.Tab("Lag Correlations") as corr_dynamic:
        gr.Markdown("## Use this to dynamically view correlations based on Lag")
        gr.Markdown("By Default, we will analyze lags of [0,1,2,3,4,5,6,7,8,13,14,15,21] days for chosen variable")
        gr.Markdown("Scroll Down For AutoCorrelation Graphs")
        with gr.Row():
            # Selectable variables, kept under their own name so that
            # `corr_vars` only ever refers to the dropdown component.
            lag_corr_choices = [
                "None", 'Target', 'num_closed_tickets',
                # Weather Variables
                'MeanTemp', 'DewPoint', 'Percipitation',
                'WindSpeed', 'Gust', 'SnowDepth',
                'MinTemp', 'MaxTemp', 'MaxSustainedWind',
                # Agency Group Counts
                'AG_Buildings', 'AG_Environment & Sanitation', 'AG_Health',
                'AG_Parks', 'AG_Security', 'AG_Transportation',
                'AG_Other',
                # Borough Counts
                'Borough_BRONX', 'Borough_BROOKLYN', 'Borough_MANHATTAN',
                'Borough_QUEENS', 'Borough_STATEN ISLAND',
                'Borough_OTHER',
                # Descriptor Group Counts
                'DG_damaged_sign_sidewalk_missing',
                'DG_english_emergency_spanish_chinese',
                'DG_exemption_commercial_tax_business',
                'DG_license_complaint_illegal_violation', 'DG_noise_animal_truck_dead',
                'DG_odor_food_air_smoke', 'DG_order_property_inspection_condition',
                'DG_water_basin_litter_missed',
            ]
            corr_vars = gr.Dropdown(
                choices=lag_corr_choices,
                value="Target",
                label="Variable",
            )
            corr_btn = gr.Button("Run")
        corr_plot = gr.Plot()
        autocorr_plot = gr.Plot()
        # One click drives two independent handlers, one per plot.
        corr_btn.click(plot_correlations, [bivar_data, corr_vars], [corr_plot])
        corr_btn.click(plot_autocorr, [bivar_data, corr_vars], [autocorr_plot])
with gr.Tab("Feature Engineering") as feature_engineer_page:
    # ---- Feature Selection: rationale text plus the final variable list ----
    with gr.Tab("Feature Selection") as feature_select:
        gr.HTML('<h1 style="text-align: center;">Select Features Based on EDA</h1>')
        gr.Markdown("### Below is the logic used in our model feature selection")
        gr.HTML("""
<ul style="font-size: 18px">
<li>Weather Covariates</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>Weather variables exhibit various levels of non-stationarity (mostly based on trend but some constant)</li>
<li>MeanTemp, MaxTemp: High collinearity with MinTemp. MinTemp has highest correlation of 3 => REMOVE</li>
<ul style="padding-inline-start: 50px; font-size: 18px;">
<li>Possible Reason: High temps, people stay indoors. A/C doesn't break nowadays. Lower Temps lead to building/tech failure more often</li>
</ul>
<li>Percipitation: Bivariate plot shows weak relationship, outliers no effect on 311 => REMOVE</li>
<li>SnowDepth: High number missing values, low correlation => REMOVE</li>
<li>Rain, SnowIce: Binary, plots (look in Seasonality Tab) show weak relationship, SnowIce heavily imbalanced (99% 0's) => REMOVE</li>
</ul>
<li>311 Service Covariates:</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>LOO (Leave One - or many - Out) Encoding:</li>
<ul style="padding-inline-start: 50px; font-size: 18px;">
<li>Remove weakest features from our categorical covariates</li>
<li>Reduces bias and removes multicollinearity inherent to One-Hot Encoding</li>
<li>Candidates For Removal:</li>
<ul style="padding-inline-start: 70px; font-size: 18px;">
<li>AG_Health, AG_Other: Lowest Correlation, lowest counts => REMOVE</li>
<li>AG_Parks: Lowest Correlation, but low multi-collinearity => KEEP</li>
<li>Borough_OTHER: Weakest Correlation, lowest count => REMOVE</li>
<li>DG_english_emergency, DG_exemption_commercial: Weakest Correlation, lowest counts => REMOVE</li>
<li>DG_odor_food_air_smoke: Lowest Count, but high correlation => KEEP</li>
</ul>
</ul>
</ul>
</ul>""")
        with gr.Accordion("Show Final Variable List", open=False):
            # Context manager closes the file handle once the JSON is parsed.
            with open("code/all_vars.json", "r") as vars_file:
                gr.JSON(json.load(vars_file))

    # ---- Feature Preprocessing: imputation and stationarity write-up ----
    with gr.Tab("Feature Preprocessing") as feature_prep:
        data_feature_prep = gr.State(data_preprocess)
        gr.HTML('<h1 style="text-align: center;">Preprocess Features</h1>')
        gr.HTML('<h3 style="color: darkorange;">Issue 1: Missing Values</h3>')
        gr.HTML("""
<ul style="font-size: 18px">
<li>Only One value has missing values to impute: Gust</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>Various interpolation methods were tested</li>
<li>Methods like Spline and Polynomial over-estimated some values, breaking inherent data ranges</li>
<li>Turns out Simple Linear interpolation was best</li>
</ul>
<li>SOLUTION: Interpolate Gust with Linear method</li>
</ul>""")
        with gr.Accordion("Show Interpolation Plots", open=False):
            gr.Plot(plot_interpolation(data_preprocess))
        gr.HTML('<h3 style="color: darkorange;">Issue 2: Remove Non-Stationarity</h3>')
        gr.HTML("""
<ul style="font-size: 18px">
<li>Variables that are non-stationary change over time, they have a trend</li>
<li>Ideal to transform non-stationarity variables for modeling</li>
<li>Ignore Categorical Variables (simply to keep model complexity low)</li>
<li>Numerical Variables were tested for Non-Stationarity using two methods: ADF and KPSS</li>
<ul style="padding-inline-start: 30px; font-size: 18px;">
<li>Using ADF and KPSS together can reveal what kind of trend exists in the data</li>
<li>Only 1 Case Met: Pass KPSS, Fail ADF = Trend Stationary (most likely by season)</li>
</ul>
<li>Only Two Variables failed the tests: DewPoint & MinTemp</li>
<li>SOLUTION: Use Differencing (7d lag) + Log for MinTemp and Differencing (7d lag) for DewPoint (Log caused many NaNs)</li>
</ul>""")
        with gr.Accordion("View Results Below", open=False):
            # Side-by-side tables: "<name>_old" column vs. the transformed column.
            gr.Markdown("### MinTemp (Log) Tests Before and After Transformation")
            with gr.Row():
                with gr.Column():
                    gr.Dataframe(test_stationary(data_preprocess, "MinTemp_old"), label="MinTemp No Augments")
                with gr.Column():
                    gr.Dataframe(test_stationary(data_preprocess, "MinTemp"), label="Log + 7 Day Lag Differencing")
            gr.Markdown("### DewPoint Tests Before and After Transformation")
            with gr.Row():
                with gr.Column():
                    gr.Dataframe(test_stationary(data_preprocess, "DewPoint_old"), label="DewPoint No Augments")
                with gr.Column():
                    gr.Dataframe(test_stationary(data_preprocess, "DewPoint"), label="7 Day Lag Differencing")

    # ---- Feature Engineering: one sub-tab per feature family ----
    with gr.Tab("Feature Engineering") as feature_eng:
        with gr.Tab("Past Covariates") as fe_past:
            gr.HTML('<h1 style="text-align: center;">Past Covariate Features</h1>')
            gr.Markdown("""
* Past Covariates are datapoints that are implied to be only related to past information
* For Instance, using past sales of product B to predict futures sales of product A
* There are two ways to use past covariates
* *Option 1:* Build a multi-variate forecast to predict these variables simultaneously
* *Option 2:* Use a sliding window and lags to provide past data (especially for multi-step forecasts)
""")
            gr.Markdown("**I will use Option 2 to avoid building a very complex multi-variate model**")
            gr.HTML('<h3 style="color: darkorange;">Issue 1: Leaking Future Data into the past</h3>')
            gr.Markdown("""
* By using lags, I can shift my data in a way to avoid leaking future data into the past
* For predicting 7 days into the future, I must lag my data by at least 7 days
* Use a rolling window that will reset over time
""")
            gr.HTML('<h3 style="color: darkorange;">Issue 2: Curse of Dimensionality</h3>')
            gr.Markdown("""
* Possible to use many variations of lags, rolling and differences to generate many features
* Too many features leads to the curse of dimensionality, i.e. Overfitting
* Thus, I keep my Feature Set as simple as possible
""")
            gr.Markdown("""
### Feature Set
* Lags: 7D, 14D, 21D
* Rolling (Shifted 7 Days forward): Mean of 14D (14 because mean(Created - Closed Date) = 13 days)
* Differencing (7D difference = 7D lag - 14D lag): 7D
""")
            with gr.Accordion("Open to view implementation code", open=False):
                with open("code/past_features.py", "r") as source_file:
                    gr.Code(source_file.read())

        # Distinct handle names so the sibling tabs stop overwriting `fe_past`.
        with gr.Tab("Future Covariates") as fe_future:
            gr.HTML('<h1 style="text-align: center;">Future Covariate Features</h1>')
            gr.Markdown("""
* Future Covariates are data that I have about the future
* For Instance, I can use the projected revenue of Company A to predict daily sales
* For Future Covariates, I do not need to shift variables. I will provide a shift up to 2 days.
* I apply a rolling and expanding window as more features
* Also, I use mean and min to follow the logic learned in EDA. Minimum temp values seem to be more impactful on 311 volume
""")
            gr.HTML('<h3 style="color: darkorange;">Issue 1: Curse of Dimensionality</h3>')
            gr.Markdown("""
* Similar to the Past Covariates, I keep my features as simple as possible with as little as possible
* The more features, the more we may overfit
""")
            gr.Markdown("""
### Feature Set
* Lags: 0D, 1D, 2D
* Rolling: Mean & Min of last 14D
* Expanding Window: Max, Min (min-length of 14)
* Differencing already performed to remove trends
""")
            with gr.Accordion("Open to view implementation code", open=False):
                with open("code/future_features.py", "r") as source_file:
                    gr.Code(source_file.read())

        with gr.Tab("Target Variable") as fe_target:
            gr.HTML('<h1 style="text-align: center;">311 Service Calls Features</h1>')
            gr.Markdown("""
* For providing feature transformations of our Target, we can follow a similar process as above
* Main Difference: Lags of < prediction window need to be recomputed at each iteration
* So, for predicting at time (t+1) we need the predicted value at time (t)
* For a recursive prediction model, this means the model cannot make batch predictions without iterating
""")
            gr.HTML('<h3 style="color: darkorange;">Issue 1: More variables increase complexity for prediction</h3>')
            gr.Markdown("""
* The more features, the more overfitting & more computation
* As I will use a recursive model, these values must be recomputed at each step t+1
* In favor of a less complex model, I will choose as minimal features as possible (excluding rolling features as its prone to error with recalculation)
""")
            gr.HTML('<h3 style="color: darkorange;">Issue 2: Leaking Future Data into the past</h3>')
            gr.Markdown("""
* Must be careful about how these features are computed
* For instance, for rolling mean, I would shift the data up by 1 lag first then compute the rolling sum
* For differencing, a 7D lag difference is really the 1D - 8D lag. (For t=8, 7D diff = t7-t1 not t8-t2)
""")
            gr.Markdown("""
### Feature Set
* Lags: 1D, 6D, 7D, 8D, 14D, 21D (based on highest correlations and weekly seasonality)
* Differencing: 7D, 14D
""")
            with gr.Accordion("Open to view implementation code", open=False):
                with open("code/target_features.py", "r") as source_file:
                    gr.Code(source_file.read())
with gr.Tab("Forecast Model") as model_select_train_page:
    # Each sub-tab gets its own handle name; previously all four were bound
    # to `model_data_split`, so only the last one stayed reachable.
    with gr.Tab("Splitting the data") as model_data_split:
        gr.HTML('<h1 style="text-align: center;">Splitting Time-Series Data</h1>')
        gr.HTML("""
<ul style="font-size: 18px">
<li>Splitting Time-Series Data is different than splitting other data</li>
<li>Rather than splitting on random samples, you split the data by time with order consistent</li>
<li>I took a 75% splitting approach where I split my data at the date that sits on the 75% of data length</li>
</ul>""")
        gr.Markdown("#### As an example, I provide a graph showing exactly how I split my data")
        gr.Plot(plot_train_split())

    with gr.Tab("Model Selection") as model_selection:
        gr.HTML('<h1 style="text-align: center;">Choosing the Right Model</h1>')
        gr.Markdown("### Types of Forecast Models for Multi-Step Prediction")
        gr.HTML("""
<ul style="font-size: 18px">
<li>Parallel Models: Train a model for each prediction (one for 1 day ahead, another for 2, etc.)</li>
<li>Recursive Models: Model makes a forecast, fills any values it needs for the next prediction, predicts again</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>One of the assumptions was to build a model that was reasonable for production</li>
<li>Parallel models are hard to maintain as the steps of prediction increase</li>
</ul>
<li>Decision: Recursive Model</li>
</ul>""")
        gr.Markdown("### My Model Choice: XGBoost")
        gr.HTML("""
<ul style="font-size: 18px">
<li>Reasons for choosing:</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Industry standard for regression</li>
<li>Lightweight and relatively fast</li>
<li>Many parameters to tune, such as tree depth and regularization</li>
<li>Scale invariant - Data does not have to be scaled</li>
<li>Allows NaN values and categorical features without encodings (unused in my implementation)</li>
<li>Provides key explainability in its feature importance metrics</li>
</ul>
<li>Decision: Use XGBoost</li>
</ul>""")

    with gr.Tab("Model Training") as model_training:
        gr.HTML('<h1 style="text-align: center;">Training the Model</h1>')
        gr.HTML('<h3 style="color: darkorange;">Issue 1: Overfitting</h3>')
        gr.HTML("""
<ul style="font-size: 18px">
<li>Main Cause: High number of variables and XGBoost's tendency to overfit without tuning</li>
<li>While training, effort was made to watch the validation and training set's relative performance</li>
<li>Steps Taken to avoid Overfitting</li>
<ul style="padding-inline-start: 40px; font-size: 18px;">
<li>Low Learning Rate</li>
<li>Low Tree Depth</li>
<li>Keeping Val score relatively close to Training score</li>
<li>Increased l2-lambda parameter, boosting regularization</li>
<li>Many trials to get best set of parameters</li>
<li>Implementing Early Stopping</li>
</ul>
</ul>""")
        gr.HTML('<h3 style="color: darkorange;">Issue 2: Choosing a Metric</h3>')
        gr.HTML("""
<ul style="font-size: 18px">
<li>Three metrics I considered: MAPE, MAE and MSE</li>
<li>MAPE seemed to show the most consistent and visually accurate results</li>
<li>Decision: MAPE</li>
<li>Justification: 311 Service volume is quite noisy and MAPE better estimates fit to a very noisy curve than the others</li>
</ul>""")

    with gr.Tab("Model Prediction") as model_prediction:
        gr.HTML('<h1 style="text-align: center;">Recursive Model Prediction</h1>')
        gr.Markdown("""
* Below is the code I wrote to implement the Recursive prediction explained in previous tabs
* Predictions are made one step at a time, where the prediction t depends on prediction t-1
* To view the final predictions made by the model see below
""")
        # Context manager closes the source file after it has been read.
        with open("code/recurse_predict.py", "r") as source_file:
            gr.Code(source_file.read())
        with gr.Accordion("View 7 Day Model Forecast", open=False):
            gr.Plot(plot_final_predictions())
with gr.Tab("Model Evaluation") as model_eval_page:
    gr.HTML(value='<h1 style="text-align: center;">Forecast Results</h1>')
    # Narrative summary, one paragraph per Markdown component.
    gr.Markdown(value="Overall, the model seemed to have performed pretty well. The MAPE is also <10% for both Validation and Training sets.")
    gr.Markdown(value="The model did suffer from a low validation R2, but this was difficult to resolve without compromising overall performance of the model.")
    gr.Markdown(value="The predictions seem to visually pass most backtests, which can be viewed in the graph below.")
    # Collapsed by default: the train/validation scores as a JSON view.
    with gr.Accordion(label="Model Prediction Scores", open=False):
        gr.JSON(
            value={
                "Train R2": r2_train,
                "Train MAPE": mape_train,
                "Validation R2": r2_val,
                "Validation MAPE": mape_val,
            }
        )
    gr.Image(value="figures/model_performance.png", show_download_button=False)
with gr.Tab("Feature Importance") as model_eval_page:
    gr.HTML(value='<h1 style="text-align: center;">Feature Importance</h1>')
    gr.Markdown(value="""
* Below you can view the feature importance metrics from the XGBoost model
* It seems there is significant impact of the weather variables on 311 Service Call Volume
* Interestingly, it seems some categories were more impactful than others as well
""")
    # The figure is produced once, while the UI is being built.
    gr.Plot(value=plot_model_feature_importance())
with gr.Tab("Future Work & Limitations") as future_limitations_page:
    # Two static sections: a heading followed by its bullet list.
    gr.Markdown(value="# Future Work")
    gr.Markdown(value="""
* **Multi-Variate Time Series Forecasting** rather than imputing values naively
* Testing more kinds of models such as LightGBM
* Robustly testing parameters of current model using GridSearchCV
* Comparing performance of my forecast model to others
* More Data! Having more 311 Call data may help find other indicators
""")
    gr.Markdown(value="# Future Deployments")
    gr.Markdown(value="""
* Containerize the model and load onto an API for ingestion
* Containerize data preprocessing and load into a Spark Cluster
* Create triggers and view tables to verify data preprocessing
* Create functions to monitor model performance
""")
with gr.Tab("Appendix") as future_limitations_page:
    # Both sub-tabs call run_report(<dataset state>, <variable>, <category>)
    # on "Run" and write the result into an HTML component.
    with gr.Tab("Weather Data Analysis") as dp_weather:
        dp_weather_state = gr.State("weather")
        with gr.Column():
            with gr.Row():
                dp_weather_category = gr.Dropdown(
                    label="Time Range",
                    choices=["2011-2018", "2016-2018"],
                    value="2011-2018",
                )
                dp_weather_var = gr.Dropdown(
                    label="Variable",
                    choices=[
                        "MeanTemp", "MinTemp", "MaxTemp", "DewPoint",
                        "Percipitation", "WindSpeed", "MaxSustainedWind",
                        "Gust", "Rain", "SnowDepth", "SnowIce",
                    ],
                    value="MeanTemp",
                )
            dp_weather_btn = gr.Button("Run")
        dp_weather_report = gr.HTML(value=iframe_dp_weather)
        dp_weather_btn.click(
            fn=run_report,
            inputs=[dp_weather_state, dp_weather_var, dp_weather_category],
            outputs=dp_weather_report,
        )

    with gr.Tab("Service Data Analysis") as dp_service:
        dp_service_state = gr.State("service")
        # The category is a fixed state value here, not a user choice.
        dp_service_category = gr.State("full")
        with gr.Column():
            dp_service_var = gr.Dropdown(
                label="Select Variable and Run",
                choices=[
                    "Created Date", "Closed Date", "Agency", "Agency Name",
                    "Complaint Type", "Descriptor", "Location Type", "Landmark",
                    "Facility Type", "Status", "Community Board", "Borough",
                    "Open Data Channel Type", "Park Facility Name", "Park Borough",
                    "Vehicle Type", "Taxi Company Borough", "Taxi Pick Up Location",
                    "Bridge Highway Name", "Bridge Highway Direction", "Road ramp",
                    "Bridge Highway Segment",
                ],
                value="Created Date",
            )
            dp_service_btn = gr.Button("Run")
        dp_service_report = gr.HTML(value=iframe_dp_service)
        dp_service_btn.click(
            fn=run_report,
            inputs=[dp_service_state, dp_service_var, dp_service_category],
            outputs=dp_service_report,
        )
def main():
    """Launch the module-level ``app`` with ``share=False`` and return it."""
    app.launch(share=False)
    return app
# Only start the app when this file is executed as a script.
if __name__ == "__main__":
    main()