def build_weather_data(filename):
    """Build the daily weather table, including imputed "forecast" rows.

    Steps:
      1. Read the CSV at ``filename`` and combine its ``Year``/``Month``/``Day``
         columns into one ``Datetime`` column (the 311 data uses datetime).
      2. Keep only rows whose ``Latitude``/``Longitude`` fall inside the
         bounding box pre-recorded from the service data, then drop the
         station-identifying columns.
      3. Aggregate per ``Datetime``: mean, min, max, or rounded mean
         depending on the column.
      4. Add temporal features and index the result by ``Datetime``.
      5. Impute ``49 + 7`` time steps with ``impute_missing_weather`` and
         append them below the aggregated rows.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the raw
            weather CSV.

    Returns:
        A DataFrame made of the aggregated rows followed by the imputed rows
        (outer join on columns).
    """
    weather_data = pd.read_csv(filename)

    # Collapse Year / Month / Day into a single column so that this data can
    # be matched against the datetime-based 311 data.
    weather_data["Datetime"] = (
        weather_data["Year"].astype("str")
        + "-"
        + weather_data["Month"].astype("str")
        + "-"
        + weather_data["Day"].astype("str")
    )
    weather_data = create_datetime(weather_data, "Datetime", format="%Y-%m-%d")

    # --- LOCALIZE ---------------------------------------------------------
    # Pre-recorded min/max coordinates from the service data, hard-coded so
    # they do not have to be recomputed.
    lat_min = 40.49804421521046
    lat_max = 40.91294056699566
    long_min = -74.25521082506387
    long_max = -73.70038354802529

    # Inclusive bounding-box test on every station row.
    in_bounds = (
        (weather_data["Latitude"] >= lat_min)
        & (weather_data["Latitude"] <= lat_max)
        & (weather_data["Longitude"] >= long_min)
        & (weather_data["Longitude"] <= long_max)
    )

    # Station identity and position are not needed once rows are filtered.
    drop_cols = [
        "USAF",
        "WBAN",
        "StationName",
        "State",
        "Latitude",
        "Longitude",
    ]
    wd_localized = weather_data.loc[in_bounds].drop(columns=drop_cols)

    # --- AGGREGATE --------------------------------------------------------
    # Columns grouped by the aggregation applied to them.
    mean_cols = [
        "MeanTemp",
        "DewPoint",
        "Percipitation",
        "WindSpeed",
        "Gust",
        "SnowDepth",
    ]
    min_cols = ["MinTemp"]
    max_cols = ["MaxTemp", "MaxSustainedWind"]
    round_cols = ["Rain", "SnowIce"]

    # Build the groupby once and reuse it for every aggregation.
    daily = wd_localized.groupby("Datetime")
    mean_df = daily[mean_cols].mean()
    min_df = daily[min_cols].min()
    max_df = daily[max_cols].max()
    # Rain / SnowIce are averaged, rounded, and stored as int8.
    round_df = daily[round_cols].mean().round().astype(np.int8)

    wd_full = pd.concat([mean_df, min_df, max_df, round_df], axis=1)

    # Add seasonal features.
    wd_full = build_temporal_features(wd_full, "Datetime")
    wd_full["Season"] = wd_full["Season"].astype("category")
    wd_full = wd_full.set_index("Datetime")

    # --- IMPUTE -----------------------------------------------------------
    # Impute the 49 missing days plus the 7 days after 12/31/2018; the
    # result acts as our "weather forecast".
    time_steps = 49 + 7

    # Column order used for the imputed frame.
    impute_cols = [
        "MeanTemp",
        "MinTemp",
        "MaxTemp",
        "DewPoint",
        "Percipitation",
        "WindSpeed",
        "MaxSustainedWind",
        "Gust",
        "Rain",
        "SnowDepth",
        "SnowIce",
    ]

    # Columns handed to each imputation strategy.
    mean_vars = ["WindSpeed", "MaxSustainedWind", "Gust", "SnowDepth"]
    min_vars = [
        "SnowIce",
        "MeanTemp",
        "MinTemp",
        "MaxTemp",
        "DewPoint",
        "Percipitation",
    ]
    max_vars = ["Rain"]

    preds_mean = impute_missing_weather(
        wd_full, strategy="mean", time_steps=time_steps, impute_cols=mean_vars
    )
    preds_min = impute_missing_weather(
        wd_full, strategy="min", time_steps=time_steps, impute_cols=min_vars
    )
    preds_max = impute_missing_weather(
        wd_full, strategy="max", time_steps=time_steps, impute_cols=max_vars
    )

    all_preds = pd.concat([preds_mean, preds_min, preds_max], axis=1)
    all_preds = build_temporal_features(all_preds.loc[:, impute_cols], "Datetime")
    all_preds = all_preds.set_index("Datetime")

    # NOTE(review): an earlier revision computed an unused
    # `wd_full.loc[wd_full["Year"] >= 2016]` subset here. The full history is
    # what has always been returned, so that behaviour is kept; confirm
    # whether the 2016+ subset was actually intended.
    wd_df = pd.concat([wd_full, all_preds], axis=0, join="outer")

    return wd_df