"""Build ``map_data.csv``: dataset counts per organisation with coordinates.

Pipeline: count the records per ``ORG`` in the raw catalogue JSON, attach the
``Geometry`` of the matching city from the manually enriched city list, split
that geometry into numeric ``lat``/``lon`` columns, and write the result as CSV.
"""
import logging
import os

import pandas as pd

# define logger
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("process_data.log"),
        logging.StreamHandler(),
    ],
)

# Change these paths if you want to generate map_data.csv separately from the app.
DATA_RAW = os.path.join("2024-08-21_musterdatenkatalog.json")
CITIES_ENRICHED = os.path.join("data", "cities_enriched_manually.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")


def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how many records each organisation has in the raw JSON file.

    Args:
        path: Location of a JSON file readable by ``pd.read_json`` that
            contains an ``ORG`` column.

    Returns:
        A frame with the columns ``ORG`` and ``Count``, one row per distinct
        organisation, ordered by descending count.
    """
    df = pd.read_json(path)
    counts = df["ORG"].value_counts().reset_index()
    # Rename by position: the default column labels produced by
    # value_counts().reset_index() differ between pandas versions.
    counts.columns = ["ORG", "Count"]
    return counts


def merge_geoemtry(data_in: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach the city columns (including ``Geometry``) to every organisation.

    Rows are matched on ``ORG == Kommune`` first. Organisations that are still
    without a geometry afterwards are looked up a second time against the
    ``name`` column of ``cities``; only their ``Geometry`` cell is filled in
    by that fallback.

    Args:
        data_in: Frame with at least an ``ORG`` column.
        cities: Frame with the columns ``Kommune``, ``name`` and ``Geometry``.

    Returns:
        A new frame (left join, so every row of ``data_in`` is kept). Rows
        without any match keep a missing ``Geometry``.
    """
    data = data_in.merge(cities, left_on="ORG", right_on="Kommune", how="left")

    missing = data["Geometry"].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return data

    logging.warning(f"Missing {n_missing} geometries in the data.")

    # Build the name -> geometry lookup once instead of scanning ``cities``
    # for every unmatched row. keep="first" mirrors taking the first hit.
    named = cities.dropna(subset=["name"]).drop_duplicates(
        subset="name", keep="first"
    )
    geometry_by_name = dict(zip(named["name"], named["Geometry"]))

    for idx, org in data.loc[missing, "ORG"].items():
        if org in geometry_by_name:
            data.at[idx, "Geometry"] = geometry_by_name[org]
            logging.info("data found in citiesname.")
    return data


def _is_missing(value) -> bool:
    """Return True for the placeholders pandas uses for an absent cell."""
    return value is None or (isinstance(value, float) and pd.isna(value))


def _split_geometry(value):
    """Turn a raw ``Geometry`` cell into its list of coordinate tokens.

    Strings such as ``"[52.5 13.4]"`` or ``"[52.5, 13.4]"`` become
    ``["52.5", "13.4"]``. Anything that is not a string (already parsed
    sequences, missing values) is logged and handed back unchanged.
    """
    if isinstance(value, str):
        # Accept both whitespace- and comma-separated coordinates.
        return value.strip("[]").replace(",", " ").split()
    logging.info(f"{value}, {value}")
    return value


def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Split the ``Geometry`` column into numeric ``lat`` and ``lon`` columns.

    String geometries are replaced in place by their list of tokens; the
    first token becomes ``lat`` and the second ``lon``. Rows whose geometry is
    missing (``None``/NaN, as produced by the left join in
    :func:`merge_geoemtry`) or has fewer than two tokens are kept and get NaN
    coordinates instead of aborting the whole run.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same frame object, with ``Geometry`` parsed and ``lat``/``lon``
        added as float columns.

    Raises:
        ValueError: If a coordinate token is not a valid number.
    """
    parsed = [_split_geometry(value) for value in data["Geometry"]]

    lats = []
    lons = []
    unusable = 0
    for coords in parsed:
        if _is_missing(coords) or len(coords) < 2:
            lats.append(None)
            lons.append(None)
            unusable += 1
            continue
        lats.append(float(coords[0]))
        lons.append(float(coords[1]))

    if unusable:
        logging.warning(f"{unusable} rows have no usable Geometry; lat/lon left empty.")

    # Explicit dtypes keep the list cells as objects and turn None into NaN.
    data["Geometry"] = pd.Series(parsed, index=data.index, dtype=object)
    data["lat"] = pd.Series(lats, index=data.index, dtype="float64")
    data["lon"] = pd.Series(lons, index=data.index, dtype="float64")
    return data


def main() -> None:
    """Run the full preprocessing pipeline and write ``OUTPUT``."""
    extraction = load_data()
    logging.info("Extraction data loaded.")

    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    # Rows without a geometry are deliberately kept; they end up with empty
    # lat/lon cells in the CSV so the missing cities stay visible.
    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # Make sure the target folder exists before writing.
    os.makedirs(os.path.dirname(OUTPUT), exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")


if __name__ == "__main__":
    main()