Spaces:
Sleeping
Sleeping
File size: 2,099 Bytes
d689310 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import os
import logging
import pandas as pd
# Root logger setup: records of level INFO and above are written both to the
# file "process_data.log" and to a console stream handler.
logging.basicConfig(
    handlers=[
        logging.FileHandler("process_data.log"),  # persistent copy of the run log
        logging.StreamHandler(),  # live output while the script runs
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Default file locations (relative paths). Point these somewhere else if you
# want to generate map_data.csv separately from the app.
DATA_RAW = "data.json"  # raw extraction JSON, default input of load_data()
CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")  # city table read in the script section
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")  # CSV written by the script section
def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Read the JSON file at *path* and count the occurrences of each ORG value.

    Args:
        path: Location of the JSON file; it must yield an "ORG" column when
            read with ``pd.read_json``.

    Returns:
        A DataFrame with two columns: "ORG" (the distinct values) and
        "Count" (how often each value occurs).
    """
    records = pd.read_json(path)
    org_frequencies = records["ORG"].value_counts()
    table = org_frequencies.reset_index()
    # Overwrite whatever labels reset_index produced with fixed names.
    table.columns = ["ORG", "Count"]
    return table
def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach the columns of *cities* to *data* by matching ORG to Kommune.

    The join is a left join, so every row of *data* is kept; rows without a
    matching "Kommune" get missing values in the columns taken from *cities*.

    Args:
        data: Frame containing an "ORG" column.
        cities: Frame containing a "Kommune" column.

    Returns:
        The merged DataFrame.
    """
    merged = pd.merge(
        data,
        cities,
        how="left",
        left_on="ORG",
        right_on="Kommune",
    )
    return merged
def _parse_geometry(value):
    """Turn one Geometry cell into a coordinate sequence.

    Strings such as "[52.5 13.4]" or "[52.5, 13.4]" become a list of floats,
    missing scalars (None/NaN) become an empty list, and any other value
    (e.g. an already parsed list) is returned unchanged.
    """
    if isinstance(value, str):
        # Treat commas like whitespace so both separator styles are accepted.
        tokens = value.strip().strip("[]").replace(",", " ").split()
        return [float(token) for token in tokens]
    # is_scalar guards pd.isna, which would return an array for list-likes.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    return value


def _coordinate(geometry, position):
    """Return geometry[position] as a float, or None if there is no such item."""
    if len(geometry) > position:
        return float(geometry[position])
    return None


def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Parse the "Geometry" column and derive "lat" and "lon" columns from it.

    Each cell is handled on its own, so a frame may mix string geometries,
    already parsed sequences and missing values (as produced by a left merge
    without a match). An empty frame is accepted as well.

    Args:
        data: Frame with a "Geometry" column. It is modified in place.

    Returns:
        The same DataFrame object, with "Geometry" holding coordinate
        sequences (empty list where no geometry is available), "lat" taken
        from the first item and "lon" from the second. Rows lacking the
        respective item get a missing value.

    Raises:
        KeyError: If *data* has no "Geometry" column.
        ValueError: If a string cell contains a token that is not a number.
    """
    data["Geometry"] = data["Geometry"].apply(_parse_geometry)
    data["lat"] = data["Geometry"].apply(lambda geometry: _coordinate(geometry, 0))
    data["lon"] = data["Geometry"].apply(lambda geometry: _coordinate(geometry, 1))
    return data
if __name__ == "__main__":
    # Script entry point: count ORG occurrences, attach the city table,
    # derive lat/lon columns and write the result to OUTPUT.
    extraction = load_data()
    logging.info("Extraction data loaded.")

    # merge_geoemtry performs a left join, so ORG values without a matching
    # city row stay in the frame.
    extraction = merge_geoemtry(extraction, pd.read_csv(CITIES_ENRICHED))
    logging.info("Data merged with Geometry from cities.csv.")

    extraction_enriched = add_coor(extraction)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    # to_csv does not create missing folders; make sure the target directory
    # exists. The dirname is empty when OUTPUT is a bare file name.
    output_dir = os.path.dirname(OUTPUT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    extraction_enriched.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")
|