File size: 2,099 Bytes
d689310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import logging
import pandas as pd


# Root logger setup: records at INFO and above are written both to
# process_data.log and to a stream handler (stderr by default).
logging.basicConfig(
    handlers=[logging.FileHandler("process_data.log"), logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Input and output locations, relative to the working directory. Point these
# at other paths to generate map_data.csv separately from the app.
DATA_RAW = "data.json"
CITIES_ENRICHED = os.path.join("data", "cities_enriched_final.csv")
OUTPUT = os.path.join("data", "preprocessed", "map_data.csv")


def load_data(path: str = DATA_RAW) -> pd.DataFrame:
    """Count how often each ``ORG`` value occurs in a JSON file.

    Args:
        path: Location of the JSON file. Reading it with
            ``pandas.read_json`` must yield an ``ORG`` column.

    Returns:
        A frame with the columns ``ORG`` and ``Count``, one row per distinct
        ``ORG`` value, in the order produced by ``Series.value_counts``.
    """
    records = pd.read_json(path)
    org_counts = records["ORG"].value_counts()
    # Name the index and the values explicitly so the resulting column labels
    # do not depend on what value_counts() calls them.
    return org_counts.rename_axis("ORG").reset_index(name="Count")


def merge_geoemtry(data: pd.DataFrame, cities: pd.DataFrame) -> pd.DataFrame:
    """Attach the columns of ``cities`` to ``data`` by matching names.

    Rows are matched where ``data["ORG"]`` equals ``cities["Kommune"]``.
    The join is a left join, so every row of ``data`` is kept; rows without
    a matching city get missing values in the columns coming from ``cities``.

    Args:
        data: Frame with an ``ORG`` column.
        cities: Frame with a ``Kommune`` column.

    Returns:
        A new merged frame; neither input is modified.
    """
    return data.merge(
        right=cities,
        how="left",
        left_on="ORG",
        right_on="Kommune",
    )


def _parse_geometry(value):
    """Turn a stringified coordinate list such as ``"[55.6 12.5]"`` into floats.

    Non-string values (already parsed sequences, or ``None``/NaN for rows
    that have no geometry) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    # Accept both whitespace- and comma-separated numbers.
    tokens = value.strip().strip("[]").replace(",", " ").split()
    return [float(token) for token in tokens]


def _coordinate(geometry, position: int):
    """Return ``geometry[position]`` as a float, or ``None`` if unavailable."""
    try:
        return float(geometry[position])
    except (TypeError, IndexError):
        # TypeError: a missing geometry (None/NaN) cannot be indexed.
        # IndexError: the coordinate list is empty or too short.
        return None


def add_coor(data: pd.DataFrame) -> pd.DataFrame:
    """Add ``lat`` and ``lon`` columns derived from the ``Geometry`` column.

    Every row is handled on its own, so the column may mix stringified
    coordinate lists (e.g. ``"[55.6 12.5]"`` or ``"[55.6, 12.5]"``), already
    parsed lists and missing values. String entries are replaced by lists of
    floats. ``lat`` is taken from the first element and ``lon`` from the
    second; rows whose geometry is missing, empty or too short get a missing
    value instead of raising.

    Args:
        data: Frame with a ``Geometry`` column. It is modified in place.

    Returns:
        The same ``data`` object, with ``Geometry`` parsed and the ``lat``
        and ``lon`` columns added.

    Raises:
        KeyError: If ``data`` has no ``Geometry`` column.
        ValueError: If a string entry contains a token that is not a number.
    """
    data["Geometry"] = data["Geometry"].apply(_parse_geometry)
    data["lat"] = data["Geometry"].apply(lambda geom: _coordinate(geom, 0))
    data["lon"] = data["Geometry"].apply(lambda geom: _coordinate(geom, 1))
    return data


if __name__ == "__main__":
    # Pipeline: count ORG occurrences -> attach city rows -> derive lat/lon
    # -> write the result as CSV.
    org_counts = load_data()
    logging.info("Extraction data loaded.")

    city_table = pd.read_csv(CITIES_ENRICHED)
    with_geometry = merge_geoemtry(org_counts, city_table)
    logging.info("Data merged with Geometry from cities.csv.")

    # NOTE: rows whose Geometry is missing are not filtered out before
    # add_coor is called.
    map_data = add_coor(with_geometry)
    logging.info("Extra columns for lat/lon created from Geometry column.")

    map_data.to_csv(OUTPUT, index=False)
    logging.info("Data enriched and saved.")