LocationFinder / src /subset_data.py
mattupson's picture
chg: Extract locations from Wellcome examples
b8d16b2 unverified
raw
history blame
1.01 kB
import numpy as np
import pandas as pd
import yaml
def load_config(config_file: str) -> dict:
    """Load a YAML configuration file.

    Args:
        config_file: Path to the YAML file to read.

    Returns:
        The parsed document exactly as produced by ``yaml.safe_load``.
        NOTE(review): ``safe_load`` returns ``None`` for an empty file and
        may return a non-mapping for other documents; callers that index the
        result are assuming the top level is a mapping.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode as UTF-8 explicitly: without it ``open`` falls back to the
    # locale's preferred encoding, which mis-reads non-ASCII YAML on
    # platforms whose default is not UTF-8.
    with open(config_file, encoding="utf-8") as f:
        return yaml.safe_load(f)
# Path of the source CSV; the script entry point passes it to subset_docs
# as ``input_file``.
INPUT_FILE = "data/raw/Wellcome-grants-awarded-1-October-2005-to-04-05-2022.csv"
# Path of the CSV to write; the script entry point passes it to subset_docs
# as ``output_file``.
OUTPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
def subset_docs(
    input_file: str,
    output_file: str,
    sample: int,
    random_state: "int | None" = None,
):
    """Write a random sample of unique descriptions from a CSV to a new CSV.

    Reads ``input_file``, keeps only the ``Description`` column, treats the
    placeholder text ``"Not available"`` as missing, drops missing and
    duplicate descriptions, then saves ``sample`` randomly chosen rows to
    ``output_file`` (without the index column).

    Args:
        input_file: Path of the CSV to read; it must contain a
            ``Description`` column.
        output_file: Path of the CSV to write.
        sample: Number of rows to draw, without replacement.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass a
            fixed integer to get the same subset on every run; the default
            ``None`` keeps the draw unseeded.

    Raises:
        ValueError: If pandas cannot draw ``sample`` rows, e.g. because fewer
            usable descriptions are available. Nothing is written in that
            case.
    """
    print(f"Reading data from {input_file}")
    data = pd.read_csv(input_file)

    descriptions = (
        data[["Description"]]
        .replace("Not available", np.nan)
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

    try:
        data = descriptions.sample(n=sample, random_state=random_state)
    except ValueError as err:
        # Re-raise with the actual counts; the pandas message alone does not
        # say how many rows were available after cleaning.
        raise ValueError(
            f"Cannot sample {sample} rows from {len(descriptions)} "
            f"usable descriptions: {err}"
        ) from err

    print(f"Number of rows: {data.shape[0]}")
    print(f"Number of unique rows: {data['Description'].nunique()}")
    print(f"Saving file to {output_file}")
    data.to_csv(output_file, index=False)
if __name__ == "__main__":
    # The sample size is read from the "n_docs" entry of params.yaml.
    n_docs = load_config("params.yaml")["n_docs"]
    subset_docs(input_file=INPUT_FILE, output_file=OUTPUT_FILE, sample=n_docs)