gourisankar85 committed on
Commit
1ac5ab9
·
verified ·
1 Parent(s): ba71758

Update data/load_dataset.py

Browse files
Files changed (1) hide show
  1. data/load_dataset.py +26 -26
data/load_dataset.py CHANGED
@@ -1,26 +1,26 @@
1
- import os
2
- import logging
3
- import pickle
4
- from datasets import load_dataset
5
- from config import ConfigConstants # For saving the dataset locally
6
-
7
- def load_data(data_set_name):
8
- local_path = ConfigConstants.DATA_SET_PATH + 'local_datasets'
9
- os.makedirs(os.path.dirname(local_path), exist_ok=True)
10
- dataset_file = os.path.join(local_path, f"{data_set_name}_test.pkl")
11
-
12
- if os.path.exists(dataset_file):
13
- logging.info("Loading dataset {data_set_name} from local storage")
14
- with open(dataset_file, "rb") as f:
15
- dataset = pickle.load(f)
16
- else:
17
- logging.info("Loading dataset from Hugging Face")
18
- dataset = load_dataset("rungalileo/ragbench", data_set_name, split="test")
19
- logging.info(f"Saving {data_set_name} dataset locally")
20
- with open(dataset_file, "wb") as f:
21
- pickle.dump(dataset, f)
22
-
23
- logging.info("Dataset loaded successfully")
24
- logging.info(f"Number of documents found: {dataset.num_rows}")
25
- return dataset
26
-
 
1
+ import os
2
+ import logging
3
+ import pickle
4
+ from datasets import load_dataset
5
+ from config import ConfigConstants # For saving the dataset locally
6
+
7
def load_data(data_set_name):
    """Load the test split of a "rungalileo/ragbench" subset, with a local cache.

    On the first call for a given subset the data is fetched through
    ``datasets.load_dataset`` and pickled to
    ``<DATA_SET_PATH>local_datasets/<data_set_name>_test.pkl``. Later calls
    read that pickle instead of contacting Hugging Face again.

    Args:
        data_set_name: Configuration (subset) name passed to
            ``load_dataset("rungalileo/ragbench", ...)``; also used to build
            the cache file name.

    Returns:
        The dataset object, either unpickled from the cache file or freshly
        returned by ``load_dataset``.
    """
    # The directory is built by plain string concatenation, so DATA_SET_PATH
    # is presumably expected to end with a path separator -- confirm in config.
    local_path = ConfigConstants.DATA_SET_PATH + 'local_datasets'
    os.makedirs(local_path, exist_ok=True)
    dataset_file = os.path.join(local_path, f"{data_set_name}_test.pkl")

    if os.path.exists(dataset_file):
        # f-prefix added: the message previously logged the literal text
        # "{data_set_name}" instead of the dataset name.
        logging.info(f"Loading dataset {data_set_name} from local storage")
        # NOTE: unpickling can execute arbitrary code; this is only safe
        # because the file is a cache written by this same function. Do not
        # point the cache directory at files from an untrusted source.
        with open(dataset_file, "rb") as f:
            dataset = pickle.load(f)
    else:
        logging.info("Loading dataset from Hugging Face")
        dataset = load_dataset("rungalileo/ragbench", data_set_name, split="test")
        logging.info(f"Saving {data_set_name} dataset locally")
        with open(dataset_file, "wb") as f:
            pickle.dump(dataset, f)

    logging.info("Dataset loaded successfully")
    logging.info(f"Number of documents found: {dataset.num_rows}")
    return dataset
26
+