saving datasets in json
Browse files- embeddings.py +33 -0
embeddings.py
CHANGED
@@ -3,11 +3,44 @@ from transformers import AutoTokenizer, AutoModel
|
|
3 |
import faiss
|
4 |
import torch
|
5 |
import numpy as np
|
|
|
|
|
6 |
|
7 |
def log(message):
    """Print *message* to stdout, prefixed with a check-mark emoji."""
    print(f"✅ {message}")
|
9 |
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
# ✅ Load datasets
|
12 |
datasets = {
|
13 |
"sales": load_dataset("goendalf666/sales-conversations"),
|
|
|
3 |
import faiss
|
4 |
import torch
|
5 |
import numpy as np
|
6 |
+
import os
|
7 |
+
import json
|
8 |
|
9 |
def log(message):
    """Print *message* to stdout, prefixed with a check-mark emoji."""
    print(f"✅ {message}")
|
11 |
|
12 |
|
13 |
+
|
14 |
+
# ✅ Ensure data folder exists
# Directory (relative to the current working directory) that receives one
# JSON file per dataset; created at import time if it is missing.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

# ✅ List of datasets
# Maps the short local name (used as the output file stem) to the dataset
# identifier handed to load_dataset().
datasets_list = {
    "sales": "goendalf666/sales-conversations",
    "blended": "blended_skill_talk",
    "dialog": "daily_dialog",
    "multiwoz": "multi_woz_v22",
}
|
25 |
+
|
26 |
+
def save_dataset_to_file(dataset_name, dataset):
    """Save the train split of a dataset to a local JSON file.

    The file is written to ``DATA_DIR/<dataset_name>.json`` and contains the
    dict returned by the split's ``to_dict()``.

    Args:
        dataset_name: Short name used as the output file stem.
        dataset: Either a dict of split name to split (what
            ``load_dataset(...)`` returns when no ``split=`` is given) or a
            single split object (what ``load_dataset(..., split="train")``
            returns).
    """
    file_path = os.path.join(DATA_DIR, f"{dataset_name}.json")

    # A single split has no "train" key: indexing it with a string selects a
    # column instead, so only dicts of splits are unwrapped here.
    train_split = dataset["train"] if isinstance(dataset, dict) else dataset

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(train_split.to_dict(), f)

    print(f"✅ Saved {dataset_name} to {file_path}")
|
34 |
+
|
35 |
+
# ✅ Load & Save all datasets
# Fetch the train split of every configured dataset and write it to disk.
# NOTE(review): with split="train" each `dataset` is a single split, not a
# dict of splits, so save_dataset_to_file must accept that shape.
for name, dataset_id in datasets_list.items():
    dataset = load_dataset(dataset_id, split="train")
    save_dataset_to_file(name, dataset)

print("✅ All datasets saved locally!")
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
# ✅ Load datasets
|
45 |
datasets = {
|
46 |
"sales": load_dataset("goendalf666/sales-conversations"),
|