abrah926 committed on
Commit
707dafd
·
verified ·
1 Parent(s): 8b6991a

saving datasets in json

Browse files
Files changed (1) hide show
  1. embeddings.py +33 -0
embeddings.py CHANGED
@@ -3,11 +3,44 @@ from transformers import AutoTokenizer, AutoModel
3
  import faiss
4
  import torch
5
  import numpy as np
 
 
6
 
7
  def log(message):
8
  print(f"✅ {message}")
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  # ✅ Load datasets
12
  datasets = {
13
  "sales": load_dataset("goendalf666/sales-conversations"),
 
3
  import faiss
4
  import torch
5
  import numpy as np
6
+ import os
7
+ import json
8
 
9
  def log(message):
10
  print(f"✅ {message}")
11
 
12
 
13
+
14
+ # ✅ Ensure data folder exists
15
+ DATA_DIR = "data"
16
+ os.makedirs(DATA_DIR, exist_ok=True)
17
+
18
+ # ✅ List of datasets
19
+ datasets_list = {
20
+ "sales": "goendalf666/sales-conversations",
21
+ "blended": "blended_skill_talk",
22
+ "dialog": "daily_dialog",
23
+ "multiwoz": "multi_woz_v22",
24
+ }
25
+
26
def save_dataset_to_file(dataset_name, dataset):
    """Save the train split of a dataset to a local JSON file.

    The output path is ``DATA_DIR/<dataset_name>.json``.

    Args:
        dataset_name: Short name used as the stem of the output file.
        dataset: Either a dict-like mapping of split name to split (it must
            contain a ``"train"`` entry) or a single, already-selected
            split. The split that gets written must provide ``to_dict()``.
    """
    file_path = os.path.join(DATA_DIR, f"{dataset_name}.json")

    # The caller in this file loads with split="train" and therefore passes
    # a single split; indexing that with "train" would be treated as a column
    # lookup and fail. Only index by split name when given a split mapping.
    # NOTE(review): relies on the split-mapping type being a dict subclass,
    # as datasets.DatasetDict is - confirm for the installed version.
    split = dataset["train"] if isinstance(dataset, dict) else dataset

    with open(file_path, "w") as f:
        json.dump(split.to_dict(), f)

    print(f"✅ Saved {dataset_name} to {file_path}")
34
+
35
+ # ✅ Load & Save all datasets
36
+ for name, dataset_id in datasets_list.items():
37
+ dataset = load_dataset(dataset_id, split="train")
38
+ save_dataset_to_file(name, dataset)
39
+
40
+ print("✅ All datasets saved locally!")
41
+
42
+
43
+
44
  # ✅ Load datasets
45
  datasets = {
46
  "sales": load_dataset("goendalf666/sales-conversations"),