abrah926 committed on
Commit
e925ddf
·
verified ·
1 Parent(s): 29d1f72
Files changed (1) hide show
  1. app.py +28 -19
app.py CHANGED
@@ -14,30 +14,39 @@ def log(message):
14
  print(f"✅ {message}")
15
 
16
 
 
 
 
 
17
  DATA_DIR = "data"
 
18
 
19
- def load_local_dataset(dataset_name):
20
- """Load a dataset from a JSON file."""
21
- file_path = os.path.join(DATA_DIR, f"{dataset_name}.json")
22
-
23
- if os.path.exists(file_path):
24
- with open(file_path, "r") as f:
25
- data = json.load(f)
26
- print(f"✅ Loaded {dataset_name} from {file_path}")
27
- return data
28
- else:
29
- print(f"❌ ERROR: {dataset_name} file not found!")
30
- return None
31
-
32
- # ✅ Load all datasets from local storage
33
  datasets = {
34
- "sales": load_local_dataset("sales"),
35
- "blended": load_local_dataset("blended"),
36
- "dialog": load_local_dataset("dialog"),
37
- "multiwoz": load_local_dataset("multiwoz"),
38
  }
39
 
40
- print("✅ Datasets loaded from local storage!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
43
  # ✅ Step 1: Run Embedding Script (Import and Run)
 
14
  print(f"✅ {message}")
15
 
16
 
17
+ import os
18
+ import json
19
+ from datasets import load_dataset
20
+
21
  DATA_DIR = "data"
22
+ os.makedirs(DATA_DIR, exist_ok=True) # Ensure directory exists
23
 
24
+ # ✅ List of datasets
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  datasets = {
26
+ "sales": "goendalf666/sales-conversations",
27
+ "blended": "blended_skill_talk",
28
+ "dialog": "daily_dialog",
29
+ "multiwoz": "multi_woz_v22",
30
  }
31
 
32
+ # ✅ Save datasets to JSON
33
+ for name, hf_name in datasets.items():
34
+ print(f"📥 Downloading {name} dataset...")
35
+ dataset = load_dataset(hf_name)
36
+
37
+ # Extract training data
38
+ train_data = dataset["train"]
39
+
40
+ # Convert dataset to list of dictionaries
41
+ data_list = [dict(row) for row in train_data]
42
+
43
+ # Save to JSON
44
+ file_path = os.path.join(DATA_DIR, f"{name}.json")
45
+ with open(file_path, "w") as f:
46
+ json.dump(data_list, f, indent=2)
47
+
48
+ print(f"✅ {name} dataset saved to {file_path}")
49
+
50
 
51
 
52
  # ✅ Step 1: Run Embedding Script (Import and Run)