data.dir
Browse files
app.py
CHANGED
@@ -14,30 +14,39 @@ def log(message):
|
|
14 |
print(f"✅ {message}")
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
17 |
DATA_DIR = "data"
|
|
|
18 |
|
19 |
-
|
20 |
-
"""Load a dataset from a JSON file."""
|
21 |
-
file_path = os.path.join(DATA_DIR, f"{dataset_name}.json")
|
22 |
-
|
23 |
-
if os.path.exists(file_path):
|
24 |
-
with open(file_path, "r") as f:
|
25 |
-
data = json.load(f)
|
26 |
-
print(f"✅ Loaded {dataset_name} from {file_path}")
|
27 |
-
return data
|
28 |
-
else:
|
29 |
-
print(f"❌ ERROR: {dataset_name} file not found!")
|
30 |
-
return None
|
31 |
-
|
32 |
-
# ✅ Load all datasets from local storage
|
33 |
datasets = {
|
34 |
-
"sales":
|
35 |
-
"blended":
|
36 |
-
"dialog":
|
37 |
-
"multiwoz":
|
38 |
}
|
39 |
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
|
43 |
# ✅ Step 1: Run Embedding Script (Import and Run)
|
|
|
14 |
print(f"✅ {message}")
|
15 |
|
16 |
|
17 |
+
import os
|
18 |
+
import json
|
19 |
+
from datasets import load_dataset
|
20 |
+
|
21 |
DATA_DIR = "data"
|
22 |
+
os.makedirs(DATA_DIR, exist_ok=True) # Ensure directory exists
|
23 |
|
24 |
+
# ✅ List of datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
datasets = {
|
26 |
+
"sales": "goendalf666/sales-conversations",
|
27 |
+
"blended": "blended_skill_talk",
|
28 |
+
"dialog": "daily_dialog",
|
29 |
+
"multiwoz": "multi_woz_v22",
|
30 |
}
|
31 |
|
32 |
+
# ✅ Save datasets to JSON
|
33 |
+
for name, hf_name in datasets.items():
|
34 |
+
print(f"📥 Downloading {name} dataset...")
|
35 |
+
dataset = load_dataset(hf_name)
|
36 |
+
|
37 |
+
# Extract training data
|
38 |
+
train_data = dataset["train"]
|
39 |
+
|
40 |
+
# Convert dataset to list of dictionaries
|
41 |
+
data_list = [dict(row) for row in train_data]
|
42 |
+
|
43 |
+
# Save to JSON
|
44 |
+
file_path = os.path.join(DATA_DIR, f"{name}.json")
|
45 |
+
with open(file_path, "w") as f:
|
46 |
+
json.dump(data_list, f, indent=2)
|
47 |
+
|
48 |
+
print(f"✅ {name} dataset saved to {file_path}")
|
49 |
+
|
50 |
|
51 |
|
52 |
# ✅ Step 1: Run Embedding Script (Import and Run)
|