File size: 1,425 Bytes
fe5ff1b
 
5d7eb35
7656238
8904c8e
 
 
fe5ff1b
 
8904c8e
 
 
 
 
fe5ff1b
8904c8e
 
 
 
 
 
 
 
 
 
 
 
fe5ff1b
 
8904c8e
 
 
 
 
 
 
 
 
 
 
 
c5a0831
8904c8e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
import subprocess
import sys

import pandas as pd
from datasets import load_dataset

# --- Configuration ---
#
# This script downloads a Hugging Face dataset, writes it to a local CSV file,
# and then launches the `embedding-atlas` command-line tool on that file.

# 1. Hardcode the name of the Hugging Face dataset
dataset_name = "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset"

# 2. Define the name for the local file where the data will be saved
local_file_path = "trendyol_cybersecurity_dataset.csv"

# 3. Define the port for the Embedding Atlas server
port = 7860

# --- Script Logic ---

# Step 1: Load the dataset from Hugging Face
print(f"Loading dataset '{dataset_name}' from the Hub...")
try:
    dataset = load_dataset(dataset_name, split="train")
except Exception as e:
    # Top-level boundary: report the failure and stop with a non-zero status
    # so that callers (shell scripts, container runtimes) can detect it.
    print(f"Failed to load dataset. Error: {e}", file=sys.stderr)
    sys.exit(1)

# Step 2: Convert the dataset to a Pandas DataFrame
print("Converting dataset to Pandas DataFrame...")
df = dataset.to_pandas()

# Step 3: Save the DataFrame to a local CSV file
# This is the crucial step. The CLI tool will read from this file.
# We use index=False to avoid saving the pandas index as an extra column.
print(f"Saving DataFrame to a local file: '{local_file_path}'")
df.to_csv(local_file_path, index=False)
print("Save complete.")

# Step 4: Construct and run the CLI command using the LOCAL file path
# The command is kept as an argument list so it can be executed without a
# shell; `command` is only the human-readable form used for logging.
command_args = ["embedding-atlas", local_file_path, "--port", str(port)]
command = " ".join(command_args)
print("\nLaunching Embedding Atlas...")
print(f"Running command: {command}")
print(f"Access the UI in your browser at: http://127.0.0.1:{port}")

try:
    completed = subprocess.run(command_args, check=False)
except FileNotFoundError:
    # Raised when the executable cannot be located.
    print(
        "Could not find the 'embedding-atlas' executable. "
        "Is it installed and on your PATH?",
        file=sys.stderr,
    )
    sys.exit(127)
except KeyboardInterrupt:
    # Ctrl+C is the normal way to stop the server; exit without a traceback.
    sys.exit(130)

# Propagate the tool's exit status instead of silently discarding it.
sys.exit(completed.returncode)