# Spaces: Sleeping  (page-header residue captured with the file; not part of the script)
import os
import subprocess
import sys

import pandas as pd
from datasets import load_dataset
# Export a Hugging Face dataset to a local CSV file and launch the
# `embedding-atlas` CLI on that file.
#
# Exit status: 1 if the dataset cannot be loaded or the CLI cannot be started;
# otherwise the exit status of the `embedding-atlas` process itself.

# --- Configuration ---
# 1. Hardcode the name of the Hugging Face dataset
dataset_name = "Trendyol/Trendyol-Cybersecurity-Instruction-Tuning-Dataset"
# 2. Define the name for the local file where the data will be saved
local_file_path = "trendyol_cybersecurity_dataset.csv"
# 3. Define the port for the Embedding Atlas server
port = 7860

# --- Script Logic ---
# Step 1: Load the dataset from Hugging Face
print(f"Loading dataset '{dataset_name}' from the Hub...")
try:
    dataset = load_dataset(dataset_name, split="train")
except Exception as e:
    # Top-level boundary: report the failure and stop with a non-zero status
    # so that callers (shell scripts, process supervisors) can detect it.
    print(f"Failed to load dataset. Error: {e}")
    sys.exit(1)

# Step 2: Convert the dataset to a Pandas DataFrame
print("Converting dataset to Pandas DataFrame...")
df = dataset.to_pandas()

# Step 3: Save the DataFrame to a local CSV file
# This is the crucial step. The CLI tool will read from this file.
# We use index=False to avoid saving the pandas index as an extra column.
print(f"Saving DataFrame to a local file: '{local_file_path}'")
df.to_csv(local_file_path, index=False)
print("Save complete.")

# Step 4: Construct and run the CLI command using the LOCAL file path
# The command is built as an argument list and run without a shell, so the
# file path and port are passed to the CLI verbatim (no shell parsing/quoting).
command = ["embedding-atlas", local_file_path, "--port", str(port)]
print("\nLaunching Embedding Atlas...")
print(f"Running command: {' '.join(command)}")
print(f"Access the UI in your browser at: http://127.0.0.1:{port}")
# NOTE(review): the URL above assumes the server is reachable on loopback;
# confirm the bind address if this runs inside a container or hosted Space.
try:
    # check=False: the exit status is forwarded below instead of raising.
    completed = subprocess.run(command, check=False)
except OSError as e:
    # Raised when the executable cannot be started (e.g. it is not installed
    # or not on PATH).
    print(f"Failed to launch Embedding Atlas. Error: {e}")
    sys.exit(1)

# Propagate the server's exit status instead of silently discarding it.
sys.exit(completed.returncode)