Spaces:
Runtime error
Runtime error
Alexander Watson
committed on
Commit
·
faef657
1
Parent(s):
63a19c9
Add HF data support to SDK code generator
Browse files
app.py
CHANGED
|
@@ -115,7 +115,12 @@ def main():
|
|
| 115 |
)
|
| 116 |
|
| 117 |
df = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
if data_source == "Upload a file":
|
|
|
|
| 119 |
uploaded_file = st.file_uploader(
|
| 120 |
"Upload a CSV, JSON, or JSONL file",
|
| 121 |
type=["csv", "json", "jsonl"],
|
|
@@ -132,16 +137,19 @@ def main():
|
|
| 132 |
st.success(f"File uploaded successfully: {uploaded_file.name}")
|
| 133 |
|
| 134 |
elif data_source == "Select a dataset from Hugging Face":
|
|
|
|
| 135 |
huggingface_dataset = st.text_input(
|
| 136 |
"Hugging Face Dataset Repository",
|
| 137 |
help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
|
| 138 |
)
|
|
|
|
| 139 |
|
| 140 |
huggingface_split = st.selectbox(
|
| 141 |
"Dataset Split",
|
| 142 |
options=["train", "validation", "test"],
|
| 143 |
help="Select the dataset split to use",
|
| 144 |
)
|
|
|
|
| 145 |
|
| 146 |
if st.button("Load Hugging Face Dataset"):
|
| 147 |
if huggingface_dataset:
|
|
@@ -160,6 +168,7 @@ def main():
|
|
| 160 |
st.warning("Please provide a Hugging Face dataset repository name.")
|
| 161 |
|
| 162 |
elif data_source == "Use a sample dataset":
|
|
|
|
| 163 |
st.write("Try a sample dataset to get started quickly.")
|
| 164 |
if st.button("Try Sample Dataset"):
|
| 165 |
try:
|
|
@@ -422,14 +431,27 @@ def main():
|
|
| 422 |
import logging
|
| 423 |
import pandas as pd
|
| 424 |
from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
|
|
|
|
| 425 |
|
| 426 |
# Configure the logger
|
| 427 |
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
| 428 |
|
| 429 |
-
DATASET = "YOUR_DATASET"
|
| 430 |
API_KEY = "YOUR_API_KEY"
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
# Create the instruction response configuration
|
| 435 |
config = InstructionResponseConfig(
|
|
@@ -583,7 +605,7 @@ new_df = synthesizer.generate()
|
|
| 583 |
|
| 584 |
time.sleep(0.1)
|
| 585 |
logger.removeHandler(handler)
|
| 586 |
-
st.success("Data
|
| 587 |
st.stop()
|
| 588 |
|
| 589 |
if stop_button:
|
|
@@ -638,7 +660,7 @@ new_df = synthesizer.generate()
|
|
| 638 |
zip_file.write(log_file_path, "complete_logs.jsonl")
|
| 639 |
if synthesized_data_jsonl:
|
| 640 |
zip_file.write(
|
| 641 |
-
synthesized_data_file_path, "
|
| 642 |
)
|
| 643 |
zip_file.write(sdk_file_path, "data_synthesis_code.py")
|
| 644 |
|
|
|
|
| 115 |
)
|
| 116 |
|
| 117 |
df = None
|
| 118 |
+
dataset_source_type = ""
|
| 119 |
+
huggingface_dataset = ""
|
| 120 |
+
huggingface_split = ""
|
| 121 |
+
|
| 122 |
if data_source == "Upload a file":
|
| 123 |
+
dataset_source_type = "uploaded"
|
| 124 |
uploaded_file = st.file_uploader(
|
| 125 |
"Upload a CSV, JSON, or JSONL file",
|
| 126 |
type=["csv", "json", "jsonl"],
|
|
|
|
| 137 |
st.success(f"File uploaded successfully: {uploaded_file.name}")
|
| 138 |
|
| 139 |
elif data_source == "Select a dataset from Hugging Face":
|
| 140 |
+
dataset_source_type = "huggingface"
|
| 141 |
huggingface_dataset = st.text_input(
|
| 142 |
"Hugging Face Dataset Repository",
|
| 143 |
help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
|
| 144 |
)
|
| 145 |
+
st.session_state.huggingface_dataset = huggingface_dataset
|
| 146 |
|
| 147 |
huggingface_split = st.selectbox(
|
| 148 |
"Dataset Split",
|
| 149 |
options=["train", "validation", "test"],
|
| 150 |
help="Select the dataset split to use",
|
| 151 |
)
|
| 152 |
+
st.session_state.huggingface_split = huggingface_split
|
| 153 |
|
| 154 |
if st.button("Load Hugging Face Dataset"):
|
| 155 |
if huggingface_dataset:
|
|
|
|
| 168 |
st.warning("Please provide a Hugging Face dataset repository name.")
|
| 169 |
|
| 170 |
elif data_source == "Use a sample dataset":
|
| 171 |
+
dataset_source_type = "sample"
|
| 172 |
st.write("Try a sample dataset to get started quickly.")
|
| 173 |
if st.button("Try Sample Dataset"):
|
| 174 |
try:
|
|
|
|
| 431 |
import logging
|
| 432 |
import pandas as pd
|
| 433 |
from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
|
| 434 |
+
from datasets import load_dataset
|
| 435 |
|
| 436 |
# Configure the logger
|
| 437 |
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
| 438 |
|
|
|
|
| 439 |
API_KEY = "YOUR_API_KEY"
|
| 440 |
+
DATASET_SOURCE = "{dataset_source_type}"
|
| 441 |
+
HUGGINGFACE_DATASET = "{huggingface_dataset}"
|
| 442 |
+
HUGGINGFACE_SPLIT = "{huggingface_split}"
|
| 443 |
+
SAMPLE_DATASET_URL = "{SAMPLE_DATASET_URL}"
|
| 444 |
+
|
| 445 |
+
# Load dataset
|
| 446 |
+
if DATASET_SOURCE == 'uploaded':
|
| 447 |
+
df = pd.read_csv("YOUR_UPLOADED_FILE_PATH") # Replace with the actual file path
|
| 448 |
+
elif DATASET_SOURCE == 'huggingface':
|
| 449 |
+
dataset = load_dataset(HUGGINGFACE_DATASET, split=HUGGINGFACE_SPLIT)
|
| 450 |
+
df = dataset.to_pandas()
|
| 451 |
+
elif DATASET_SOURCE == 'sample':
|
| 452 |
+
df = pd.read_csv(SAMPLE_DATASET_URL)
|
| 453 |
+
else:
|
| 454 |
+
raise ValueError("Invalid DATASET_SOURCE specified")
|
| 455 |
|
| 456 |
# Create the instruction response configuration
|
| 457 |
config = InstructionResponseConfig(
|
|
|
|
| 605 |
|
| 606 |
time.sleep(0.1)
|
| 607 |
logger.removeHandler(handler)
|
| 608 |
+
st.success("Data synthesis completed!")
|
| 609 |
st.stop()
|
| 610 |
|
| 611 |
if stop_button:
|
|
|
|
| 660 |
zip_file.write(log_file_path, "complete_logs.jsonl")
|
| 661 |
if synthesized_data_jsonl:
|
| 662 |
zip_file.write(
|
| 663 |
+
synthesized_data_file_path, "synthetic_data.jsonl"
|
| 664 |
)
|
| 665 |
zip_file.write(sdk_file_path, "data_synthesis_code.py")
|
| 666 |
|