Spaces:

gretelai
/

enhance-ai-training-data

Runtime error

App Files Community

Alexander Watson committed on Jul 16, 2024

Commit

faef657

1 Parent(s): 63a19c9

Add HF data support to SDK code generator

Browse files

Files changed (1) hide show

app.py +27 -5

app.py CHANGED Viewed

@@ -115,7 +115,12 @@ def main():
         )
         df = None
         if data_source == "Upload a file":
             uploaded_file = st.file_uploader(
                 "Upload a CSV, JSON, or JSONL file",
                 type=["csv", "json", "jsonl"],
@@ -132,16 +137,19 @@ def main():
                 st.success(f"File uploaded successfully: {uploaded_file.name}")
         elif data_source == "Select a dataset from Hugging Face":
             huggingface_dataset = st.text_input(
                 "Hugging Face Dataset Repository",
                 help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
             )
             huggingface_split = st.selectbox(
                 "Dataset Split",
                 options=["train", "validation", "test"],
                 help="Select the dataset split to use",
             )
             if st.button("Load Hugging Face Dataset"):
                 if huggingface_dataset:
@@ -160,6 +168,7 @@ def main():
                     st.warning("Please provide a Hugging Face dataset repository name.")
         elif data_source == "Use a sample dataset":
             st.write("Try a sample dataset to get started quickly.")
             if st.button("Try Sample Dataset"):
                 try:
@@ -422,14 +431,27 @@ def main():
 import logging
 import pandas as pd
 from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
 # Configure the logger
 logging.basicConfig(level=logging.INFO, format="%(message)s")
-DATASET = "YOUR_DATASET"
 API_KEY = "YOUR_API_KEY"
-df = pd.read_csv(DATASET)
 # Create the instruction response configuration
 config = InstructionResponseConfig(
@@ -583,7 +605,7 @@ new_df = synthesizer.generate()
                         time.sleep(0.1)
                 logger.removeHandler(handler)
-                st.success("Data synthetic completed!")
             st.stop()
         if stop_button:
@@ -638,7 +660,7 @@ new_df = synthesizer.generate()
                     zip_file.write(log_file_path, "complete_logs.jsonl")
                     if synthesized_data_jsonl:
                         zip_file.write(
-                            synthesized_data_file_path, "synthesized_data.jsonl"
                         )
                     zip_file.write(sdk_file_path, "data_synthesis_code.py")

         )
         df = None
+        dataset_source_type = ""
+        huggingface_dataset = ""
+        huggingface_split = ""
         if data_source == "Upload a file":
+            dataset_source_type = "uploaded"
             uploaded_file = st.file_uploader(
                 "Upload a CSV, JSON, or JSONL file",
                 type=["csv", "json", "jsonl"],
                 st.success(f"File uploaded successfully: {uploaded_file.name}")
         elif data_source == "Select a dataset from Hugging Face":
+            dataset_source_type = "huggingface"
             huggingface_dataset = st.text_input(
                 "Hugging Face Dataset Repository",
                 help="Enter the name of the Hugging Face dataset repository (e.g., 'squad')",
             )
+            st.session_state.huggingface_dataset = huggingface_dataset
             huggingface_split = st.selectbox(
                 "Dataset Split",
                 options=["train", "validation", "test"],
                 help="Select the dataset split to use",
             )
+            st.session_state.huggingface_split = huggingface_split
             if st.button("Load Hugging Face Dataset"):
                 if huggingface_dataset:
                     st.warning("Please provide a Hugging Face dataset repository name.")
         elif data_source == "Use a sample dataset":
+            dataset_source_type = "sample"
             st.write("Try a sample dataset to get started quickly.")
             if st.button("Try Sample Dataset"):
                 try:
 import logging
 import pandas as pd
 from navigator_helpers import InstructionResponseConfig, TrainingDataSynthesizer
+from datasets import load_dataset
 # Configure the logger
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 API_KEY = "YOUR_API_KEY"
+DATASET_SOURCE = "{dataset_source_type}"
+HUGGINGFACE_DATASET = "{huggingface_dataset}"
+HUGGINGFACE_SPLIT = "{huggingface_split}"
+SAMPLE_DATASET_URL = "{SAMPLE_DATASET_URL}"
+# Load dataset
+if DATASET_SOURCE == 'uploaded':
+    df = pd.read_csv("YOUR_UPLOADED_FILE_PATH")  # Replace with the actual file path
+elif DATASET_SOURCE == 'huggingface':
+    dataset = load_dataset(HUGGINGFACE_DATASET, split=HUGGINGFACE_SPLIT)
+    df = dataset.to_pandas()
+elif DATASET_SOURCE == 'sample':
+    df = pd.read_csv(SAMPLE_DATASET_URL)
+else:
+    raise ValueError("Invalid DATASET_SOURCE specified")
 # Create the instruction response configuration
 config = InstructionResponseConfig(
                         time.sleep(0.1)
                 logger.removeHandler(handler)
+                st.success("Data synthesis completed!")
             st.stop()
         if stop_button:
                     zip_file.write(log_file_path, "complete_logs.jsonl")
                     if synthesized_data_jsonl:
                         zip_file.write(
+                            synthesized_data_file_path, "synthetic_data.jsonl"
                         )
                     zip_file.write(sdk_file_path, "data_synthesis_code.py")