davidberenstein1957 (HF staff) committed
Commit d5fe7d9 · unverified
Parents: 9bc9bf6 57b7e7b

Merge branch 'main' into main
README.md CHANGED
@@ -104,6 +104,10 @@ Optionally, you can also push your datasets to Argilla for further curation by s
 - `ARGILLA_API_KEY`: Your Argilla API key to push your datasets to Argilla.
 - `ARGILLA_API_URL`: Your Argilla API URL to push your datasets to Argilla.
 
+To save the generated datasets to a local directory instead of pushing them to the Hugging Face Hub, set the following environment variable:
+
+- `SAVE_LOCAL_DIR`: The local directory to save the generated datasets to.
+
 ### Argilla integration
 
 Argilla is an open source tool for data curation. It allows you to annotate and review datasets, and push curated datasets to the Hugging Face Hub. You can easily get started with Argilla by following the [quickstart guide](https://docs.argilla.io/latest/getting_started/quickstart/).
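
As a quick illustration of the new option, here is a minimal sketch of configuring local saving before starting the generator; the directory path is an example, and the `launch` entry point is assumed from the project README:

import os

# Save generated datasets to a local directory instead of pushing them to the Hub.
# "./generated-datasets" is an illustrative path; any writable directory works.
os.environ["SAVE_LOCAL_DIR"] = "./generated-datasets"

from synthetic_dataset_generator import launch  # assumed entry point

launch()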
examples/fine-tune-deepseek-reasoning-sft.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/ollama-different-model-for-completion.py CHANGED
@@ -18,8 +18,8 @@ os.environ["OLLAMA_BASE_URL"] = (
 os.environ["MODEL"] = "llama3.2" # model for instruction generation
 os.environ["MODEL_COMPLETION"] = "llama3.2:1b" # model for completion generation
 
-os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-1B-Instruct" # tokenizer for instruction generation
-os.environ["TOKENIZER_ID_COMPLETION"] = "meta-llama/Llama-3.2-3B-Instruct" # tokenizer for completion generation
+os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.2-3B-Instruct" # tokenizer for instruction generation
+os.environ["TOKENIZER_ID_COMPLETION"] = "meta-llama/Llama-3.2-1B-Instruct" # tokenizer for completion generation
 
 os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # magpie template required for instruction generation
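This change pairs each tokenizer with its matching model size: `llama3.2` (which resolves to the 3B variant on Ollama) now uses the `Llama-3.2-3B-Instruct` tokenizer for instruction generation, and `llama3.2:1b` uses the `Llama-3.2-1B-Instruct` tokenizer for completion generation.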
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [project]
 name = "synthetic-dataset-generator"
-version = "0.1.7"
+version = "0.1.8"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
@@ -22,6 +22,7 @@ dependencies = [
     "distilabel[argilla,hf-inference-endpoints,hf-transformers,instructor,llama-cpp,ollama,openai,outlines,vllm,vision]>=1.5.0,<2.00",
     "gradio[oauth]>=5.4.0,<6.0.0",
     "gradio-huggingfacehub-search>=0.0.12,<1.0.0",
+    "huggingface-hub>=0.26.0,<0.28.0",
     "model2vec>=0.2.4,<1.0.0",
     "nltk>=3.9.1,<4.0.0",
     "pydantic>=2.10.5,<3.0.0",
src/synthetic_dataset_generator/app.py CHANGED
@@ -12,15 +12,16 @@ css = """
 .main_ui_logged_out{opacity: 0.3; pointer-events: none}
 button[role="tab"][aria-selected="true"] { border: 0; background: var(--button-primary-background-fill); color: white; border-top-right-radius: var(--radius-md); border-top-left-radius: var(--radius-md)}
 button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-primary-background-fill); background: var(var(--button-primary-background-fill-hover))}
-.tabitem { border: 0; padding-inline: 0}
+.tabitem {border: 0; padding-inline: 0}
 .gallery-item {background: var(--background-fill-secondary); text-align: left}
-.table-wrap .tbody td { vertical-align: top }
-#system_prompt_examples { color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
+.table-wrap .tbody td {vertical-align: top}
+#system_prompt_examples {color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
-#sign_in_button { flex-grow: 0; width: auto !important; display: flex; align-items: center; justify-content: center; margin: 0 auto; }
 .gradio-container { width: 100% !important; }
 .gradio-row { display: flex !important; flex-direction: row !important; }
 .gradio-column { flex: 1 !important; min-width: 0 !important; }
+#sign_in_button {flex-grow: 0; width: auto !important; display: flex; align-items: center; justify-content: center; margin: 0 auto;}
+.datasets {height: 70px;}
 """
 
 image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
src/synthetic_dataset_generator/apps/base.py CHANGED
@@ -12,9 +12,13 @@ from huggingface_hub import HfApi, upload_file, repo_exists
 from unstructured.chunking.title import chunk_by_title
 from unstructured.partition.auto import partition
 
-from synthetic_dataset_generator.constants import MAX_NUM_ROWS
+from synthetic_dataset_generator.constants import MAX_NUM_ROWS, SAVE_LOCAL_DIR
 from synthetic_dataset_generator.utils import get_argilla_client
 
+if SAVE_LOCAL_DIR is not None:
+    import os
+    os.makedirs(SAVE_LOCAL_DIR, exist_ok=True)
+
 
 def validate_argilla_user_workspace_dataset(
     dataset_name: str,
src/synthetic_dataset_generator/apps/chat.py CHANGED
@@ -1,5 +1,6 @@
 import ast
 import json
+import os
 import random
 import uuid
 from typing import Dict, List, Union
@@ -29,6 +30,7 @@ from synthetic_dataset_generator.constants import (
     DEFAULT_BATCH_SIZE,
     MODEL,
     MODEL_COMPLETION,
+    SAVE_LOCAL_DIR,
     SFT_AVAILABLE,
 )
 from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
@@ -309,7 +311,7 @@ def generate_dataset_from_seed(
         progress(
             step_progress * n_processed / num_rows,
             total=total_steps,
-            desc="Generating questions",
+            desc="Generating instructions",
         )
         remaining_rows = num_rows - n_processed
         batch_size = min(batch_size, remaining_rows)
@@ -368,7 +370,9 @@ def generate_dataset_from_seed(
         follow_up_instructions = list(
             follow_up_generator_instruction.process(inputs=conversations_batch)
         )
-        for conv, follow_up in zip(conversations_batch, follow_up_instructions[0]):
+        for conv, follow_up in zip(
+            conversations_batch, follow_up_instructions[0]
+        ):
             conv["messages"].append(
                 {"role": "user", "content": follow_up["generation"]}
             )
@@ -506,7 +510,7 @@ def push_dataset(
         num_turns=num_turns,
         num_rows=num_rows,
         temperature=temperature,
-        temperature_completion=temperature_completion
+        temperature_completion=temperature_completion,
     )
     push_dataset_to_hub(
         dataframe=dataframe,
@@ -637,6 +641,45 @@ def push_dataset(
     return ""
 
 
+def save_local(
+    repo_id: str,
+    file_paths: list[str],
+    input_type: str,
+    system_prompt: str,
+    document_column: str,
+    num_turns: int,
+    num_rows: int,
+    temperature: float,
+    repo_name: str,
+    temperature_completion: Union[float, None] = None,
+) -> pd.DataFrame:
+    if input_type == "prompt-input":
+        dataframe = _get_dataframe()
+    else:
+        dataframe, _ = load_dataset_file(
+            repo_id=repo_id,
+            file_paths=file_paths,
+            input_type=input_type,
+            num_rows=num_rows,
+        )
+    dataframe = generate_dataset(
+        input_type=input_type,
+        dataframe=dataframe,
+        system_prompt=system_prompt,
+        document_column=document_column,
+        num_turns=num_turns,
+        num_rows=num_rows,
+        temperature=temperature,
+        temperature_completion=temperature_completion,
+    )
+    local_dataset = Dataset.from_pandas(dataframe)
+    output_csv = os.path.join(SAVE_LOCAL_DIR, repo_name + ".csv")
+    output_json = os.path.join(SAVE_LOCAL_DIR, repo_name + ".json")
+    local_dataset.to_csv(output_csv, index=False)
+    local_dataset.to_json(output_json, index=False)
+    return output_csv, output_json
+
+
 def show_system_prompt_visibility():
     return {system_prompt: gr.Textbox(visible=True)}
 
@@ -672,6 +715,31 @@ def show_temperature_completion():
     return {temperature_completion: gr.Slider(value=0.9, visible=True)}
 
 
+def show_save_local_button():
+    return {btn_save_local: gr.Button(visible=True)}
+
+
+def hide_save_local_button():
+    return {btn_save_local: gr.Button(visible=False)}
+
+
+def show_save_local():
+    gr.update(success_message, min_height=0)
+    return {
+        csv_file: gr.File(visible=True),
+        json_file: gr.File(visible=True),
+        success_message: success_message,
+    }
+
+
+def hide_save_local():
+    gr.update(success_message, min_height=100)
+    return {
+        csv_file: gr.File(visible=False),
+        json_file: gr.File(visible=False),
+        success_message: success_message,
+    }
+
+
 ######################
 # Gradio UI
 ######################
@@ -781,7 +849,7 @@ with gr.Blocks() as app:
                 )
                 document_column = gr.Dropdown(
                     label="Document Column",
-                    info="Select the document column to generate the RAG dataset",
+                    info="Select the document column to generate the chat data",
                     choices=["Load your data first in step 1."],
                     value="Load your data first in step 1.",
                     interactive=False,
@@ -852,10 +920,23 @@ with gr.Blocks() as app:
                 btn_push_to_hub = gr.Button(
                     "Push to Hub", variant="primary", scale=2
                 )
+                btn_save_local = gr.Button(
+                    "Save locally", variant="primary", scale=2, visible=False
+                )
             with gr.Column(scale=3):
+                csv_file = gr.File(
+                    label="CSV",
+                    elem_classes="datasets",
+                    visible=False,
+                )
+                json_file = gr.File(
+                    label="JSON",
+                    elem_classes="datasets",
+                    visible=False,
+                )
                 success_message = gr.Markdown(
-                    visible=True,
-                    min_height=100,  # don't remove this otherwise progress is not visible
+                    visible=False,
+                    min_height=0,  # don't remove this otherwise progress is not visible
                 )
         with gr.Accordion(
             "Customize your pipeline with distilabel",
@@ -953,6 +1034,9 @@ with gr.Blocks() as app:
         fn=validate_push_to_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
+    ).success(
+        fn=hide_save_local,
+        outputs=[csv_file, json_file, success_message],
     ).success(
         fn=hide_success_message,
         outputs=[success_message],
@@ -999,6 +1083,49 @@ with gr.Blocks() as app:
         outputs=[pipeline_code_ui],
     )
 
+    btn_save_local.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=hide_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    ).success(
+        fn=show_save_local,
+        inputs=[],
+        outputs=[csv_file, json_file, success_message],
+    ).success(
+        save_local,
+        inputs=[
+            search_in,
+            file_in,
+            input_type,
+            system_prompt,
+            document_column,
+            num_turns,
+            num_rows,
+            temperature,
+            repo_name,
+            temperature_completion,
+        ],
+        outputs=[csv_file, json_file],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            search_in,
+            input_type,
+            system_prompt,
+            document_column,
+            num_turns,
+            num_rows,
+        ],
+        outputs=[pipeline_code],
+    ).success(
+        fn=show_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    )
+
     clear_dataset_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
     clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
     clear_prompt_btn_part.click(fn=lambda: "", inputs=[], outputs=[dataset_description])
@@ -1011,3 +1138,5 @@ with gr.Blocks() as app:
     app.load(fn=get_org_dropdown, outputs=[org_name])
     app.load(fn=get_random_repo_name, outputs=[repo_name])
     app.load(fn=show_temperature_completion, outputs=[temperature_completion])
+    if SAVE_LOCAL_DIR is not None:
+        app.load(fn=show_save_local_button, outputs=btn_save_local)
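
The save_local helpers added in this commit share one pattern: build a pandas DataFrame, wrap it in a datasets.Dataset, and write CSV and JSON files into SAVE_LOCAL_DIR. A standalone sketch of that pattern, with an illustrative path and data:

import os

import pandas as pd
from datasets import Dataset

save_dir = "./generated-datasets"  # stand-in for SAVE_LOCAL_DIR
os.makedirs(save_dir, exist_ok=True)

# Illustrative frame; the apps build this with generate_dataset() instead.
dataframe = pd.DataFrame({"prompt": ["hi"], "completion": ["hello"]})

local_dataset = Dataset.from_pandas(dataframe)
local_dataset.to_csv(os.path.join(save_dir, "demo.csv"), index=False)
local_dataset.to_json(os.path.join(save_dir, "demo.json"))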
src/synthetic_dataset_generator/apps/rag.py CHANGED
@@ -24,7 +24,12 @@ from synthetic_dataset_generator.apps.base import (
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
-from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE, MODEL, MODEL_COMPLETION
+from synthetic_dataset_generator.constants import (
+    DEFAULT_BATCH_SIZE,
+    MODEL,
+    MODEL_COMPLETION,
+    SAVE_LOCAL_DIR,
+)
 from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
 from synthetic_dataset_generator.pipelines.embeddings import (
     get_embeddings,
@@ -156,7 +161,7 @@ def generate_dataset(
         is_sample=is_sample,
     )
     response_generator = get_response_generator(
-        temperature = temperature_completion or temperature , is_sample=is_sample
+        temperature=temperature_completion or temperature, is_sample=is_sample
     )
     if reranking:
         reranking_generator = get_sentence_pair_generator(
@@ -486,6 +491,49 @@ def push_dataset(
     return ""
 
 
+def save_local(
+    repo_id: str,
+    file_paths: list[str],
+    input_type: str,
+    system_prompt: str,
+    document_column: str,
+    retrieval_reranking: list[str],
+    num_rows: int,
+    temperature: float,
+    repo_name: str,
+    temperature_completion: float,
+) -> pd.DataFrame:
+    retrieval = "Retrieval" in retrieval_reranking
+    reranking = "Reranking" in retrieval_reranking
+
+    if input_type == "prompt-input":
+        dataframe = pd.DataFrame(columns=["context", "question", "response"])
+    else:
+        dataframe, _ = load_dataset_file(
+            repo_id=repo_id,
+            file_paths=file_paths,
+            input_type=input_type,
+            num_rows=num_rows,
+        )
+    dataframe = generate_dataset(
+        input_type=input_type,
+        dataframe=dataframe,
+        system_prompt=system_prompt,
+        document_column=document_column,
+        retrieval=retrieval,
+        reranking=reranking,
+        num_rows=num_rows,
+        temperature=temperature,
+        temperature_completion=temperature_completion,
+    )
+    local_dataset = Dataset.from_pandas(dataframe)
+    output_csv = os.path.join(SAVE_LOCAL_DIR, repo_name + ".csv")
+    output_json = os.path.join(SAVE_LOCAL_DIR, repo_name + ".json")
+    local_dataset.to_csv(output_csv, index=False)
+    local_dataset.to_json(output_json, index=False)
+    return output_csv, output_json
+
+
 def show_system_prompt_visibility():
     return {system_prompt: gr.Textbox(visible=True)}
 
@@ -521,6 +569,32 @@ def show_temperature_completion():
     return {temperature_completion: gr.Slider(value=0.9, visible=True)}
 
 
+def show_save_local_button():
+    return {btn_save_local: gr.Button(visible=True)}
+
+
+def hide_save_local_button():
+    return {btn_save_local: gr.Button(visible=False)}
+
+
+def show_save_local():
+    gr.update(success_message, min_height=0)
+    return {
+        csv_file: gr.File(visible=True),
+        json_file: gr.File(visible=True),
+        success_message: success_message,
+    }
+
+
+def hide_save_local():
+    gr.update(success_message, min_height=100)
+    return {
+        csv_file: gr.File(visible=False),
+        json_file: gr.File(visible=False),
+        success_message: success_message,
+    }
+
+
 ######################
 # Gradio UI
 ######################
@@ -675,10 +749,23 @@ with gr.Blocks() as app:
                     scale=1,
                 )
                 btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+                btn_save_local = gr.Button(
+                    "Save locally", variant="primary", scale=2, visible=False
+                )
             with gr.Column(scale=3):
+                csv_file = gr.File(
+                    label="CSV",
+                    elem_classes="datasets",
+                    visible=False,
+                )
+                json_file = gr.File(
+                    label="JSON",
+                    elem_classes="datasets",
+                    visible=False,
+                )
                 success_message = gr.Markdown(
-                    visible=True,
-                    min_height=100,  # don't remove this otherwise progress is not visible
+                    visible=False,
+                    min_height=0,  # don't remove this otherwise progress is not visible
                 )
         with gr.Accordion(
             "Customize your pipeline with distilabel",
@@ -776,6 +863,9 @@ with gr.Blocks() as app:
         fn=validate_push_to_hub,
         inputs=[org_name, repo_name],
         outputs=[success_message],
+    ).success(
+        fn=hide_save_local,
+        outputs=[csv_file, json_file, success_message],
     ).success(
         fn=hide_success_message,
         outputs=[success_message],
@@ -822,6 +912,49 @@ with gr.Blocks() as app:
         outputs=[pipeline_code_ui],
     )
 
+    btn_save_local.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=hide_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    ).success(
+        fn=show_save_local,
+        inputs=[],
+        outputs=[csv_file, json_file, success_message],
+    ).success(
+        save_local,
+        inputs=[
+            search_in,
+            file_in,
+            input_type,
+            system_prompt,
+            document_column,
+            retrieval_reranking,
+            num_rows,
+            temperature,
+            repo_name,
+            temperature_completion,
+        ],
+        outputs=[csv_file, json_file],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            search_in,
+            input_type,
+            system_prompt,
+            document_column,
+            retrieval_reranking,
+            num_rows,
+        ],
+        outputs=[pipeline_code],
+    ).success(
+        fn=show_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    )
+
     clear_dataset_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
     clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
     clear_prompt_btn_part.click(fn=lambda: "", inputs=[], outputs=[dataset_description])
@@ -835,3 +968,5 @@ with gr.Blocks() as app:
     app.load(fn=get_org_dropdown, outputs=[org_name])
     app.load(fn=get_random_repo_name, outputs=[repo_name])
     app.load(fn=show_temperature_completion, outputs=[temperature_completion])
+    if SAVE_LOCAL_DIR is not None:
+        app.load(fn=show_save_local_button, outputs=btn_save_local)
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 import random
 import uuid
 from typing import List, Union
@@ -19,7 +20,7 @@ from synthetic_dataset_generator.apps.base import (
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
-from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
+from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE, SAVE_LOCAL_DIR
 from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
 from synthetic_dataset_generator.pipelines.embeddings import (
     get_embeddings,
@@ -195,7 +196,7 @@ def generate_dataset(
             set(
                 label.lower().strip()
                 for label in x
-                if label.lower().strip() in labels
+                if isinstance(label, str) and label.lower().strip() in labels
             )
         )
     else:
@@ -406,6 +407,33 @@ def push_dataset(
     return ""
 
 
+def save_local(
+    system_prompt: str,
+    difficulty: str,
+    clarity: str,
+    labels: List[str],
+    multi_label: bool,
+    num_rows: int,
+    temperature: float,
+    repo_name: str,
+) -> pd.DataFrame:
+    dataframe = generate_dataset(
+        system_prompt=system_prompt,
+        difficulty=difficulty,
+        clarity=clarity,
+        multi_label=multi_label,
+        labels=labels,
+        num_rows=num_rows,
+        temperature=temperature,
+    )
+    local_dataset = Dataset.from_pandas(dataframe)
+    output_csv = os.path.join(SAVE_LOCAL_DIR, repo_name + ".csv")
+    output_json = os.path.join(SAVE_LOCAL_DIR, repo_name + ".json")
+    local_dataset.to_csv(output_csv, index=False)
+    local_dataset.to_json(output_json, index=False)
+    return output_csv, output_json
+
+
 def validate_input_labels(labels: List[str]) -> List[str]:
     if (
         not labels
@@ -425,6 +453,32 @@ def hide_pipeline_code_visibility():
     return {pipeline_code_ui: gr.Accordion(visible=False)}
 
 
+def show_save_local_button():
+    return {btn_save_local: gr.Button(visible=True)}
+
+
+def hide_save_local_button():
+    return {btn_save_local: gr.Button(visible=False)}
+
+
+def show_save_local():
+    gr.update(success_message, min_height=0)
+    return {
+        csv_file: gr.File(visible=True),
+        json_file: gr.File(visible=True),
+        success_message: success_message,
+    }
+
+
+def hide_save_local():
+    gr.update(success_message, min_height=100)
+    return {
+        csv_file: gr.File(visible=False),
+        json_file: gr.File(visible=False),
+        success_message: success_message,
+    }
+
+
 ######################
 # Gradio UI
 ######################
@@ -544,10 +598,23 @@ with gr.Blocks() as app:
                     scale=1,
                 )
                 btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+                btn_save_local = gr.Button(
+                    "Save locally", variant="primary", scale=2, visible=False
+                )
             with gr.Column(scale=3):
+                csv_file = gr.File(
+                    label="CSV",
+                    elem_classes="datasets",
+                    visible=False,
+                )
+                json_file = gr.File(
+                    label="JSON",
+                    elem_classes="datasets",
+                    visible=False,
+                )
                 success_message = gr.Markdown(
-                    visible=True,
-                    min_height=100,  # don't remove this otherwise progress is not visible
+                    visible=False,
+                    min_height=0,  # don't remove this otherwise progress is not visible
                 )
         with gr.Accordion(
             "Customize your pipeline with distilabel",
@@ -600,6 +667,9 @@ with gr.Blocks() as app:
         fn=validate_input_labels,
         inputs=[labels],
         outputs=[labels],
+    ).success(
+        fn=hide_save_local,
+        outputs=[csv_file, json_file, success_message],
     ).success(
         fn=hide_success_message,
         outputs=[success_message],
@@ -644,6 +714,47 @@ with gr.Blocks() as app:
         outputs=[pipeline_code_ui],
     )
 
+    btn_save_local.click(
+        fn=hide_success_message,
+        outputs=[success_message],
+    ).success(
+        fn=hide_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    ).success(
+        fn=show_save_local,
+        inputs=[],
+        outputs=[csv_file, json_file, success_message],
+    ).success(
+        save_local,
+        inputs=[
+            system_prompt,
+            difficulty,
+            clarity,
+            labels,
+            multi_label,
+            num_rows,
+            temperature,
+            repo_name,
+        ],
+        outputs=[csv_file, json_file],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            system_prompt,
+            difficulty,
+            clarity,
+            labels,
+            multi_label,
+            num_rows,
+        ],
+        outputs=[pipeline_code],
+    ).success(
+        fn=show_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
+    )
+
     gr.on(
         triggers=[clear_btn_part.click, clear_btn_full.click],
         fn=lambda _: (
@@ -660,3 +771,5 @@ with gr.Blocks() as app:
     app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])
     app.load(fn=get_random_repo_name, outputs=[repo_name])
+    if SAVE_LOCAL_DIR is not None:
+        app.load(fn=show_save_local_button, outputs=btn_save_local)
src/synthetic_dataset_generator/constants.py CHANGED
@@ -8,6 +8,9 @@ MAX_NUM_TOKENS = int(os.getenv("MAX_NUM_TOKENS", 2048))
 MAX_NUM_ROWS = int(os.getenv("MAX_NUM_ROWS", 1000))
 DEFAULT_BATCH_SIZE = int(os.getenv("DEFAULT_BATCH_SIZE", 5))
 
+# Directory to locally save the generated data
+SAVE_LOCAL_DIR = os.getenv(key="SAVE_LOCAL_DIR", default=None)
+
 # Models
 MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
 TOKENIZER_ID = os.getenv(key="TOKENIZER_ID", default=None)
src/synthetic_dataset_generator/pipelines/base.py CHANGED
@@ -87,10 +87,17 @@ def _get_llm(
 ):
     model = MODEL_COMPLETION if is_completion else MODEL
     tokenizer_id = TOKENIZER_ID_COMPLETION if is_completion else TOKENIZER_ID or model
-    if OPENAI_BASE_URL:
+    base_urls = {
+        "openai": OPENAI_BASE_URL_COMPLETION if is_completion else OPENAI_BASE_URL,
+        "ollama": OLLAMA_BASE_URL_COMPLETION if is_completion else OLLAMA_BASE_URL,
+        "huggingface": HUGGINGFACE_BASE_URL_COMPLETION if is_completion else HUGGINGFACE_BASE_URL,
+        "vllm": VLLM_BASE_URL_COMPLETION if is_completion else VLLM_BASE_URL,
+    }
+
+    if base_urls["openai"]:
         llm = OpenAILLM(
             model=model,
-            base_url=OPENAI_BASE_URL_COMPLETION if is_completion else OPENAI_BASE_URL,
+            base_url=base_urls["openai"],
             api_key=_get_next_api_key(),
             structured_output=structured_output,
             **kwargs,
@@ -103,7 +110,7 @@ def _get_llm(
             del kwargs["generation_kwargs"]["stop_sequences"]
         if "do_sample" in kwargs["generation_kwargs"]:
             del kwargs["generation_kwargs"]["do_sample"]
-    elif OLLAMA_BASE_URL:
+    elif base_urls["ollama"]:
         if "generation_kwargs" in kwargs:
             if "max_new_tokens" in kwargs["generation_kwargs"]:
                 kwargs["generation_kwargs"]["num_predict"] = kwargs[
@@ -123,32 +130,28 @@ def _get_llm(
             kwargs["generation_kwargs"]["options"] = options
         llm = OllamaLLM(
             model=model,
-            host=OLLAMA_BASE_URL_COMPLETION if is_completion else OLLAMA_BASE_URL,
+            host=base_urls["ollama"],
             tokenizer_id=tokenizer_id,
             use_magpie_template=use_magpie_template,
             structured_output=structured_output,
             **kwargs,
         )
-    elif HUGGINGFACE_BASE_URL:
+    elif base_urls["huggingface"]:
         kwargs["generation_kwargs"]["do_sample"] = True
         llm = InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
-            base_url=(
-                HUGGINGFACE_BASE_URL_COMPLETION
-                if is_completion
-                else HUGGINGFACE_BASE_URL
-            ),
+            base_url=base_urls["huggingface"],
             tokenizer_id=tokenizer_id,
             use_magpie_template=use_magpie_template,
             structured_output=structured_output,
             **kwargs,
         )
-    elif VLLM_BASE_URL:
+    elif base_urls["vllm"]:
         if "generation_kwargs" in kwargs:
             if "do_sample" in kwargs["generation_kwargs"]:
                 del kwargs["generation_kwargs"]["do_sample"]
         llm = ClientvLLM(
-            base_url=VLLM_BASE_URL_COMPLETION if is_completion else VLLM_BASE_URL,
+            base_url=base_urls["vllm"],
             model=model,
             tokenizer=tokenizer_id,
             api_key=_get_next_api_key(),
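
The refactor above replaces repeated `X_COMPLETION if is_completion else X` expressions with a single base_urls dict that every branch consults. A toy sketch of the same selection logic, reading environment variables directly instead of the module's constants (names illustrative):

import os

def pick_base_url(provider: str, is_completion: bool = False):
    # Prefer the completion-specific URL when is_completion is set.
    suffix = "_BASE_URL_COMPLETION" if is_completion else "_BASE_URL"
    return os.getenv(provider.upper() + suffix)

def pick_provider(is_completion: bool = False):
    # Same branch order as _get_llm: openai, then ollama, huggingface, vllm.
    for provider in ("openai", "ollama", "huggingface", "vllm"):
        url = pick_base_url(provider, is_completion)
        if url:
            return provider, url
    return None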
src/synthetic_dataset_generator/pipelines/textcat.py CHANGED
@@ -109,7 +109,7 @@ def get_labeller_generator(system_prompt: str, labels: List[str], multi_label: b
         "temperature": 0.01,
         "max_new_tokens": MAX_NUM_TOKENS,
     }
-    llm = _get_llm(is_completion=True, generation_kwargs=generation_kwargs)
+    llm = _get_llm(generation_kwargs=generation_kwargs)
     labeller_generator = TextClassification(
         llm=llm,
         context=system_prompt,
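
With `is_completion=True` removed, and assuming `_get_llm` defaults `is_completion` to False, the labeller now uses the instruction-generation model (`MODEL`) and its base URL rather than the completion model, per the `model = MODEL_COMPLETION if is_completion else MODEL` selection shown in pipelines/base.py above.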