Sara Han committed on
Commit
b2669f7
·
unverified ·
1 Parent(s): be0e284

feat: add seed data for chat data (#32)

Browse files

* add similar ui logic for seed data in chat

* add logic for seed generation

* remove todos

* apply feedback

* small fix

src/synthetic_dataset_generator/apps/base.py CHANGED
@@ -1,12 +1,16 @@
1
  import io
2
  import uuid
 
3
  from typing import Union
4
 
5
  import argilla as rg
6
  import gradio as gr
7
- from datasets import Dataset, concatenate_datasets, load_dataset
 
8
  from gradio import OAuthToken
9
  from huggingface_hub import HfApi, upload_file, repo_exists
 
 
10
 
11
  from synthetic_dataset_generator.constants import MAX_NUM_ROWS
12
  from synthetic_dataset_generator.utils import get_argilla_client
@@ -179,3 +183,81 @@ def get_iframe(hub_repo_id: str) -> str:
179
  ></iframe>
180
  """
181
  return iframe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import io
2
  import uuid
3
+ from tqdm import tqdm
4
  from typing import Union
5
 
6
  import argilla as rg
7
  import gradio as gr
8
+ import pandas as pd
9
+ from datasets import Dataset, concatenate_datasets, get_dataset_config_names, get_dataset_split_names, load_dataset
10
  from gradio import OAuthToken
11
  from huggingface_hub import HfApi, upload_file, repo_exists
12
+ from unstructured.chunking.title import chunk_by_title
13
+ from unstructured.partition.auto import partition
14
 
15
  from synthetic_dataset_generator.constants import MAX_NUM_ROWS
16
  from synthetic_dataset_generator.utils import get_argilla_client
 
183
  ></iframe>
184
  """
185
  return iframe
186
+
187
+
188
+ def _get_valid_columns(dataframe: pd.DataFrame):
189
+ doc_valid_columns = []
190
+
191
+ for col in dataframe.columns:
192
+ sample_val = dataframe[col].iloc[0]
193
+ if isinstance(sample_val, str):
194
+ doc_valid_columns.append(col)
195
+
196
+ return doc_valid_columns
197
+
198
+
199
def load_dataset_from_hub(
    repo_id: str,
    num_rows: int = 10,
    token: Union[OAuthToken, None] = None,
    progress=gr.Progress(track_tqdm=True),
):
    """Stream the first `num_rows` rows of a Hub dataset into a DataFrame.

    Loads the first split of the first config of `repo_id` in streaming
    mode, materializes at most `num_rows` rows, and returns the data
    together with a dropdown pre-populated with the string-typed columns
    usable as documents.

    Args:
        repo_id: Hugging Face Hub dataset repo id.
        num_rows: maximum number of rows to load.
        token: OAuth token forwarded to the Hub APIs.
        progress: Gradio progress tracker (tracks the tqdm bar).

    Returns:
        Tuple of (pandas.DataFrame, gr.Dropdown of valid document columns).

    Raises:
        gr.Error: if `repo_id` is empty.
    """
    if not repo_id:
        raise gr.Error("Please provide a Hub repo ID")
    subsets = get_dataset_config_names(repo_id, token=token)
    splits = get_dataset_split_names(repo_id, subsets[0], token=token)
    ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
    rows = []
    for row in tqdm(ds, desc="Loading the dataset", total=num_rows):
        # Check BEFORE appending: the previous append-then-break logic
        # collected num_rows + 1 rows (off-by-one).
        if len(rows) >= num_rows:
            break
        rows.append(row)
    ds = Dataset.from_list(rows)
    dataframe = ds.to_pandas()
    doc_valid_columns = _get_valid_columns(dataframe)
    col_doc = doc_valid_columns[0] if doc_valid_columns else ""
    return (
        dataframe,
        gr.Dropdown(
            choices=doc_valid_columns,
            label="Documents column",
            value=col_doc,
            interactive=(False if col_doc == "" else True),
            multiselect=False,
        ),
    )
229
+
230
+
231
def preprocess_input_data(
    file_paths: list[str], num_rows: int, progress=gr.Progress(track_tqdm=True)
):
    """Chunk uploaded files into text passages for seed-data generation.

    Each file is partitioned with `unstructured` and split into
    title-based chunks; processing stops once at least `num_rows`
    chunks have been collected across files.

    Args:
        file_paths: paths of the uploaded files to process.
        num_rows: target number of chunks to collect.
        progress: Gradio progress tracker (tracks the tqdm bar).

    Returns:
        Tuple of (DataFrame with "filename"/"chunks" columns,
        gr.Dropdown fixed to the "chunks" column).

    Raises:
        gr.Error: if no files were provided.
    """
    if not file_paths:
        raise gr.Error("Please provide an input file")

    chunks_per_file = {}
    collected = 0
    for path in tqdm(file_paths, desc="Processing files", total=len(file_paths)):
        elements = partition(filename=path)
        file_chunks = [str(chunk) for chunk in chunk_by_title(elements)]
        chunks_per_file[path] = file_chunks
        collected += len(file_chunks)
        # Enough material gathered; skip any remaining files.
        if collected >= num_rows:
            break

    records = [
        (filename, chunk)
        for filename, file_chunks in chunks_per_file.items()
        for chunk in file_chunks
    ]
    dataframe = pd.DataFrame.from_records(records, columns=["filename", "chunks"])
    col_doc = "chunks"

    return (
        dataframe,
        gr.Dropdown(
            choices=["chunks"],
            label="Documents column",
            value=col_doc,
            interactive=(False if col_doc == "" else True),
            multiselect=False,
        ),
    )
src/synthetic_dataset_generator/apps/chat.py CHANGED
@@ -1,4 +1,5 @@
1
  import ast
 
2
  import random
3
  import uuid
4
  from typing import Dict, List, Union
@@ -8,11 +9,15 @@ import gradio as gr
8
  import pandas as pd
9
  from datasets import Dataset
10
  from distilabel.distiset import Distiset
 
 
11
  from huggingface_hub import HfApi
12
 
13
  from synthetic_dataset_generator.apps.base import (
14
  combine_datasets,
15
  hide_success_message,
 
 
16
  push_pipeline_code_to_hub,
17
  show_success_message,
18
  test_max_num_rows,
@@ -29,15 +34,18 @@ from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
29
  from synthetic_dataset_generator.pipelines.chat import (
30
  DEFAULT_DATASET_DESCRIPTIONS,
31
  generate_pipeline_code,
 
32
  get_magpie_generator,
33
  get_prompt_generator,
34
  get_response_generator,
 
35
  )
36
  from synthetic_dataset_generator.pipelines.embeddings import (
37
  get_embeddings,
38
  get_sentence_embedding_dimensions,
39
  )
40
  from synthetic_dataset_generator.utils import (
 
41
  get_argilla_client,
42
  get_org_dropdown,
43
  get_random_repo_name,
@@ -45,6 +53,14 @@ from synthetic_dataset_generator.utils import (
45
  )
46
 
47
 
 
 
 
 
 
 
 
 
48
  def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
49
  def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
50
  return ast.literal_eval(
@@ -77,28 +93,57 @@ def generate_system_prompt(dataset_description: str, progress=gr.Progress()):
77
  return result
78
 
79
 
80
- def generate_sample_dataset(system_prompt: str, num_turns: int, progress=gr.Progress()):
81
- progress(0.1, desc="Generating sample dataset")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  dataframe = generate_dataset(
 
 
83
  system_prompt=system_prompt,
 
84
  num_turns=num_turns,
85
- num_rows=10,
86
- progress=progress,
87
  is_sample=True,
88
  )
89
  progress(1.0, desc="Sample dataset generated")
90
  return dataframe
91
 
92
 
93
- def _get_dataframe():
94
- return gr.Dataframe(
95
- headers=["prompt", "completion"],
96
- wrap=True,
97
- interactive=False,
98
- )
99
-
100
-
101
- def generate_dataset(
102
  system_prompt: str,
103
  num_turns: int = 1,
104
  num_rows: int = 10,
@@ -108,9 +153,7 @@ def generate_dataset(
108
  ) -> pd.DataFrame:
109
  num_rows = test_max_num_rows(num_rows)
110
  progress(0.0, desc="(1/2) Generating instructions")
111
- magpie_generator = get_magpie_generator(
112
- num_turns, temperature, is_sample
113
- )
114
  response_generator = get_response_generator(
115
  system_prompt, num_turns, temperature, is_sample
116
  )
@@ -217,6 +260,171 @@ def generate_dataset(
217
  return dataframe
218
 
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  def push_dataset_to_hub(
221
  dataframe: pd.DataFrame,
222
  org_name: str,
@@ -251,17 +459,35 @@ def push_dataset_to_hub(
251
  def push_dataset(
252
  org_name: str,
253
  repo_name: str,
 
 
 
 
254
  system_prompt: str,
 
255
  num_turns: int = 1,
256
  num_rows: int = 10,
257
- private: bool = False,
258
  temperature: float = 0.9,
259
  pipeline_code: str = "",
260
  oauth_token: Union[gr.OAuthToken, None] = None,
261
  progress=gr.Progress(),
262
  ) -> pd.DataFrame:
 
 
 
 
 
 
 
 
 
 
 
263
  dataframe = generate_dataset(
 
 
264
  system_prompt=system_prompt,
 
265
  num_turns=num_turns,
266
  num_rows=num_rows,
267
  temperature=temperature,
@@ -395,6 +621,28 @@ def push_dataset(
395
  return ""
396
 
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  def show_pipeline_code_visibility():
399
  return {pipeline_code_ui: gr.Accordion(visible=True)}
400
 
@@ -422,29 +670,85 @@ with gr.Blocks() as app:
422
  )
423
  )
424
  else:
425
- gr.Markdown(value="## 1. Describe the dataset you want")
426
- with gr.Row():
427
  with gr.Column(scale=2):
428
- dataset_description = gr.Textbox(
429
- label="Dataset description",
430
- placeholder="Give a precise description of your desired dataset.",
431
- )
432
- with gr.Row():
433
- clear_btn_part = gr.Button(
434
- "Clear",
435
- variant="secondary",
436
- )
437
- load_btn = gr.Button(
438
- "Create",
439
- variant="primary",
440
- )
441
- with gr.Column(scale=3):
442
- examples = gr.Examples(
443
- examples=DEFAULT_DATASET_DESCRIPTIONS,
444
- inputs=[dataset_description],
445
- cache_examples=False,
446
- label="Examples",
447
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
  gr.HTML(value="<hr>")
450
  gr.Markdown(value="## 2. Configure your dataset")
@@ -454,6 +758,16 @@ with gr.Blocks() as app:
454
  label="System prompt",
455
  placeholder="You are a helpful assistant.",
456
  )
 
 
 
 
 
 
 
 
 
 
457
  num_turns = gr.Number(
458
  value=1,
459
  label="Number of turns in the conversation",
@@ -519,7 +833,10 @@ with gr.Blocks() as app:
519
  visible=False,
520
  ) as pipeline_code_ui:
521
  code = generate_pipeline_code(
 
 
522
  system_prompt=system_prompt.value,
 
523
  num_turns=num_turns.value,
524
  num_rows=num_rows.value,
525
  )
@@ -529,77 +846,137 @@ with gr.Blocks() as app:
529
  label="Distilabel Pipeline Code",
530
  )
531
 
532
- load_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
  fn=generate_system_prompt,
534
  inputs=[dataset_description],
535
  outputs=[system_prompt],
536
- show_progress=True,
537
- ).then(
538
  fn=generate_sample_dataset,
539
- inputs=[system_prompt, num_turns],
540
- outputs=[dataframe],
541
- show_progress=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  )
543
 
544
  btn_apply_to_sample_dataset.click(
545
  fn=generate_sample_dataset,
546
- inputs=[system_prompt, num_turns],
547
- outputs=[dataframe],
548
- show_progress=True,
 
 
 
 
 
 
 
549
  )
550
 
551
  btn_push_to_hub.click(
552
  fn=validate_argilla_user_workspace_dataset,
553
  inputs=[repo_name],
554
  outputs=[success_message],
555
- show_progress=True,
556
  ).then(
557
  fn=validate_push_to_hub,
558
  inputs=[org_name, repo_name],
559
  outputs=[success_message],
560
- show_progress=True,
561
  ).success(
562
  fn=hide_success_message,
563
  outputs=[success_message],
564
- show_progress=True,
565
  ).success(
566
  fn=hide_pipeline_code_visibility,
567
  inputs=[],
568
  outputs=[pipeline_code_ui],
569
- show_progress=True,
570
  ).success(
571
  fn=push_dataset,
572
  inputs=[
573
  org_name,
574
  repo_name,
 
 
 
 
575
  system_prompt,
 
576
  num_turns,
577
  num_rows,
578
- private,
579
  temperature,
580
  pipeline_code,
581
  ],
582
  outputs=[success_message],
583
- show_progress=True,
584
  ).success(
585
  fn=show_success_message,
586
  inputs=[org_name, repo_name],
587
  outputs=[success_message],
588
  ).success(
589
  fn=generate_pipeline_code,
590
- inputs=[system_prompt, num_turns, num_rows],
 
 
 
 
 
 
 
591
  outputs=[pipeline_code],
592
  ).success(
593
  fn=show_pipeline_code_visibility,
594
  inputs=[],
595
  outputs=[pipeline_code_ui],
596
  )
597
- gr.on(
598
- triggers=[clear_btn_part.click, clear_btn_full.click],
599
- fn=lambda _: ("", "", 1, _get_dataframe()),
 
 
 
600
  inputs=[dataframe],
601
- outputs=[dataset_description, system_prompt, num_turns, dataframe],
602
  )
 
 
603
  app.load(fn=get_org_dropdown, outputs=[org_name])
604
  app.load(fn=get_random_repo_name, outputs=[repo_name])
605
- app.load(fn=swap_visibility, outputs=main_ui)
 
1
  import ast
2
+ import json
3
  import random
4
  import uuid
5
  from typing import Dict, List, Union
 
9
  import pandas as pd
10
  from datasets import Dataset
11
  from distilabel.distiset import Distiset
12
+ from gradio.oauth import OAuthToken
13
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
14
  from huggingface_hub import HfApi
15
 
16
  from synthetic_dataset_generator.apps.base import (
17
  combine_datasets,
18
  hide_success_message,
19
+ load_dataset_from_hub,
20
+ preprocess_input_data,
21
  push_pipeline_code_to_hub,
22
  show_success_message,
23
  test_max_num_rows,
 
34
  from synthetic_dataset_generator.pipelines.chat import (
35
  DEFAULT_DATASET_DESCRIPTIONS,
36
  generate_pipeline_code,
37
+ get_follow_up_generator,
38
  get_magpie_generator,
39
  get_prompt_generator,
40
  get_response_generator,
41
+ get_sentence_pair_generator,
42
  )
43
  from synthetic_dataset_generator.pipelines.embeddings import (
44
  get_embeddings,
45
  get_sentence_embedding_dimensions,
46
  )
47
  from synthetic_dataset_generator.utils import (
48
+ column_to_list,
49
  get_argilla_client,
50
  get_org_dropdown,
51
  get_random_repo_name,
 
53
  )
54
 
55
 
56
def _get_dataframe():
    """Build the empty, read-only prompt/completion dataframe component."""
    component = gr.Dataframe(
        headers=["prompt", "completion"],
        interactive=False,
        wrap=True,
    )
    return component
62
+
63
+
64
  def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
65
  def convert_to_list_of_dicts(messages: str) -> List[Dict[str, str]]:
66
  return ast.literal_eval(
 
93
  return result
94
 
95
 
96
def load_dataset_file(
    repo_id: str,
    file_paths: list[str],
    input_type: str,
    num_rows: int = 10,
    token: Union[OAuthToken, None] = None,
    progress=gr.Progress(),
):
    """Dispatch source-data loading to the Hub or local-file loader.

    "dataset-input" loads from the Hub repo; any other input type goes
    through the uploaded-file preprocessing path.
    """
    progress(0.1, desc="Loading the source data")
    if input_type == "dataset-input":
        return load_dataset_from_hub(repo_id=repo_id, num_rows=num_rows, token=token)
    return preprocess_input_data(file_paths=file_paths, num_rows=num_rows)
109
+
110
+
111
def generate_sample_dataset(
    repo_id: str,
    file_paths: list[str],
    input_type: str,
    system_prompt: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
    oauth_token: Union[OAuthToken, None],
    progress=gr.Progress(),
):
    """Produce a preview dataset for the configured input source.

    For "prompt-input" an empty prompt/completion frame seeds the run;
    otherwise the source data (Hub dataset or uploaded files) is loaded
    first, then passed through the generation pipeline with
    ``is_sample=True``.
    """
    if input_type != "prompt-input":
        seed_dataframe, _ = load_dataset_file(
            repo_id=repo_id,
            file_paths=file_paths,
            input_type=input_type,
            num_rows=num_rows,
            token=oauth_token,
        )
    else:
        seed_dataframe = pd.DataFrame(columns=["prompt", "completion"])
    progress(0.5, desc="Generating sample dataset")
    sample = generate_dataset(
        input_type=input_type,
        dataframe=seed_dataframe,
        system_prompt=system_prompt,
        document_column=document_column,
        num_turns=num_turns,
        num_rows=num_rows,
        is_sample=True,
    )
    progress(1.0, desc="Sample dataset generated")
    return sample
144
 
145
 
146
+ def generate_dataset_from_prompt(
 
 
 
 
 
 
 
 
147
  system_prompt: str,
148
  num_turns: int = 1,
149
  num_rows: int = 10,
 
153
  ) -> pd.DataFrame:
154
  num_rows = test_max_num_rows(num_rows)
155
  progress(0.0, desc="(1/2) Generating instructions")
156
+ magpie_generator = get_magpie_generator(num_turns, temperature, is_sample)
 
 
157
  response_generator = get_response_generator(
158
  system_prompt, num_turns, temperature, is_sample
159
  )
 
260
  return dataframe
261
 
262
 
263
def generate_dataset_from_seed(
    dataframe: pd.DataFrame,
    document_column: str,
    num_turns: int = 1,
    num_rows: int = 10,
    temperature: float = 0.9,
    is_sample: bool = False,
    progress=gr.Progress(),
) -> pd.DataFrame:
    """Generate a chat dataset seeded from documents in `dataframe`.

    Pipeline: documents -> question generation (sentence-pair generator)
    -> answer generation -> optional multi-turn follow-ups. For a single
    turn the result has "prompt"/"completion" columns; for multiple turns
    a "messages" column with JSON-serialized conversations.

    Args:
        dataframe: seed data holding the documents.
        document_column: column of `dataframe` to read documents from.
        num_turns: conversation turns; > 1 enables follow-up generation.
        num_rows: number of rows to produce (capped by test_max_num_rows).
        temperature: sampling temperature for all generators.
        is_sample: sample-mode flag forwarded to the generator factories.
        progress: Gradio progress tracker.

    Returns:
        pandas.DataFrame with the generated dataset.
    """
    num_rows = test_max_num_rows(num_rows)
    progress(0.0, desc="Initializing dataset generation")
    document_data = column_to_list(dataframe, document_column)
    # Oversample (with replacement) when there are fewer documents than
    # requested rows so every row has a seed document.
    if len(document_data) < num_rows:
        document_data += random.choices(document_data, k=num_rows - len(document_data))
    instruction_generator = get_sentence_pair_generator(
        temperature=temperature, is_sample=is_sample
    )
    response_generator = get_response_generator(
        system_prompt=None, num_turns=1, temperature=temperature, is_sample=is_sample
    )
    follow_up_generator_instruction = get_follow_up_generator(
        type="instruction", temperature=temperature, is_sample=is_sample
    )
    follow_up_generator_response = get_follow_up_generator(
        type="response", temperature=temperature, is_sample=is_sample
    )
    # Progress bookkeeping: two generation passes per turn.
    steps = 2 * num_turns
    total_steps: int = num_rows * steps
    step_progress = round(1 / steps, 2)
    batch_size = DEFAULT_BATCH_SIZE

    # create instructions
    n_processed = 0
    instruction_results = []
    while n_processed < num_rows:
        progress(
            step_progress * n_processed / num_rows,
            total=total_steps,
            desc="Generating questions",
        )
        remaining_rows = num_rows - n_processed
        batch_size = min(batch_size, remaining_rows)
        # The sentence-pair generator expects each input as {"anchor": doc}.
        batch = [
            {"anchor": document}
            for document in document_data[n_processed : n_processed + batch_size]
        ]
        questions = list(instruction_generator.process(inputs=batch))
        instruction_results.extend(questions[0])
        n_processed += batch_size
    # Expose the generated question under both keys downstream consumers use.
    for result in instruction_results:
        result["instruction"] = result["positive"]
        result["prompt"] = result.pop("positive")

    progress(step_progress, desc="Generating instructions")

    # generate responses
    n_processed = 0
    response_results = []
    while n_processed < num_rows:
        progress(
            step_progress + step_progress * n_processed / num_rows,
            total=total_steps,
            desc="Generating responses",
        )
        batch = instruction_results[n_processed : n_processed + batch_size]
        responses = list(response_generator.process(inputs=batch))
        response_results.extend(responses[0])
        n_processed += batch_size
    for result in response_results:
        result["completion"] = result.pop("generation")

    # generate follow-ups
    if num_turns > 1:
        n_processed = 0
        final_conversations = []

        while n_processed < num_rows:
            progress(
                step_progress + step_progress * n_processed / num_rows,
                total=total_steps,
                desc="Generating follow-ups",
            )
            batch = response_results[n_processed : n_processed + batch_size]
            # Seed each conversation with the first user/assistant exchange.
            conversations_batch = [
                {
                    "messages": [
                        {"role": "user", "content": result["prompt"]},
                        {"role": "assistant", "content": result["completion"]},
                    ]
                }
                for result in batch
            ]

            # Alternate follow-up question / follow-up answer per extra turn.
            for _ in range(num_turns - 1):
                follow_up_instructions = list(
                    follow_up_generator_instruction.process(inputs=conversations_batch)
                )
                for conv, follow_up in zip(conversations_batch, follow_up_instructions[0]):
                    conv["messages"].append(
                        {"role": "user", "content": follow_up["generation"]}
                    )

                follow_up_responses = list(
                    follow_up_generator_response.process(inputs=conversations_batch)
                )
                for conv, follow_up in zip(conversations_batch, follow_up_responses[0]):
                    conv["messages"].append(
                        {"role": "assistant", "content": follow_up["generation"]}
                    )

            final_conversations.extend(
                [{"messages": conv["messages"]} for conv in conversations_batch]
            )
            n_processed += batch_size

    # create distiset
    distiset_results = []
    if num_turns == 1:
        # Single-turn output: keep only the prompt/completion pair.
        for result in response_results:
            record = {}
            for relevant_keys in ["prompt", "completion"]:
                if relevant_keys in result:
                    record[relevant_keys] = result[relevant_keys]
            distiset_results.append(record)
        dataframe = pd.DataFrame(distiset_results)
    else:
        distiset_results = final_conversations
        dataframe = pd.DataFrame(distiset_results)
        # Serialize message lists so the frame is displayable/pushable.
        dataframe["messages"] = dataframe["messages"].apply(lambda x: json.dumps(x))

    progress(1.0, desc="Dataset generation completed")
    return dataframe
395
+
396
+
397
def generate_dataset(
    input_type: str,
    dataframe: pd.DataFrame,
    system_prompt: str,
    document_column: str,
    num_turns: int = 1,
    num_rows: int = 10,
    temperature: float = 0.9,
    is_sample: bool = False,
    progress=gr.Progress(),
) -> pd.DataFrame:
    """Route generation to the prompt-based or seed-based pipeline.

    "prompt-input" uses only the system prompt; any other input type
    generates from the documents in `dataframe[document_column]`.
    """
    if input_type == "prompt-input":
        return generate_dataset_from_prompt(
            system_prompt=system_prompt,
            num_turns=num_turns,
            num_rows=num_rows,
            temperature=temperature,
            is_sample=is_sample,
        )
    return generate_dataset_from_seed(
        dataframe=dataframe,
        document_column=document_column,
        num_turns=num_turns,
        num_rows=num_rows,
        temperature=temperature,
        is_sample=is_sample,
    )
426
+
427
+
428
  def push_dataset_to_hub(
429
  dataframe: pd.DataFrame,
430
  org_name: str,
 
459
  def push_dataset(
460
  org_name: str,
461
  repo_name: str,
462
+ private: bool,
463
+ original_repo_id: str,
464
+ file_paths: list[str],
465
+ input_type: str,
466
  system_prompt: str,
467
+ document_column: str,
468
  num_turns: int = 1,
469
  num_rows: int = 10,
 
470
  temperature: float = 0.9,
471
  pipeline_code: str = "",
472
  oauth_token: Union[gr.OAuthToken, None] = None,
473
  progress=gr.Progress(),
474
  ) -> pd.DataFrame:
475
+ if input_type == "prompt-input":
476
+ dataframe = _get_dataframe()
477
+ else:
478
+ dataframe, _ = load_dataset_file(
479
+ repo_id=original_repo_id,
480
+ file_paths=file_paths,
481
+ input_type=input_type,
482
+ num_rows=num_rows,
483
+ token=oauth_token,
484
+ )
485
+ progress(0.5, desc="Generating dataset")
486
  dataframe = generate_dataset(
487
+ input_type=input_type,
488
+ dataframe=dataframe,
489
  system_prompt=system_prompt,
490
+ document_column=document_column,
491
  num_turns=num_turns,
492
  num_rows=num_rows,
493
  temperature=temperature,
 
621
  return ""
622
 
623
 
624
def show_system_prompt_visibility():
    """Make the system-prompt textbox visible (prompt-input mode)."""
    return {system_prompt: gr.Textbox(visible=True)}


def hide_system_prompt_visibility():
    """Hide the system-prompt textbox (dataset/file-input modes)."""
    return {system_prompt: gr.Textbox(visible=False)}


def show_document_column_visibility():
    """Show the document-column dropdown (dataset/file-input modes)."""
    return {document_column: gr.Dropdown(visible=True)}


def hide_document_column_visibility():
    """Hide the document-column dropdown and reset it to its placeholder."""
    return {
        document_column: gr.Dropdown(
            choices=["Load your data first in step 1."],
            value="Load your data first in step 1.",
            visible=False,
        )
    }
644
+
645
+
646
  def show_pipeline_code_visibility():
647
  return {pipeline_code_ui: gr.Accordion(visible=True)}
648
 
 
670
  )
671
  )
672
  else:
673
+ gr.Markdown("## 1. Select your input")
674
+ with gr.Row(equal_height=False):
675
  with gr.Column(scale=2):
676
+ input_type = gr.Dropdown(
677
+ label="Input type",
678
+ choices=["prompt-input", "dataset-input", "file-input"],
679
+ value="prompt-input",
680
+ multiselect=False,
681
+ visible=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  )
683
+ with gr.Tab("Generate from prompt") as tab_prompt_input:
684
+ with gr.Row(equal_height=False):
685
+ with gr.Column(scale=2):
686
+ dataset_description = gr.Textbox(
687
+ label="Dataset description",
688
+ placeholder="Give a precise description of your desired dataset.",
689
+ )
690
+ with gr.Row():
691
+ clear_prompt_btn_part = gr.Button(
692
+ "Clear", variant="secondary"
693
+ )
694
+ load_prompt_btn = gr.Button(
695
+ "Create", variant="primary"
696
+ )
697
+ with gr.Column(scale=3):
698
+ examples = gr.Examples(
699
+ examples=DEFAULT_DATASET_DESCRIPTIONS,
700
+ inputs=[dataset_description],
701
+ cache_examples=False,
702
+ label="Examples",
703
+ )
704
+ with gr.Tab("Load from Hub") as tab_dataset_input:
705
+ with gr.Row(equal_height=False):
706
+ with gr.Column(scale=2):
707
+ search_in = HuggingfaceHubSearch(
708
+ label="Search",
709
+ placeholder="Search for a dataset",
710
+ search_type="dataset",
711
+ sumbit_on_select=True,
712
+ )
713
+ with gr.Row():
714
+ clear_dataset_btn_part = gr.Button(
715
+ "Clear", variant="secondary"
716
+ )
717
+ load_dataset_btn = gr.Button(
718
+ "Load", variant="primary"
719
+ )
720
+ with gr.Column(scale=3):
721
+ examples = gr.Examples(
722
+ examples=[
723
+ "charris/wikipedia_sample",
724
+ "plaguss/argilla_sdk_docs_raw_unstructured",
725
+ "BeIR/hotpotqa-generated-queries",
726
+ ],
727
+ label="Example datasets",
728
+ fn=lambda x: x,
729
+ inputs=[search_in],
730
+ run_on_click=True,
731
+ )
732
+ search_out = gr.HTML(
733
+ label="Dataset preview", visible=False
734
+ )
735
+ with gr.Tab("Load your file") as tab_file_input:
736
+ with gr.Row(equal_height=False):
737
+ with gr.Column(scale=2):
738
+ file_in = gr.File(
739
+ label="Upload your file. Supported formats: .md, .txt, .docx, .pdf",
740
+ file_count="multiple",
741
+ file_types=[".md", ".txt", ".docx", ".pdf"],
742
+ )
743
+ with gr.Row():
744
+ clear_file_btn_part = gr.Button(
745
+ "Clear", variant="secondary"
746
+ )
747
+ load_file_btn = gr.Button("Load", variant="primary")
748
+ with gr.Column(scale=3):
749
+ file_out = gr.HTML(
750
+ label="Dataset preview", visible=False
751
+ )
752
 
753
  gr.HTML(value="<hr>")
754
  gr.Markdown(value="## 2. Configure your dataset")
 
758
  label="System prompt",
759
  placeholder="You are a helpful assistant.",
760
  )
761
+ document_column = gr.Dropdown(
762
+ label="Document Column",
763
+ info="Select the document column to generate the RAG dataset",
764
+ choices=["Load your data first in step 1."],
765
+ value="Load your data first in step 1.",
766
+ interactive=False,
767
+ multiselect=False,
768
+ allow_custom_value=False,
769
+ visible=False,
770
+ )
771
  num_turns = gr.Number(
772
  value=1,
773
  label="Number of turns in the conversation",
 
833
  visible=False,
834
  ) as pipeline_code_ui:
835
  code = generate_pipeline_code(
836
+ repo_id=search_in.value,
837
+ input_type=input_type.value,
838
  system_prompt=system_prompt.value,
839
+ document_column=document_column.value,
840
  num_turns=num_turns.value,
841
  num_rows=num_rows.value,
842
  )
 
846
  label="Distilabel Pipeline Code",
847
  )
848
 
849
+ tab_prompt_input.select(
850
+ fn=lambda: "prompt-input",
851
+ inputs=[],
852
+ outputs=[input_type],
853
+ ).then(fn=show_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
854
+ fn=hide_document_column_visibility, inputs=[], outputs=[document_column]
855
+ )
856
+
857
+ tab_dataset_input.select(
858
+ fn=lambda: "dataset-input",
859
+ inputs=[],
860
+ outputs=[input_type],
861
+ ).then(fn=hide_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
862
+ fn=show_document_column_visibility, inputs=[], outputs=[document_column]
863
+ )
864
+
865
+ tab_file_input.select(
866
+ fn=lambda: "file-input",
867
+ inputs=[],
868
+ outputs=[input_type],
869
+ ).then(fn=hide_system_prompt_visibility, inputs=[], outputs=[system_prompt]).then(
870
+ fn=show_document_column_visibility, inputs=[], outputs=[document_column]
871
+ )
872
+
873
+ search_in.submit(
874
+ fn=lambda df: pd.DataFrame(columns=df.columns),
875
+ inputs=[dataframe],
876
+ outputs=[dataframe],
877
+ )
878
+
879
+ load_prompt_btn.click(
880
  fn=generate_system_prompt,
881
  inputs=[dataset_description],
882
  outputs=[system_prompt],
883
+ ).success(
 
884
  fn=generate_sample_dataset,
885
+ inputs=[
886
+ search_in,
887
+ file_in,
888
+ input_type,
889
+ system_prompt,
890
+ document_column,
891
+ num_turns,
892
+ num_rows,
893
+ ],
894
+ outputs=dataframe,
895
+ )
896
+
897
+ gr.on(
898
+ triggers=[load_dataset_btn.click, load_file_btn.click],
899
+ fn=load_dataset_file,
900
+ inputs=[search_in, file_in, input_type],
901
+ outputs=[dataframe, document_column],
902
  )
903
 
904
  btn_apply_to_sample_dataset.click(
905
  fn=generate_sample_dataset,
906
+ inputs=[
907
+ search_in,
908
+ file_in,
909
+ input_type,
910
+ system_prompt,
911
+ document_column,
912
+ num_turns,
913
+ num_rows,
914
+ ],
915
+ outputs=dataframe,
916
  )
917
 
918
  btn_push_to_hub.click(
919
  fn=validate_argilla_user_workspace_dataset,
920
  inputs=[repo_name],
921
  outputs=[success_message],
 
922
  ).then(
923
  fn=validate_push_to_hub,
924
  inputs=[org_name, repo_name],
925
  outputs=[success_message],
 
926
  ).success(
927
  fn=hide_success_message,
928
  outputs=[success_message],
 
929
  ).success(
930
  fn=hide_pipeline_code_visibility,
931
  inputs=[],
932
  outputs=[pipeline_code_ui],
 
933
  ).success(
934
  fn=push_dataset,
935
  inputs=[
936
  org_name,
937
  repo_name,
938
+ private,
939
+ search_in,
940
+ file_in,
941
+ input_type,
942
  system_prompt,
943
+ document_column,
944
  num_turns,
945
  num_rows,
 
946
  temperature,
947
  pipeline_code,
948
  ],
949
  outputs=[success_message],
 
950
  ).success(
951
  fn=show_success_message,
952
  inputs=[org_name, repo_name],
953
  outputs=[success_message],
954
  ).success(
955
  fn=generate_pipeline_code,
956
+ inputs=[
957
+ search_in,
958
+ input_type,
959
+ system_prompt,
960
+ document_column,
961
+ num_turns,
962
+ num_rows,
963
+ ],
964
  outputs=[pipeline_code],
965
  ).success(
966
  fn=show_pipeline_code_visibility,
967
  inputs=[],
968
  outputs=[pipeline_code_ui],
969
  )
970
+
971
+ clear_dataset_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
972
+ clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
973
+ clear_prompt_btn_part.click(fn=lambda: "", inputs=[], outputs=[dataset_description])
974
+ clear_btn_full.click(
975
+ fn=lambda df: ("", "", [], _get_dataframe()),
976
  inputs=[dataframe],
977
+ outputs=[system_prompt, document_column, num_turns, dataframe],
978
  )
979
+
980
+ app.load(fn=swap_visibility, outputs=main_ui)
981
  app.load(fn=get_org_dropdown, outputs=[org_name])
982
  app.load(fn=get_random_repo_name, outputs=[repo_name])
 
src/synthetic_dataset_generator/apps/eval.py CHANGED
@@ -15,7 +15,7 @@ from datasets import (
15
  from distilabel.distiset import Distiset
16
  from gradio.oauth import OAuthToken #
17
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
18
- from huggingface_hub import HfApi, repo_exists
19
 
20
  from synthetic_dataset_generator.apps.base import (
21
  combine_datasets,
@@ -130,9 +130,9 @@ def load_dataset_from_hub(
130
  choices=response_valid_columns,
131
  label="Response column",
132
  value=col_response,
133
- interactive=False
134
- if col_response == "No valid response columns found."
135
- else True,
136
  ),
137
  prompt_template,
138
  structured_output,
@@ -831,16 +831,13 @@ with gr.Blocks() as app:
831
  fn=validate_argilla_user_workspace_dataset,
832
  inputs=[repo_name],
833
  outputs=[success_message],
834
- show_progress=True,
835
  ).then(
836
  fn=validate_push_to_hub,
837
  inputs=[org_name, repo_name],
838
  outputs=[success_message],
839
- show_progress=True,
840
  ).success(
841
  fn=hide_success_message,
842
  outputs=[success_message],
843
- show_progress=True,
844
  ).success(
845
  fn=hide_pipeline_code_visibility,
846
  inputs=[],
@@ -862,7 +859,6 @@ with gr.Blocks() as app:
862
  pipeline_code,
863
  ],
864
  outputs=[success_message],
865
- show_progress=True,
866
  ).success(
867
  fn=show_success_message,
868
  inputs=[org_name, repo_name],
@@ -882,14 +878,14 @@ with gr.Blocks() as app:
882
  outputs=[pipeline_code_ui],
883
  )
884
 
885
- clear_btn_part.click(fn=lambda : "", inputs=[], outputs=[search_in])
886
  clear_btn_full.click(
887
  fn=lambda df: ("", "", pd.DataFrame(columns=df.columns)),
888
  inputs=[dataframe],
889
  outputs=[
890
  instruction_instruction_response,
891
  response_instruction_response,
892
- dataframe
893
  ],
894
  )
895
 
 
15
  from distilabel.distiset import Distiset
16
  from gradio.oauth import OAuthToken #
17
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
18
+ from huggingface_hub import HfApi
19
 
20
  from synthetic_dataset_generator.apps.base import (
21
  combine_datasets,
 
130
  choices=response_valid_columns,
131
  label="Response column",
132
  value=col_response,
133
+ interactive=(
134
+ False if col_response == "No valid response columns found." else True
135
+ ),
136
  ),
137
  prompt_template,
138
  structured_output,
 
831
  fn=validate_argilla_user_workspace_dataset,
832
  inputs=[repo_name],
833
  outputs=[success_message],
 
834
  ).then(
835
  fn=validate_push_to_hub,
836
  inputs=[org_name, repo_name],
837
  outputs=[success_message],
 
838
  ).success(
839
  fn=hide_success_message,
840
  outputs=[success_message],
 
841
  ).success(
842
  fn=hide_pipeline_code_visibility,
843
  inputs=[],
 
859
  pipeline_code,
860
  ],
861
  outputs=[success_message],
 
862
  ).success(
863
  fn=show_success_message,
864
  inputs=[org_name, repo_name],
 
878
  outputs=[pipeline_code_ui],
879
  )
880
 
881
+ clear_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
882
  clear_btn_full.click(
883
  fn=lambda df: ("", "", pd.DataFrame(columns=df.columns)),
884
  inputs=[dataframe],
885
  outputs=[
886
  instruction_instruction_response,
887
  response_instruction_response,
888
+ dataframe,
889
  ],
890
  )
891
 
src/synthetic_dataset_generator/apps/rag.py CHANGED
@@ -1,30 +1,23 @@
1
  import os
2
  import random
3
  import uuid
4
- from tqdm import tqdm
5
  from typing import Union
6
 
7
  import argilla as rg
8
  import gradio as gr
9
  import nltk
10
  import pandas as pd
11
- from datasets import (
12
- Dataset,
13
- get_dataset_config_names,
14
- get_dataset_split_names,
15
- load_dataset,
16
- )
17
  from distilabel.distiset import Distiset
18
  from gradio.oauth import OAuthToken
19
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
20
  from huggingface_hub import HfApi
21
- from unstructured.chunking.title import chunk_by_title
22
- from unstructured.partition.auto import partition
23
 
24
  from synthetic_dataset_generator.apps.base import (
25
  combine_datasets,
26
- get_iframe,
27
  hide_success_message,
 
 
28
  push_pipeline_code_to_hub,
29
  show_success_message,
30
  test_max_num_rows,
@@ -39,11 +32,11 @@ from synthetic_dataset_generator.pipelines.embeddings import (
39
  )
40
  from synthetic_dataset_generator.pipelines.rag import (
41
  DEFAULT_DATASET_DESCRIPTIONS,
 
42
  get_chunks_generator,
43
  get_prompt_generator,
44
- generate_pipeline_code,
45
- get_sentence_pair_generator,
46
  get_response_generator,
 
47
  )
48
  from synthetic_dataset_generator.utils import (
49
  column_to_list,
@@ -58,81 +51,6 @@ nltk.data.path.append("./nltk_data")
58
  nltk.download("punkt_tab", download_dir="./nltk_data")
59
  nltk.download("averaged_perceptron_tagger_eng", download_dir="./nltk_data")
60
 
61
- def _get_valid_columns(dataframe: pd.DataFrame):
62
- doc_valid_columns = []
63
-
64
- for col in dataframe.columns:
65
- sample_val = dataframe[col].iloc[0]
66
- if isinstance(sample_val, str):
67
- doc_valid_columns.append(col)
68
-
69
- return doc_valid_columns
70
-
71
-
72
- def _load_dataset_from_hub(
73
- repo_id: str,
74
- num_rows: int = 10,
75
- token: Union[OAuthToken, None] = None,
76
- progress=gr.Progress(track_tqdm=True),
77
- ):
78
- if not repo_id:
79
- raise gr.Error("Please provide a Hub repo ID")
80
- subsets = get_dataset_config_names(repo_id, token=token)
81
- splits = get_dataset_split_names(repo_id, subsets[0], token=token)
82
- ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
83
- rows = []
84
- for idx, row in enumerate(tqdm(ds, desc="Loading the dataset", total=num_rows)):
85
- rows.append(row)
86
- if idx == num_rows:
87
- break
88
- ds = Dataset.from_list(rows)
89
- dataframe = ds.to_pandas()
90
- doc_valid_columns = _get_valid_columns(dataframe)
91
- col_doc = doc_valid_columns[0] if doc_valid_columns else ""
92
- return (
93
- dataframe,
94
- gr.Dropdown(
95
- choices=doc_valid_columns,
96
- label="Documents column",
97
- value=col_doc,
98
- interactive=(False if col_doc == "" else True),
99
- multiselect=False,
100
- ),
101
- )
102
-
103
-
104
- def _preprocess_input_data(file_paths: list[str], num_rows: int, progress=gr.Progress(track_tqdm=True)):
105
- if not file_paths:
106
- raise gr.Error("Please provide an input file")
107
-
108
- data = {}
109
- total_chunks = 0
110
-
111
- for file_path in tqdm(file_paths, desc="Processing files", total=len(file_paths)):
112
- partitioned_file = partition(filename=file_path)
113
- chunks = [str(chunk) for chunk in chunk_by_title(partitioned_file)]
114
- data[file_path] = chunks
115
- total_chunks += len(chunks)
116
- if total_chunks >= num_rows:
117
- break
118
-
119
- dataframe = pd.DataFrame.from_records(
120
- [(k, v) for k, values in data.items() for v in values],
121
- columns=["filename", "chunks"],
122
- )
123
- col_doc = "chunks"
124
-
125
- return (
126
- dataframe,
127
- gr.Dropdown(
128
- choices=["chunks"],
129
- label="Documents column",
130
- value=col_doc,
131
- interactive=(False if col_doc == "" else True),
132
- multiselect=False,
133
- ),
134
- )
135
-
136
 
137
  def generate_system_prompt(dataset_description: str, progress=gr.Progress()):
138
  progress(0.1, desc="Initializing")
@@ -161,9 +79,48 @@ def load_dataset_file(
161
  ):
162
  progress(0.1, desc="Loading the source data")
163
  if input_type == "dataset-input":
164
- return _load_dataset_from_hub(repo_id, num_rows, token)
165
  else:
166
- return _preprocess_input_data(file_paths, num_rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
 
169
  def generate_dataset(
@@ -323,44 +280,6 @@ def generate_dataset(
323
  return dataframe
324
 
325
 
326
- def generate_sample_dataset(
327
- repo_id: str,
328
- file_paths: list[str],
329
- input_type: str,
330
- system_prompt: str,
331
- document_column: str,
332
- retrieval_reranking: list[str],
333
- num_rows: str,
334
- oauth_token: Union[OAuthToken, None],
335
- progress=gr.Progress(),
336
- ):
337
- retrieval = "Retrieval" in retrieval_reranking
338
- reranking = "Reranking" in retrieval_reranking
339
-
340
- if input_type == "prompt-input":
341
- dataframe = pd.DataFrame(columns=["context", "question", "response"])
342
- else:
343
- dataframe, _ = load_dataset_file(
344
- repo_id=repo_id,
345
- file_paths=file_paths,
346
- input_type=input_type,
347
- num_rows=num_rows,
348
- token=oauth_token,
349
- )
350
- progress(0.5, desc="Generating dataset")
351
- dataframe = generate_dataset(
352
- input_type=input_type,
353
- dataframe=dataframe,
354
- system_prompt=system_prompt,
355
- document_column=document_column,
356
- retrieval=retrieval,
357
- reranking=reranking,
358
- num_rows=10,
359
- is_sample=True,
360
- )
361
- return dataframe
362
-
363
-
364
  def push_dataset_to_hub(
365
  dataframe: pd.DataFrame,
366
  org_name: str,
@@ -428,15 +347,12 @@ def push_dataset(
428
  reranking=reranking,
429
  num_rows=num_rows,
430
  temperature=temperature,
431
- is_sample=True,
432
  )
433
  push_dataset_to_hub(
434
  dataframe, org_name, repo_name, oauth_token, private, pipeline_code
435
  )
436
  dataframe = dataframe[
437
- dataframe.applymap(
438
- lambda x: str(x).strip() if pd.notna(x) else x
439
- ).apply(
440
  lambda row: row.notna().all() and (row != "").all(), axis=1
441
  )
442
  ]
@@ -677,7 +593,7 @@ with gr.Blocks() as app:
677
 
678
  gr.HTML(value="<hr>")
679
  gr.Markdown(value="## 2. Configure your task")
680
- with gr.Row(equal_height=True):
681
  with gr.Column(scale=2):
682
  system_prompt = gr.Textbox(
683
  label="System prompt",
@@ -701,9 +617,7 @@ with gr.Blocks() as app:
701
  )
702
  with gr.Row():
703
  clear_btn_full = gr.Button("Clear", variant="secondary")
704
- btn_apply_to_sample_dataset = gr.Button(
705
- "Save", variant="primary"
706
- )
707
  with gr.Column(scale=3):
708
  dataframe = gr.Dataframe(
709
  headers=["context", "question", "response"],
@@ -791,35 +705,23 @@ with gr.Blocks() as app:
791
  fn=hide_document_column_visibility, inputs=[], outputs=[document_column]
792
  )
793
 
794
- search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out).then(
795
  fn=lambda df: pd.DataFrame(columns=df.columns),
796
  inputs=[dataframe],
797
  outputs=[dataframe],
798
  )
799
 
800
- load_dataset_btn.click(
 
801
  fn=load_dataset_file,
802
  inputs=[search_in, file_in, input_type],
803
- outputs=[
804
- dataframe,
805
- document_column,
806
- ],
807
- )
808
-
809
- load_file_btn.click(
810
- fn=load_dataset_file,
811
- inputs=[search_in, file_in, input_type],
812
- outputs=[
813
- dataframe,
814
- document_column,
815
- ],
816
  )
817
 
818
  load_prompt_btn.click(
819
  fn=generate_system_prompt,
820
  inputs=[dataset_description],
821
  outputs=[system_prompt],
822
- show_progress=True,
823
  ).success(
824
  fn=generate_sample_dataset,
825
  inputs=[
@@ -852,16 +754,13 @@ with gr.Blocks() as app:
852
  fn=validate_argilla_user_workspace_dataset,
853
  inputs=[repo_name],
854
  outputs=[success_message],
855
- show_progress=True,
856
  ).then(
857
  fn=validate_push_to_hub,
858
  inputs=[org_name, repo_name],
859
  outputs=[success_message],
860
- show_progress=True,
861
  ).success(
862
  fn=hide_success_message,
863
  outputs=[success_message],
864
- show_progress=True,
865
  ).success(
866
  fn=hide_pipeline_code_visibility,
867
  inputs=[],
@@ -883,7 +782,6 @@ with gr.Blocks() as app:
883
  pipeline_code,
884
  ],
885
  outputs=[success_message],
886
- show_progress=True,
887
  ).success(
888
  fn=show_success_message,
889
  inputs=[org_name, repo_name],
@@ -905,11 +803,9 @@ with gr.Blocks() as app:
905
  outputs=[pipeline_code_ui],
906
  )
907
 
908
- clear_dataset_btn_part.click(fn=lambda : "", inputs=[], outputs=[search_in])
909
  clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
910
- clear_prompt_btn_part.click(
911
- fn=lambda : "", inputs=[], outputs=[dataset_description]
912
- )
913
  clear_btn_full.click(
914
  fn=lambda df: ("", [], pd.DataFrame(columns=df.columns)),
915
  inputs=[dataframe],
 
1
  import os
2
  import random
3
  import uuid
 
4
  from typing import Union
5
 
6
  import argilla as rg
7
  import gradio as gr
8
  import nltk
9
  import pandas as pd
10
+ from datasets import Dataset
 
 
 
 
 
11
  from distilabel.distiset import Distiset
12
  from gradio.oauth import OAuthToken
13
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
14
  from huggingface_hub import HfApi
 
 
15
 
16
  from synthetic_dataset_generator.apps.base import (
17
  combine_datasets,
 
18
  hide_success_message,
19
+ load_dataset_from_hub,
20
+ preprocess_input_data,
21
  push_pipeline_code_to_hub,
22
  show_success_message,
23
  test_max_num_rows,
 
32
  )
33
  from synthetic_dataset_generator.pipelines.rag import (
34
  DEFAULT_DATASET_DESCRIPTIONS,
35
+ generate_pipeline_code,
36
  get_chunks_generator,
37
  get_prompt_generator,
 
 
38
  get_response_generator,
39
+ get_sentence_pair_generator,
40
  )
41
  from synthetic_dataset_generator.utils import (
42
  column_to_list,
 
51
  nltk.download("punkt_tab", download_dir="./nltk_data")
52
  nltk.download("averaged_perceptron_tagger_eng", download_dir="./nltk_data")
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def generate_system_prompt(dataset_description: str, progress=gr.Progress()):
56
  progress(0.1, desc="Initializing")
 
79
  ):
80
  progress(0.1, desc="Loading the source data")
81
  if input_type == "dataset-input":
82
+ return load_dataset_from_hub(repo_id=repo_id, num_rows=num_rows, token=token)
83
  else:
84
+ return preprocess_input_data(file_paths=file_paths, num_rows=num_rows)
85
+
86
+
87
def generate_sample_dataset(
    repo_id: str,
    file_paths: list[str],
    input_type: str,
    system_prompt: str,
    document_column: str,
    retrieval_reranking: list[str],
    num_rows: int,  # fixed: was annotated `str`, but it is used as a row count
    oauth_token: Union[OAuthToken, None],
    progress=gr.Progress(),
):
    """Generate a small sample dataset for previewing in the UI.

    Depending on ``input_type`` the seed data is either an empty frame
    (prompt input) or loaded from a Hub dataset / uploaded files via
    ``load_dataset_file``.

    Args:
        repo_id: Hub dataset repo id (used for dataset input).
        file_paths: local files to chunk (used for file input).
        input_type: "prompt-input", "dataset-input", or file input.
        system_prompt: system prompt guiding the generation.
        document_column: column containing the seed documents.
        retrieval_reranking: subset of ["Retrieval", "Reranking"] to enable.
        num_rows: number of seed rows to load from the source data.
        oauth_token: Hub token used when loading a private dataset.
        progress: Gradio progress tracker.

    Returns:
        pd.DataFrame with the generated sample rows.
    """
    retrieval = "Retrieval" in retrieval_reranking
    reranking = "Reranking" in retrieval_reranking

    if input_type == "prompt-input":
        # No seed data: generation starts from the system prompt alone.
        dataframe = pd.DataFrame(columns=["context", "question", "response"])
    else:
        dataframe, _ = load_dataset_file(
            repo_id=repo_id,
            file_paths=file_paths,
            input_type=input_type,
            num_rows=num_rows,
            token=oauth_token,
        )
    progress(0.5, desc="Generating dataset")
    # The preview is always capped at 10 generated rows regardless of num_rows.
    dataframe = generate_dataset(
        input_type=input_type,
        dataframe=dataframe,
        system_prompt=system_prompt,
        document_column=document_column,
        retrieval=retrieval,
        reranking=reranking,
        num_rows=10,
        is_sample=True,
    )
    progress(1.0, desc="Sample dataset generated")
    return dataframe
124
 
125
 
126
  def generate_dataset(
 
280
  return dataframe
281
 
282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  def push_dataset_to_hub(
284
  dataframe: pd.DataFrame,
285
  org_name: str,
 
347
  reranking=reranking,
348
  num_rows=num_rows,
349
  temperature=temperature,
 
350
  )
351
  push_dataset_to_hub(
352
  dataframe, org_name, repo_name, oauth_token, private, pipeline_code
353
  )
354
  dataframe = dataframe[
355
+ dataframe.applymap(lambda x: str(x).strip() if pd.notna(x) else x).apply(
 
 
356
  lambda row: row.notna().all() and (row != "").all(), axis=1
357
  )
358
  ]
 
593
 
594
  gr.HTML(value="<hr>")
595
  gr.Markdown(value="## 2. Configure your task")
596
+ with gr.Row(equal_height=False):
597
  with gr.Column(scale=2):
598
  system_prompt = gr.Textbox(
599
  label="System prompt",
 
617
  )
618
  with gr.Row():
619
  clear_btn_full = gr.Button("Clear", variant="secondary")
620
+ btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
 
 
621
  with gr.Column(scale=3):
622
  dataframe = gr.Dataframe(
623
  headers=["context", "question", "response"],
 
705
  fn=hide_document_column_visibility, inputs=[], outputs=[document_column]
706
  )
707
 
708
+ search_in.submit(
709
  fn=lambda df: pd.DataFrame(columns=df.columns),
710
  inputs=[dataframe],
711
  outputs=[dataframe],
712
  )
713
 
714
+ gr.on(
715
+ triggers=[load_dataset_btn.click, load_file_btn.click],
716
  fn=load_dataset_file,
717
  inputs=[search_in, file_in, input_type],
718
+ outputs=[dataframe, document_column],
 
 
 
 
 
 
 
 
 
 
 
 
719
  )
720
 
721
  load_prompt_btn.click(
722
  fn=generate_system_prompt,
723
  inputs=[dataset_description],
724
  outputs=[system_prompt],
 
725
  ).success(
726
  fn=generate_sample_dataset,
727
  inputs=[
 
754
  fn=validate_argilla_user_workspace_dataset,
755
  inputs=[repo_name],
756
  outputs=[success_message],
 
757
  ).then(
758
  fn=validate_push_to_hub,
759
  inputs=[org_name, repo_name],
760
  outputs=[success_message],
 
761
  ).success(
762
  fn=hide_success_message,
763
  outputs=[success_message],
 
764
  ).success(
765
  fn=hide_pipeline_code_visibility,
766
  inputs=[],
 
782
  pipeline_code,
783
  ],
784
  outputs=[success_message],
 
785
  ).success(
786
  fn=show_success_message,
787
  inputs=[org_name, repo_name],
 
803
  outputs=[pipeline_code_ui],
804
  )
805
 
806
+ clear_dataset_btn_part.click(fn=lambda: "", inputs=[], outputs=[search_in])
807
  clear_file_btn_part.click(fn=lambda: None, inputs=[], outputs=[file_in])
808
+ clear_prompt_btn_part.click(fn=lambda: "", inputs=[], outputs=[dataset_description])
 
 
809
  clear_btn_full.click(
810
  fn=lambda df: ("", [], pd.DataFrame(columns=df.columns)),
811
  inputs=[dataframe],
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -458,7 +458,7 @@ with gr.Blocks() as app:
458
 
459
  gr.HTML("<hr>")
460
  gr.Markdown("## 2. Configure your dataset")
461
- with gr.Row(equal_height=True):
462
  with gr.Column(scale=2):
463
  system_prompt = gr.Textbox(
464
  label="System prompt",
@@ -508,9 +508,7 @@ with gr.Blocks() as app:
508
  )
509
  with gr.Row():
510
  clear_btn_full = gr.Button("Clear", variant="secondary")
511
- btn_apply_to_sample_dataset = gr.Button(
512
- "Save", variant="primary"
513
- )
514
  with gr.Column(scale=3):
515
  dataframe = _get_dataframe()
516
 
@@ -574,45 +572,37 @@ with gr.Blocks() as app:
574
  fn=generate_system_prompt,
575
  inputs=[dataset_description],
576
  outputs=[system_prompt, labels],
577
- show_progress=True,
578
  ).then(
579
  fn=generate_sample_dataset,
580
  inputs=[system_prompt, difficulty, clarity, labels, multi_label],
581
  outputs=[dataframe],
582
- show_progress=True,
583
  )
584
 
585
  btn_apply_to_sample_dataset.click(
586
  fn=validate_input_labels,
587
  inputs=[labels],
588
  outputs=[labels],
589
- show_progress=True,
590
  ).success(
591
  fn=generate_sample_dataset,
592
  inputs=[system_prompt, difficulty, clarity, labels, multi_label],
593
  outputs=[dataframe],
594
- show_progress=True,
595
  )
596
 
597
  btn_push_to_hub.click(
598
  fn=validate_argilla_user_workspace_dataset,
599
  inputs=[repo_name],
600
  outputs=[success_message],
601
- show_progress=True,
602
  ).then(
603
  fn=validate_push_to_hub,
604
  inputs=[org_name, repo_name],
605
  outputs=[success_message],
606
- show_progress=True,
607
  ).success(
608
  fn=validate_input_labels,
609
  inputs=[labels],
610
  outputs=[labels],
611
- show_progress=True,
612
  ).success(
613
  fn=hide_success_message,
614
  outputs=[success_message],
615
- show_progress=True,
616
  ).success(
617
  fn=hide_pipeline_code_visibility,
618
  inputs=[],
@@ -633,7 +623,6 @@ with gr.Blocks() as app:
633
  pipeline_code,
634
  ],
635
  outputs=[success_message],
636
- show_progress=True,
637
  ).success(
638
  fn=show_success_message,
639
  inputs=[org_name, repo_name],
 
458
 
459
  gr.HTML("<hr>")
460
  gr.Markdown("## 2. Configure your dataset")
461
+ with gr.Row(equal_height=False):
462
  with gr.Column(scale=2):
463
  system_prompt = gr.Textbox(
464
  label="System prompt",
 
508
  )
509
  with gr.Row():
510
  clear_btn_full = gr.Button("Clear", variant="secondary")
511
+ btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
 
 
512
  with gr.Column(scale=3):
513
  dataframe = _get_dataframe()
514
 
 
572
  fn=generate_system_prompt,
573
  inputs=[dataset_description],
574
  outputs=[system_prompt, labels],
 
575
  ).then(
576
  fn=generate_sample_dataset,
577
  inputs=[system_prompt, difficulty, clarity, labels, multi_label],
578
  outputs=[dataframe],
 
579
  )
580
 
581
  btn_apply_to_sample_dataset.click(
582
  fn=validate_input_labels,
583
  inputs=[labels],
584
  outputs=[labels],
 
585
  ).success(
586
  fn=generate_sample_dataset,
587
  inputs=[system_prompt, difficulty, clarity, labels, multi_label],
588
  outputs=[dataframe],
 
589
  )
590
 
591
  btn_push_to_hub.click(
592
  fn=validate_argilla_user_workspace_dataset,
593
  inputs=[repo_name],
594
  outputs=[success_message],
 
595
  ).then(
596
  fn=validate_push_to_hub,
597
  inputs=[org_name, repo_name],
598
  outputs=[success_message],
 
599
  ).success(
600
  fn=validate_input_labels,
601
  inputs=[labels],
602
  outputs=[labels],
 
603
  ).success(
604
  fn=hide_success_message,
605
  outputs=[success_message],
 
606
  ).success(
607
  fn=hide_pipeline_code_visibility,
608
  inputs=[],
 
623
  pipeline_code,
624
  ],
625
  outputs=[success_message],
 
626
  ).success(
627
  fn=show_success_message,
628
  inputs=[org_name, repo_name],
src/synthetic_dataset_generator/pipelines/chat.py CHANGED
@@ -1,4 +1,10 @@
1
- from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
 
 
 
 
 
 
2
 
3
  from synthetic_dataset_generator.constants import (
4
  MAGPIE_PRE_QUERY_TEMPLATE,
@@ -118,6 +124,18 @@ The prompt you write should follow the same style and structure as the following
118
  User dataset description:
119
  """
120
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  DEFAULT_DATASET_DESCRIPTIONS = [
122
  "rude customer assistant for a phone company",
123
  "assistant that solves math puzzles using python",
@@ -203,6 +221,21 @@ def get_magpie_generator(num_turns: int, temperature: float, is_sample: bool):
203
  return magpie_generator
204
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  def get_response_generator(
207
  system_prompt: str, num_turns: int, temperature: float, is_sample: bool
208
  ):
@@ -231,36 +264,236 @@ def get_response_generator(
231
  return response_generator
232
 
233
 
234
- def generate_pipeline_code(system_prompt: str, num_turns: int, num_rows: int):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  input_mappings = _get_output_mappings(num_turns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
 
 
 
 
 
 
 
 
 
237
  code = f"""
238
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
239
- import os
240
  from distilabel.pipeline import Pipeline
241
- from distilabel.steps import KeepColumns
242
- from distilabel.steps.tasks import MagpieGenerator
243
- from distilabel.llms import {_get_llm_class()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
- SYSTEM_PROMPT = "{system_prompt}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  with Pipeline(name="sft") as pipeline:
248
- magpie = MagpieGenerator(
 
 
 
 
 
 
 
 
 
249
  llm={_get_llm_class()}.from_dict(
250
  {_get_llm().dump()}
251
  ),
252
- n_turns={num_turns},
253
- num_rows={num_rows},
254
- batch_size=1,
255
- system_prompt=SYSTEM_PROMPT,
256
- output_mappings={input_mappings},
257
  )
258
- keep_columns = KeepColumns(
259
- columns={list(input_mappings.values())} + ["model_name"],
 
 
 
 
 
 
 
260
  )
261
- magpie.connect(keep_columns)
 
 
 
 
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  if __name__ == "__main__":
264
  distiset = pipeline.run()
 
265
  """
266
  return code
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import get_dataset_config_names, get_dataset_split_names
2
+ from distilabel.steps.tasks import (
3
+ ChatGeneration,
4
+ Magpie,
5
+ GenerateSentencePair,
6
+ TextGeneration,
7
+ )
8
 
9
  from synthetic_dataset_generator.constants import (
10
  MAGPIE_PRE_QUERY_TEMPLATE,
 
124
  User dataset description:
125
  """
126
 
127
# Jinja template that renders an existing conversation turn-by-turn and asks
# the model for the next user message; consumed by the "instruction" branch of
# get_follow_up_generator via TextGeneration(columns=["messages"]).
FOLLOW_UP_TEMPLATE = """Conversation:
{% for message in messages %}
{% if message.role == "user" %}
User Question: {{ message.content }}
{% elif message.role == "assistant" %}
Assistant Response: {{ message.content }}
{% endif %}
{% endfor %}

Please generate the next logical user message in this conversation. Do not include any other information or 'User Question' in your response.
""".rstrip()
138
+
139
  DEFAULT_DATASET_DESCRIPTIONS = [
140
  "rude customer assistant for a phone company",
141
  "assistant that solves math puzzles using python",
 
221
  return magpie_generator
222
 
223
 
224
def get_sentence_pair_generator(temperature: float, is_sample: bool):
    """Build and load a `GenerateSentencePair` task that derives a query
    (with a hard negative) from each seed document.

    Args:
        temperature: sampling temperature for the LLM.
        is_sample: when True, cap generation at 256 new tokens.

    Returns:
        A loaded `GenerateSentencePair` task instance.
    """
    token_limit = 256 if is_sample else MAX_NUM_TOKENS
    llm = _get_llm(
        generation_kwargs={
            "temperature": temperature,
            "max_new_tokens": token_limit,
        }
    )
    task = GenerateSentencePair(
        llm=llm,
        triplet=False,
        action="query",
        hard_negative=True,
    )
    task.load()
    return task
237
+
238
+
239
  def get_response_generator(
240
  system_prompt: str, num_turns: int, temperature: float, is_sample: bool
241
  ):
 
264
  return response_generator
265
 
266
 
267
def get_follow_up_generator(type: str, temperature: float, is_sample: bool):
    """Build and load the task that extends a conversation by one turn.

    Args:
        type: "instruction" yields a TextGeneration task driven by
            FOLLOW_UP_TEMPLATE (next user message); any other value yields a
            ChatGeneration task (next assistant reply).
        temperature: sampling temperature for the LLM.
        is_sample: when True, cap instruction generation at 256 new tokens.

    Returns:
        A loaded TextGeneration or ChatGeneration task.
    """
    if type == "instruction":
        # Follow-up user messages are kept shorter than full responses.
        kwargs = {
            "temperature": temperature,
            "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5),
        }
        generator = TextGeneration(
            llm=_get_llm(generation_kwargs=kwargs),
            template=FOLLOW_UP_TEMPLATE,
            columns=["messages"],
        )
    else:
        kwargs = {
            "temperature": temperature,
            "max_new_tokens": MAX_NUM_TOKENS,
        }
        generator = ChatGeneration(
            llm=_get_llm(generation_kwargs=kwargs),
        )
    generator.load()
    return generator
288
+
289
def generate_pipeline_code_system_prompt(
    system_prompt: str,
    num_turns: int,
    num_rows: int,
):
    """Render a standalone distilabel pipeline script that generates chat data
    from a system prompt alone, using MagpieGenerator.

    Args:
        system_prompt: system prompt embedded verbatim into the script.
        num_turns: number of conversation turns (drives the output mappings).
        num_rows: number of rows the generated pipeline will produce.

    Returns:
        The pipeline script as a Python source string.
    """
    input_mappings = _get_output_mappings(num_turns)
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns
from distilabel.steps.tasks import MagpieGenerator
from distilabel.llms import {_get_llm_class()}

SYSTEM_PROMPT = "{system_prompt}"

with Pipeline(name="sft") as pipeline:
    magpie = MagpieGenerator(
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        n_turns={num_turns},
        num_rows={num_rows},
        batch_size=1,
        system_prompt=SYSTEM_PROMPT,
        output_mappings={input_mappings},
    )
    keep_columns = KeepColumns(
        columns={list(input_mappings.values())} + ["model_name"],
    )
    magpie.connect(keep_columns)

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code
325
 
326
def generate_pipeline_code_seed(
    repo_id: str,
    subset: str,
    split: str,
    input_type: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
):
    """Render a standalone distilabel pipeline script that generates chat data
    from seed documents (a Hub dataset or chunked local files).

    Fixes over the original emitter:
    - follow-up steps are defined and wired with the SAME indices
      (the original defined indices 0..num_turns-2 but wired 1..num_turns,
      so the emitted script referenced undefined step names);
    - ``prepare_messages`` / ``keep_columns`` are only wired when they are
      defined (num_turns > 1);
    - the stray ``)`` after ``pipeline.run()`` is removed (SyntaxError);
    - the multi-turn helper segment is a plain string, so Jinja braces are
      emitted single (the original doubled them as if inside an f-string);
    - the emitted ``@step`` helpers import step/StepInput/StepOutput;
    - the wiring expression is indented into the ``with Pipeline`` block.

    Args:
        repo_id: Hub dataset repo id (used when input_type == "dataset-input").
        subset: dataset config name for LoadDataFromHub.
        split: dataset split for LoadDataFromHub.
        input_type: "dataset-input" to load from the Hub, otherwise local files.
        document_column: column holding the seed documents.
        num_turns: number of conversation turns; > 1 adds follow-up steps.
        num_rows: number of examples to load from the seed data.

    Returns:
        The pipeline script as a Python source string.
    """
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
from distilabel.models import {_get_llm_class()}
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns{", LoadDataFromDicts" if input_type != "dataset-input" else ""}{", LoadDataFromHub" if input_type == "dataset-input" else ""}{", step, StepInput" if num_turns > 1 else ""}
from distilabel.steps.tasks import GenerateSentencePair, TextGeneration{", ChatGeneration" if num_turns > 1 else ""}
"""
    if num_turns > 1:
        # NOTE(review): StepOutput import path may vary across distilabel
        # versions — confirm `distilabel.steps.typing` against the pinned one.
        code += """from distilabel.steps.typing import StepOutput

FOLLOW_UP_TEMPLATE = '''Conversation:
{% for message in messages %}
{% if message.role == "user" %}
User Question: {{ message.content }}
{% elif message.role == "assistant" %}
Assistant Response: {{ message.content }}
{% endif %}
{% endfor %}

Please generate the next logical user message in this conversation. Do not include any other information or 'User Question' in your response.
'''.rstrip()


@step(inputs=["prompt", "completion"], outputs=["messages"])
def PrepareMessages(*inputs: StepInput) -> StepOutput:
    for input in inputs:
        for item in input:
            item["messages"] = [
                {"role": "user", "content": item["prompt"]},
                {"role": "assistant", "content": item["completion"]},
            ]
        yield input


@step(inputs=["messages", "generation"], outputs=["messages"])
def FormatMessagesInstruction(*inputs: StepInput) -> StepOutput:
    for input in inputs:
        for item in input:
            item["messages"].append({"role": "user", "content": item["generation"]})
        yield input


@step(inputs=["messages", "generation"], outputs=["messages"])
def FormatMessagesResponse(*inputs: StepInput) -> StepOutput:
    for input in inputs:
        for item in input:
            item["messages"].append({"role": "assistant", "content": item["generation"]})
        yield input
"""

    if input_type == "dataset-input":
        code += f"""
with Pipeline(name="sft") as pipeline:
    load_the_dataset = LoadDataFromHub(
        repo_id='{repo_id}',
        config='{subset}',
        split='{split}',
        num_examples={num_rows},
        batch_size=2,
        output_mappings={{'{document_column}':'anchor'}},
    )
"""
    else:
        code += """
data = process_and_chunk_files(files=[files])

with Pipeline(name="sft") as pipeline:
    load_the_dataset = LoadDataFromDicts(
        data = data
    )
"""
    code += f"""
    instruction_generator = GenerateSentencePair(
        name="instruction_generation",
        triplet=False,
        hard_negative=True,
        action="query",
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        input_batch_size=10,
        output_mappings={{"positive": "prompt"}},
    )

    response_generator = TextGeneration(
        name="response_generation",
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        input_batch_size=10,
        input_mappings={{"instruction": "prompt"}},
        output_mappings={{"generation": "completion"}},
    )
"""

    if num_turns > 1:
        code += """
    prepare_messages = PrepareMessages()
"""
        # One instruction/response pair per extra turn; the identical index
        # range is reused below when wiring, so every referenced step exists.
        for i in range(num_turns - 1):
            code += f"""
    follow_up_instruction_{i} = TextGeneration(
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        template=FOLLOW_UP_TEMPLATE,
        columns=["messages"],
    )
    format_instruction_{i} = FormatMessagesInstruction()
    follow_up_response_{i} = ChatGeneration(
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
    )
    format_response_{i} = FormatMessagesResponse()
"""
        code += """
    keep_columns = KeepColumns(columns=["messages"])
"""

    # Wire the steps inside the `with Pipeline` block (4-space indent).
    code += "\n    load_the_dataset >> instruction_generator >> response_generator"
    if num_turns > 1:
        code += " >> prepare_messages"
        for i in range(num_turns - 1):
            code += (
                f" >> follow_up_instruction_{i} >> format_instruction_{i}"
                f" >> follow_up_response_{i} >> format_response_{i}"
            )
        code += " >> keep_columns"

    code += """

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code
470
+
471
def generate_pipeline_code(
    repo_id: str,
    input_type: str,
    system_prompt: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
):
    """Dispatch to the matching pipeline-code renderer for the chat task.

    Prompt-only input uses the Magpie system-prompt pipeline; dataset/file
    input uses the seed-data pipeline.

    Args:
        repo_id: Hub dataset repo id (may be None for non-dataset input).
        input_type: "prompt-input", "dataset-input", or file input.
        system_prompt: system prompt for the prompt-only pipeline.
        document_column: seed-document column for the seed pipeline.
        num_turns: number of conversation turns.
        num_rows: number of rows the generated pipeline should produce.

    Returns:
        The pipeline script as a Python source string.
    """
    if input_type == "dataset-input" and repo_id is not None:
        # Default to the first config and split of the Hub dataset.
        subset = get_dataset_config_names(repo_id)[0]
        split = get_dataset_split_names(repo_id, subset)[0]
    else:
        subset = "default"
        split = "train"
    # Fixed: the original compared against "prompt-type", which does not match
    # the app's input-type values ("prompt-input"/"dataset-input" — see rag.py),
    # so the prompt branch was unreachable and the seed pipeline was always
    # emitted. Confirm the chat app's radio values use the same identifiers.
    if input_type == "prompt-input":
        return generate_pipeline_code_system_prompt(
            system_prompt=system_prompt,
            num_turns=num_turns,
            num_rows=num_rows,
        )
    return generate_pipeline_code_seed(
        repo_id=repo_id,
        subset=subset,
        split=split,
        input_type=input_type,
        document_column=document_column,
        num_turns=num_turns,
        num_rows=num_rows,
    )
src/synthetic_dataset_generator/pipelines/rag.py CHANGED
@@ -1,7 +1,3 @@
1
- import os
2
-
3
- from typing import List
4
-
5
  from datasets import get_dataset_config_names, get_dataset_split_names
6
  from distilabel.steps.tasks import (
7
  GenerateSentencePair,
@@ -292,10 +288,7 @@ with Pipeline(name="rag") as pipeline:
292
 
293
  pipeline += """
294
  if __name__ == "__main__":
295
- distiset = pipeline.run(use_cache=False)
296
- print(distiset)
297
- if distiset:
298
- print(distiset["default"]["train"][0])
299
  """
300
 
301
  return base_code + pipeline
 
 
 
 
 
1
  from datasets import get_dataset_config_names, get_dataset_split_names
2
  from distilabel.steps.tasks import (
3
  GenerateSentencePair,
 
288
 
289
  pipeline += """
290
  if __name__ == "__main__":
291
+ distiset = pipeline.run()
 
 
 
292
  """
293
 
294
  return base_code + pipeline