synthetic-data-generator

Paused

App Files Files Community

davidberenstein1957 committed on Dec 3, 2024

Commit

cd47483

1 Parent(s): 0202688

add support for custom BASE_URL, MODEL, API_KEY

Browse files

Files changed (17) hide show

README.md +7 -1
app.py +5 -5
pyproject.toml +12 -4
src/distilabel_dataset_generator/__init__.py +0 -26
src/distilabel_dataset_generator/apps/__init__.py +0 -0
src/distilabel_dataset_generator/apps/base.py +1 -1
src/distilabel_dataset_generator/apps/eval.py +5 -7
src/distilabel_dataset_generator/apps/sft.py +169 -164
src/distilabel_dataset_generator/apps/textcat.py +1 -3
src/distilabel_dataset_generator/constants.py +55 -0
src/distilabel_dataset_generator/pipelines/__init__.py +0 -0
src/distilabel_dataset_generator/pipelines/base.py +2 -4
src/distilabel_dataset_generator/pipelines/embeddings.py +1 -1
src/distilabel_dataset_generator/pipelines/eval.py +15 -14
src/distilabel_dataset_generator/pipelines/sft.py +15 -6
src/distilabel_dataset_generator/pipelines/textcat.py +13 -14
src/distilabel_dataset_generator/utils.py +1 -1

README.md CHANGED Viewed

@@ -80,7 +80,13 @@ pip install synthetic-dataset-generator
 ### Environment Variables
-- `HF_TOKEN`: Your Hugging Face token to push your datasets to the Hugging Face Hub and run *Free* Inference Endpoints Requests. You can get one [here](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained).
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:

 ### Environment Variables
+- `HF_TOKEN`: Your [Hugging Face token](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained) to push your datasets to the Hugging Face Hub and generate free completions from Hugging Face Inference Endpoints.
+Optionally, you can set the following environment variables to customize the generation process:
+- `BASE_URL`: The base URL for any OpenAI-compatible API, e.g. `https://api-inference.huggingface.co/v1/`.
+- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`.
+- `API_KEY`: The API key to use for the corresponding API, e.g. `hf_...`.
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
-from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
-from src.distilabel_dataset_generator.apps.eval import app as eval_app
-from src.distilabel_dataset_generator.apps.faq import app as faq_app
-from src.distilabel_dataset_generator.apps.sft import app as sft_app
-from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme = "argilla/argilla-theme"

+from distilabel_dataset_generator._tabbedinterface import TabbedInterface
+from distilabel_dataset_generator.apps.eval import app as eval_app
+from distilabel_dataset_generator.apps.faq import app as faq_app
+from distilabel_dataset_generator.apps.sft import app as sft_app
+from distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme = "argilla/argilla-theme"

pyproject.toml CHANGED Viewed

@@ -5,6 +5,18 @@ description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
 ]
 dependencies = [
     "distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
     "gradio[oauth]<5.0.0",
@@ -14,14 +26,10 @@ dependencies = [
     "gradio-huggingfacehub-search>=0.0.7",
     "argilla>=2.4.0",
 ]
-requires-python = "<3.13,>=3.10"
-readme = "README.md"
-license = {text = "apache 2"}
 [build-system]
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
 [tool.pdm]
 distribution = true

 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
 ]
+tags = [
+    "gradio",
+    "synthetic-data",
+    "huggingface",
+    "argilla",
+    "generative-ai",
+    "ai",
+]
+requires-python = "<3.13,>=3.10"
+readme = "README.md"
+license = {text = "Apache 2"}
 dependencies = [
     "distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
     "gradio[oauth]<5.0.0",
     "gradio-huggingfacehub-search>=0.0.7",
     "argilla>=2.4.0",
 ]
 [build-system]
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
 [tool.pdm]
 distribution = true

src/distilabel_dataset_generator/__init__.py CHANGED Viewed

@@ -1,8 +1,5 @@
-import os
-import warnings
 from typing import Optional
-import argilla as rg
 import distilabel
 import distilabel.distiset
 from distilabel.utils.card.dataset_card import (
@@ -11,29 +8,6 @@ from distilabel.utils.card.dataset_card import (
 )
 from huggingface_hub import DatasetCardData, HfApi
-HF_TOKENS = [os.getenv("HF_TOKEN")] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
-HF_TOKENS = [token for token in HF_TOKENS if token]
-if len(HF_TOKENS) == 0:
-    raise ValueError(
-        "HF_TOKEN is not set. Ensure you have set the HF_TOKEN environment variable that has access to the Hugging Face Hub repositories and Inference Endpoints."
-    )
-ARGILLA_API_URL = os.getenv("ARGILLA_API_URL")
-ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY")
-if ARGILLA_API_URL is None or ARGILLA_API_KEY is None:
-    ARGILLA_API_URL = os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
-    ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
-if ARGILLA_API_URL is None or ARGILLA_API_KEY is None:
-    warnings.warn("ARGILLA_API_URL or ARGILLA_API_KEY is not set")
-    argilla_client = None
-else:
-    argilla_client = rg.Argilla(
-        api_url=ARGILLA_API_URL,
-        api_key=ARGILLA_API_KEY,
-    )
 class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
     def _generate_card(

 from typing import Optional
 import distilabel
 import distilabel.distiset
 from distilabel.utils.card.dataset_card import (
 )
 from huggingface_hub import DatasetCardData, HfApi
 class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
     def _generate_card(

src/distilabel_dataset_generator/apps/__init__.py ADDED Viewed

File without changes

src/distilabel_dataset_generator/apps/base.py CHANGED Viewed

@@ -10,7 +10,7 @@ from distilabel.distiset import Distiset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
-from src.distilabel_dataset_generator.utils import (
     _LOGGED_OUT_CSS,
     get_argilla_client,
     get_login_button,

 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
+from distilabel_dataset_generator.utils import (
     _LOGGED_OUT_CSS,
     get_argilla_client,
     get_login_button,

src/distilabel_dataset_generator/apps/eval.py CHANGED Viewed

@@ -16,25 +16,23 @@ from distilabel.distiset import Distiset
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from huggingface_hub import HfApi
-from src.distilabel_dataset_generator.apps.base import (
     hide_success_message,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
-from src.distilabel_dataset_generator.pipelines.base import (
-    DEFAULT_BATCH_SIZE,
-)
-from src.distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
 )
-from src.distilabel_dataset_generator.pipelines.eval import (
     generate_pipeline_code,
     get_custom_evaluator,
     get_ultrafeedback_evaluator,
 )
-from src.distilabel_dataset_generator.utils import (
     column_to_list,
     extract_column_names,
     get_argilla_client,

 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from huggingface_hub import HfApi
+from distilabel_dataset_generator.apps.base import (
     hide_success_message,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
+from distilabel_dataset_generator.constants import DEFAULT_BATCH_SIZE
+from distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
 )
+from distilabel_dataset_generator.pipelines.eval import (
     generate_pipeline_code,
     get_custom_evaluator,
     get_ultrafeedback_evaluator,
 )
+from distilabel_dataset_generator.utils import (
     column_to_list,
     extract_column_names,
     get_argilla_client,

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -9,27 +9,25 @@ from datasets import Dataset
 from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
-from src.distilabel_dataset_generator.apps.base import (
     hide_success_message,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
-from src.distilabel_dataset_generator.pipelines.base import (
-    DEFAULT_BATCH_SIZE,
-)
-from src.distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
 )
-from src.distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
     generate_pipeline_code,
     get_magpie_generator,
     get_prompt_generator,
     get_response_generator,
 )
-from src.distilabel_dataset_generator.utils import (
     _LOGGED_OUT_CSS,
     get_argilla_client,
     get_org_dropdown,
@@ -354,168 +352,175 @@ def hide_pipeline_code_visibility():
 with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
-        gr.Markdown(value="## 1. Describe the dataset you want")
-        with gr.Row():
-            with gr.Column(scale=2):
-                dataset_description = gr.Textbox(
-                    label="Dataset description",
-                    placeholder="Give a precise description of your desired dataset.",
-                )
-                with gr.Accordion("Temperature", open=False):
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=1,
-                        value=0.8,
-                        step=0.1,
                         interactive=True,
-                        show_label=False,
                     )
-                load_btn = gr.Button(
-                    "Create dataset",
-                    variant="primary",
-                )
-            with gr.Column(scale=2):
-                examples = gr.Examples(
-                    examples=DEFAULT_DATASET_DESCRIPTIONS,
-                    inputs=[dataset_description],
-                    cache_examples=False,
-                    label="Examples",
-                )
-            with gr.Column(scale=1):
-                pass
-        gr.HTML(value="<hr>")
-        gr.Markdown(value="## 2. Configure your dataset")
-        with gr.Row(equal_height=False):
-            with gr.Column(scale=2):
-                system_prompt = gr.Textbox(
-                    label="System prompt",
-                    placeholder="You are a helpful assistant.",
-                )
-                num_turns = gr.Number(
-                    value=1,
-                    label="Number of turns in the conversation",
-                    minimum=1,
-                    maximum=4,
-                    step=1,
-                    interactive=True,
-                    info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
-                )
-                btn_apply_to_sample_dataset = gr.Button(
-                    "Refresh dataset", variant="secondary"
-                )
-            with gr.Column(scale=3):
-                dataframe = gr.Dataframe(
-                    headers=["prompt", "completion"],
-                    wrap=True,
-                    height=500,
-                    interactive=False,
-                )
-        gr.HTML(value="<hr>")
-        gr.Markdown(value="## 3. Generate your dataset")
-        with gr.Row(equal_height=False):
-            with gr.Column(scale=2):
-                org_name = get_org_dropdown()
-                repo_name = gr.Textbox(
-                    label="Repo name",
-                    placeholder="dataset_name",
-                    value=f"my-distiset-{str(uuid.uuid4())[:8]}",
-                    interactive=True,
-                )
-                num_rows = gr.Number(
-                    label="Number of rows",
-                    value=10,
-                    interactive=True,
-                    scale=1,
-                )
-                private = gr.Checkbox(
-                    label="Private dataset",
-                    value=False,
-                    interactive=True,
-                    scale=1,
-                )
-                btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
-            with gr.Column(scale=3):
-                success_message = gr.Markdown(visible=True)
-                with gr.Accordion(
-                    "Do you want to go further? Customize and run with Distilabel",
-                    open=False,
-                    visible=False,
-                ) as pipeline_code_ui:
-                    code = generate_pipeline_code(
-                        system_prompt=system_prompt.value,
-                        num_turns=num_turns.value,
-                        num_rows=num_rows.value,
                     )
-                    pipeline_code = gr.Code(
-                        value=code,
-                        language="python",
-                        label="Distilabel Pipeline Code",
                     )
-    load_btn.click(
-        fn=generate_system_prompt,
-        inputs=[dataset_description, temperature],
-        outputs=[system_prompt],
-        show_progress=True,
-    ).then(
-        fn=generate_sample_dataset,
-        inputs=[system_prompt, num_turns],
-        outputs=[dataframe],
-        show_progress=True,
-    )
-    btn_apply_to_sample_dataset.click(
-        fn=generate_sample_dataset,
-        inputs=[system_prompt, num_turns],
-        outputs=[dataframe],
-        show_progress=True,
-    )
-    btn_push_to_hub.click(
-        fn=validate_argilla_user_workspace_dataset,
-        inputs=[repo_name],
-        outputs=[success_message],
-        show_progress=True,
-    ).then(
-        fn=validate_push_to_hub,
-        inputs=[org_name, repo_name],
-        outputs=[success_message],
-        show_progress=True,
-    ).success(
-        fn=hide_success_message,
-        outputs=[success_message],
-        show_progress=True,
-    ).success(
-        fn=hide_pipeline_code_visibility,
-        inputs=[],
-        outputs=[pipeline_code_ui],
-    ).success(
-        fn=push_dataset,
-        inputs=[
-            org_name,
-            repo_name,
-            system_prompt,
-            num_turns,
-            num_rows,
-            private,
-        ],
-        outputs=[success_message],
-        show_progress=True,
-    ).success(
-        fn=show_success_message,
-        inputs=[org_name, repo_name],
-        outputs=[success_message],
-    ).success(
-        fn=generate_pipeline_code,
-        inputs=[system_prompt, num_turns, num_rows],
-        outputs=[pipeline_code],
-    ).success(
-        fn=show_pipeline_code_visibility,
-        inputs=[],
-        outputs=[pipeline_code_ui],
-    )
-    app.load(fn=swap_visibility, outputs=main_ui)
-    app.load(fn=get_org_dropdown, outputs=[org_name])

 from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
+from distilabel_dataset_generator.apps.base import (
     hide_success_message,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
+from distilabel_dataset_generator.constants import DEFAULT_BATCH_SIZE, SFT_AVAILABLE
+from distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
 )
+from distilabel_dataset_generator.pipelines.sft import (
     DEFAULT_DATASET_DESCRIPTIONS,
     generate_pipeline_code,
     get_magpie_generator,
     get_prompt_generator,
     get_response_generator,
 )
+from distilabel_dataset_generator.utils import (
     _LOGGED_OUT_CSS,
     get_argilla_client,
     get_org_dropdown,
 with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
     with gr.Column() as main_ui:
+        if not SFT_AVAILABLE:
+            gr.Markdown(
+                value=f"## Supervised Fine-Tuning is not available for the {MODEL} model. Use Hugging Face Llama3 or Qwen2 models."
+            )
+        else:
+            gr.Markdown(value="## 1. Describe the dataset you want")
+            with gr.Row():
+                with gr.Column(scale=2):
+                    dataset_description = gr.Textbox(
+                        label="Dataset description",
+                        placeholder="Give a precise description of your desired dataset.",
+                    )
+                    with gr.Accordion("Temperature", open=False):
+                        temperature = gr.Slider(
+                            minimum=0.1,
+                            maximum=1,
+                            value=0.8,
+                            step=0.1,
+                            interactive=True,
+                            show_label=False,
+                        )
+                    load_btn = gr.Button(
+                        "Create dataset",
+                        variant="primary",
+                    )
+                with gr.Column(scale=2):
+                    examples = gr.Examples(
+                        examples=DEFAULT_DATASET_DESCRIPTIONS,
+                        inputs=[dataset_description],
+                        cache_examples=False,
+                        label="Examples",
+                    )
+                with gr.Column(scale=1):
+                    pass
+            gr.HTML(value="<hr>")
+            gr.Markdown(value="## 2. Configure your dataset")
+            with gr.Row(equal_height=False):
+                with gr.Column(scale=2):
+                    system_prompt = gr.Textbox(
+                        label="System prompt",
+                        placeholder="You are a helpful assistant.",
+                    )
+                    num_turns = gr.Number(
+                        value=1,
+                        label="Number of turns in the conversation",
+                        minimum=1,
+                        maximum=4,
+                        step=1,
                         interactive=True,
+                        info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
                     )
+                    btn_apply_to_sample_dataset = gr.Button(
+                        "Refresh dataset", variant="secondary"
                     )
+                with gr.Column(scale=3):
+                    dataframe = gr.Dataframe(
+                        headers=["prompt", "completion"],
+                        wrap=True,
+                        height=500,
+                        interactive=False,
                     )
+            gr.HTML(value="<hr>")
+            gr.Markdown(value="## 3. Generate your dataset")
+            with gr.Row(equal_height=False):
+                with gr.Column(scale=2):
+                    org_name = get_org_dropdown()
+                    repo_name = gr.Textbox(
+                        label="Repo name",
+                        placeholder="dataset_name",
+                        value=f"my-distiset-{str(uuid.uuid4())[:8]}",
+                        interactive=True,
+                    )
+                    num_rows = gr.Number(
+                        label="Number of rows",
+                        value=10,
+                        interactive=True,
+                        scale=1,
+                    )
+                    private = gr.Checkbox(
+                        label="Private dataset",
+                        value=False,
+                        interactive=True,
+                        scale=1,
+                    )
+                    btn_push_to_hub = gr.Button(
+                        "Push to Hub", variant="primary", scale=2
+                    )
+                with gr.Column(scale=3):
+                    success_message = gr.Markdown(visible=True)
+                    with gr.Accordion(
+                        "Do you want to go further? Customize and run with Distilabel",
+                        open=False,
+                        visible=False,
+                    ) as pipeline_code_ui:
+                        code = generate_pipeline_code(
+                            system_prompt=system_prompt.value,
+                            num_turns=num_turns.value,
+                            num_rows=num_rows.value,
+                        )
+                        pipeline_code = gr.Code(
+                            value=code,
+                            language="python",
+                            label="Distilabel Pipeline Code",
+                        )
+        load_btn.click(
+            fn=generate_system_prompt,
+            inputs=[dataset_description, temperature],
+            outputs=[system_prompt],
+            show_progress=True,
+        ).then(
+            fn=generate_sample_dataset,
+            inputs=[system_prompt, num_turns],
+            outputs=[dataframe],
+            show_progress=True,
+        )
+        btn_apply_to_sample_dataset.click(
+            fn=generate_sample_dataset,
+            inputs=[system_prompt, num_turns],
+            outputs=[dataframe],
+            show_progress=True,
+        )
+        btn_push_to_hub.click(
+            fn=validate_argilla_user_workspace_dataset,
+            inputs=[repo_name],
+            outputs=[success_message],
+            show_progress=True,
+        ).then(
+            fn=validate_push_to_hub,
+            inputs=[org_name, repo_name],
+            outputs=[success_message],
+            show_progress=True,
+        ).success(
+            fn=hide_success_message,
+            outputs=[success_message],
+            show_progress=True,
+        ).success(
+            fn=hide_pipeline_code_visibility,
+            inputs=[],
+            outputs=[pipeline_code_ui],
+        ).success(
+            fn=push_dataset,
+            inputs=[
+                org_name,
+                repo_name,
+                system_prompt,
+                num_turns,
+                num_rows,
+                private,
+            ],
+            outputs=[success_message],
+            show_progress=True,
+        ).success(
+            fn=show_success_message,
+            inputs=[org_name, repo_name],
+            outputs=[success_message],
+        ).success(
+            fn=generate_pipeline_code,
+            inputs=[system_prompt, num_turns, num_rows],
+            outputs=[pipeline_code],
+        ).success(
+            fn=show_pipeline_code_visibility,
+            inputs=[],
+            outputs=[pipeline_code_ui],
+        )
+        app.load(fn=swap_visibility, outputs=main_ui)
+        app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -9,15 +9,13 @@ from datasets import ClassLabel, Dataset, Features, Sequence, Value
 from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 from src.distilabel_dataset_generator.apps.base import (
     hide_success_message,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
-from src.distilabel_dataset_generator.pipelines.base import (
-    DEFAULT_BATCH_SIZE,
-)
 from src.distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,

 from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
+from distilabel_dataset_generator.constants import DEFAULT_BATCH_SIZE
 from src.distilabel_dataset_generator.apps.base import (
     hide_success_message,
     show_success_message,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
 from src.distilabel_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,

src/distilabel_dataset_generator/constants.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import os
+import warnings
+import argilla as rg
+# Hugging Face
+HF_TOKEN = os.getenv("HF_TOKEN")
+if HF_TOKEN is None:
+    raise ValueError(
+        "HF_TOKEN is not set. Ensure you have set the HF_TOKEN environment variable that has access to the Hugging Face Hub repositories and Inference Endpoints."
+    )
+# Inference
+DEFAULT_BATCH_SIZE = 5
+MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
+API_KEYS = (
+    [os.getenv("HF_TOKEN")]
+    + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
+    + [os.getenv("API_KEY")]
+)
+API_KEYS = [token for token in API_KEYS if token]
+BASE_URL = os.getenv("BASE_URL", "https://api-inference.huggingface.co/v1/")
+if BASE_URL != "https://api-inference.huggingface.co/v1/" and len(API_KEYS) == 0:
+    raise ValueError(
+        "API_KEY is not set. Ensure you have set the API_KEY environment variable that has access to the Hugging Face Inference Endpoints."
+    )
+if "Qwen2" not in MODEL and "Llama-3" not in MODEL:
+    SFT_AVAILABLE = False
+    warnings.warn(
+        "SFT_AVAILABLE is set to False because the model is not a Qwen or Llama model."
+    )
+    MAGPIE_PRE_QUERY_TEMPLATE = None
+else:
+    SFT_AVAILABLE = True
+    if "Qwen2" in MODEL:
+        MAGPIE_PRE_QUERY_TEMPLATE = "qwen2"
+    else:
+        MAGPIE_PRE_QUERY_TEMPLATE = "llama3"
+# Argilla
+ARGILLA_API_URL = os.getenv("ARGILLA_API_URL")
+ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY")
+if ARGILLA_API_URL is None or ARGILLA_API_KEY is None:
+    ARGILLA_API_URL = os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
+    ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
+if ARGILLA_API_URL is None or ARGILLA_API_KEY is None:
+    warnings.warn("ARGILLA_API_URL or ARGILLA_API_KEY is not set")
+    argilla_client = None
+else:
+    argilla_client = rg.Argilla(
+        api_url=ARGILLA_API_URL,
+        api_key=ARGILLA_API_KEY,
+    )

src/distilabel_dataset_generator/pipelines/__init__.py ADDED Viewed

File without changes

src/distilabel_dataset_generator/pipelines/base.py CHANGED Viewed

@@ -1,12 +1,10 @@
-from src.distilabel_dataset_generator import HF_TOKENS
-DEFAULT_BATCH_SIZE = 5
 TOKEN_INDEX = 0
-MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 def _get_next_api_key():
     global TOKEN_INDEX
-    api_key = HF_TOKENS[TOKEN_INDEX % len(HF_TOKENS)]
     TOKEN_INDEX += 1
     return api_key

+from distilabel_dataset_generator.constants import API_KEYS
 TOKEN_INDEX = 0
 def _get_next_api_key():
     global TOKEN_INDEX
+    api_key = API_KEYS[TOKEN_INDEX % len(API_KEYS)]
     TOKEN_INDEX += 1
     return api_key

src/distilabel_dataset_generator/pipelines/embeddings.py CHANGED Viewed

@@ -4,7 +4,7 @@ from sentence_transformers import SentenceTransformer
 from sentence_transformers.models import StaticEmbedding
 # Initialize a StaticEmbedding module
-static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
 model = SentenceTransformer(modules=[static_embedding])

 from sentence_transformers.models import StaticEmbedding
 # Initialize a StaticEmbedding module
+static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
 model = SentenceTransformer(modules=[static_embedding])

src/distilabel_dataset_generator/pipelines/eval.py CHANGED Viewed

@@ -5,18 +5,16 @@ from distilabel.steps.tasks import (
     UltraFeedback,
 )
-from src.distilabel_dataset_generator.pipelines.base import (
-    MODEL,
-    _get_next_api_key,
-)
-from src.distilabel_dataset_generator.utils import extract_column_names
 def get_ultrafeedback_evaluator(aspect, is_sample):
     ultrafeedback_evaluator = UltraFeedback(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             generation_kwargs={
                 "temperature": 0,
@@ -33,7 +31,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
     custom_evaluator = TextGeneration(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             structured_output={"format": "json", "schema": structured_output},
             generation_kwargs={
@@ -62,7 +60,8 @@ from distilabel.steps.tasks import UltraFeedback
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
-os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
 data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
@@ -76,8 +75,8 @@ with Pipeline(name="ultrafeedback") as pipeline:
     ultrafeedback_evaluator = UltraFeedback(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
                 "temperature": 0,
                 "max_new_tokens": 2048,
@@ -101,7 +100,8 @@ from distilabel.steps.tasks import UltraFeedback
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
-os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}")
 data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
@@ -119,8 +119,8 @@ with Pipeline(name="ultrafeedback") as pipeline:
             aspect=aspect,
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
-                tokenizer_id=MODEL,
-                api_key=os.environ["HF_TOKEN"],
                 generation_kwargs={{
                     "temperature": 0,
                     "max_new_tokens": 2048,
@@ -157,6 +157,7 @@ from distilabel.steps.tasks import TextGeneration
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
 CUSTOM_TEMPLATE = "{prompt_template}"
 os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
@@ -171,7 +172,7 @@ with Pipeline(name="custom-evaluation") as pipeline:
     custom_evaluator = TextGeneration(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
             api_key=os.environ["HF_TOKEN"],
             structured_output={{"format": "json", "schema": {structured_output}}},
             generation_kwargs={{

     UltraFeedback,
 )
+from distilabel_dataset_generator.constants import BASE_URL, MODEL
+from distilabel_dataset_generator.pipelines.base import _get_next_api_key
+from distilabel_dataset_generator.utils import extract_column_names
 def get_ultrafeedback_evaluator(aspect, is_sample):
     ultrafeedback_evaluator = UltraFeedback(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
             api_key=_get_next_api_key(),
             generation_kwargs={
                 "temperature": 0,
     custom_evaluator = TextGeneration(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
             api_key=_get_next_api_key(),
             structured_output={"format": "json", "schema": structured_output},
             generation_kwargs={
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
+BASE_URL = "{BASE_URL}"
+os.environ["API_KEY"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
 data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
     ultrafeedback_evaluator = UltraFeedback(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
+            api_key=os.environ["API_KEY"],
             generation_kwargs={{
                 "temperature": 0,
                 "max_new_tokens": 2048,
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
+BASE_URL = "{BASE_URL}"
+os.environ["API_KEY"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}")
 data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
             aspect=aspect,
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
+                base_url=BASE_URL,
+                api_key=os.environ["API_KEY"],
                 generation_kwargs={{
                     "temperature": 0,
                     "max_new_tokens": 2048,
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
+BASE_URL = "{BASE_URL}"
 CUSTOM_TEMPLATE = "{prompt_template}"
 os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
     custom_evaluator = TextGeneration(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
             api_key=os.environ["HF_TOKEN"],
             structured_output={{"format": "json", "schema": {structured_output}}},
             generation_kwargs={{

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -1,10 +1,12 @@
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
-from src.distilabel_dataset_generator.pipelines.base import (
     MODEL,
-    _get_next_api_key,
 )
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -144,6 +146,7 @@ def get_prompt_generator(temperature):
             api_key=_get_next_api_key(),
             model_id=MODEL,
             tokenizer_id=MODEL,
             generation_kwargs={
                 "temperature": temperature,
                 "max_new_tokens": 2048,
@@ -165,8 +168,9 @@ def get_magpie_generator(system_prompt, num_turns, is_sample):
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
                 api_key=_get_next_api_key(),
-                magpie_pre_query_template="llama3",
                 generation_kwargs={
                     "temperature": 0.9,
                     "do_sample": True,
@@ -184,8 +188,9 @@ def get_magpie_generator(system_prompt, num_turns, is_sample):
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
                 api_key=_get_next_api_key(),
-                magpie_pre_query_template="llama3",
                 generation_kwargs={
                     "temperature": 0.9,
                     "do_sample": True,
@@ -208,6 +213,7 @@ def get_response_generator(system_prompt, num_turns, is_sample):
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
                 api_key=_get_next_api_key(),
                 generation_kwargs={
                     "temperature": 0.8,
@@ -223,6 +229,7 @@ def get_response_generator(system_prompt, num_turns, is_sample):
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
                 api_key=_get_next_api_key(),
                 generation_kwargs={
                     "temperature": 0.8,
@@ -247,14 +254,16 @@ from distilabel.steps.tasks import MagpieGenerator
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
 SYSTEM_PROMPT = "{system_prompt}"
-os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 with Pipeline(name="sft") as pipeline:
     magpie = MagpieGenerator(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
                 "temperature": 0.9,
@@ -262,7 +271,7 @@ with Pipeline(name="sft") as pipeline:
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
             }},
-            api_key=os.environ["HF_TOKEN"],
         ),
         n_turns={num_turns},
         num_rows={num_rows},

 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
+from distilabel_dataset_generator.constants import (
+    BASE_URL,
+    MAGPIE_PRE_QUERY_TEMPLATE,
     MODEL,
 )
+from distilabel_dataset_generator.pipelines.base import _get_next_api_key
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
             api_key=_get_next_api_key(),
             model_id=MODEL,
             tokenizer_id=MODEL,
+            base_url=BASE_URL,
             generation_kwargs={
                 "temperature": temperature,
                 "max_new_tokens": 2048,
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
+                base_url=BASE_URL,
                 api_key=_get_next_api_key(),
+                magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
                 generation_kwargs={
                     "temperature": 0.9,
                     "do_sample": True,
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
+                base_url=BASE_URL,
                 api_key=_get_next_api_key(),
+                magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
                 generation_kwargs={
                     "temperature": 0.9,
                     "do_sample": True,
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
+                base_url=BASE_URL,
                 api_key=_get_next_api_key(),
                 generation_kwargs={
                     "temperature": 0.8,
             llm=InferenceEndpointsLLM(
                 model_id=MODEL,
                 tokenizer_id=MODEL,
+                base_url=BASE_URL,
                 api_key=_get_next_api_key(),
                 generation_kwargs={
                     "temperature": 0.8,
 from distilabel.llms import InferenceEndpointsLLM
 MODEL = "{MODEL}"
+BASE_URL = "{BASE_URL}"
 SYSTEM_PROMPT = "{system_prompt}"
+os.environ["API_KEY"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 with Pipeline(name="sft") as pipeline:
     magpie = MagpieGenerator(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             tokenizer_id=MODEL,
+            base_url=BASE_URL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
                 "temperature": 0.9,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
             }},
+            api_key=os.environ["API_KEY"],
         ),
         n_turns={num_turns},
         num_rows={num_rows},

src/distilabel_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import random
-from pydantic import BaseModel, Field
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
@@ -8,12 +7,11 @@ from distilabel.steps.tasks import (
     TextClassification,
     TextGeneration,
 )
-from src.distilabel_dataset_generator.pipelines.base import (
-    MODEL,
-    _get_next_api_key,
-)
-from src.distilabel_dataset_generator.utils import get_preprocess_labels
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
@@ -73,7 +71,7 @@ def get_prompt_generator(temperature):
         llm=InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
             model_id=MODEL,
-            tokenizer_id=MODEL,
             structured_output={"format": "json", "schema": TextClassificationTask},
             generation_kwargs={
                 "temperature": temperature,
@@ -92,7 +90,7 @@ def get_textcat_generator(difficulty, clarity, is_sample):
     textcat_generator = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             generation_kwargs={
                 "temperature": 0.9,
@@ -114,7 +112,7 @@ def get_labeller_generator(system_prompt, labels, num_labels):
     labeller_generator = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             generation_kwargs={
                 "temperature": 0.7,
@@ -149,8 +147,9 @@ from distilabel.steps import LoadDataFromDicts, KeepColumns
 from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
 MODEL = "{MODEL}"
 TEXT_CLASSIFICATION_TASK = "{system_prompt}"
-os.environ["HF_TOKEN"] = (
     "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 )
@@ -161,8 +160,8 @@ with Pipeline(name="textcat") as pipeline:
     textcat_generation = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
@@ -205,8 +204,8 @@ with Pipeline(name="textcat") as pipeline:
     textcat_labeller = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
-            tokenizer_id=MODEL,
-            api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,

 import random
 from typing import List
 from distilabel.llms import InferenceEndpointsLLM
     TextClassification,
     TextGeneration,
 )
+from pydantic import BaseModel, Field
+from distilabel_dataset_generator.constants import BASE_URL, MODEL
+from distilabel_dataset_generator.pipelines.base import _get_next_api_key
+from distilabel_dataset_generator.utils import get_preprocess_labels
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
         llm=InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
             model_id=MODEL,
+            base_url=BASE_URL,
             structured_output={"format": "json", "schema": TextClassificationTask},
             generation_kwargs={
                 "temperature": temperature,
     textcat_generator = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
             api_key=_get_next_api_key(),
             generation_kwargs={
                 "temperature": 0.9,
     labeller_generator = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
             api_key=_get_next_api_key(),
             generation_kwargs={
                 "temperature": 0.7,
 from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
 MODEL = "{MODEL}"
+BASE_URL = "{BASE_URL}"
 TEXT_CLASSIFICATION_TASK = "{system_prompt}"
+os.environ["API_KEY"] = (
     "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 )
     textcat_generation = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
+            api_key=os.environ["API_KEY"],
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,
     textcat_labeller = TextClassification(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
+            base_url=BASE_URL,
+            api_key=os.environ["API_KEY"],
             generation_kwargs={{
                 "temperature": 0.8,
                 "max_new_tokens": 2048,

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -15,7 +15,7 @@ from gradio.oauth import (
 from huggingface_hub import whoami
 from jinja2 import Environment, meta
-from src.distilabel_dataset_generator import argilla_client
 _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"

 from huggingface_hub import whoami
 from jinja2 import Environment, meta
+from distilabel_dataset_generator.constants import argilla_client
 _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"