Commit
·
86f370f
1
Parent(s):
2e449ff
fix final nits and formatting
Browse files
- assets/image.png +0 -0
- assets/ui-full.png +0 -0
- src/synthetic_dataset_generator/apps/base.py +37 -41
- src/synthetic_dataset_generator/apps/eval.py +1 -1
- src/synthetic_dataset_generator/apps/sft.py +3 -5
- src/synthetic_dataset_generator/apps/textcat.py +3 -5
- src/synthetic_dataset_generator/utils.py +1 -1
assets/image.png
DELETED
|
Binary file (657 kB)
|
|
|
assets/ui-full.png
CHANGED
|
|
src/synthetic_dataset_generator/apps/base.py
CHANGED
|
@@ -129,53 +129,49 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
|
|
| 129 |
client = get_argilla_client()
|
| 130 |
if client is None:
|
| 131 |
return gr.Markdown(
|
| 132 |
-
value=
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
</
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
</div>
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
|
| 145 |
-
Unfamiliar with Argilla? Here are some docs to help you get started:
|
| 146 |
-
<br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
|
| 147 |
-
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
|
| 148 |
-
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
|
| 149 |
-
</p>
|
| 150 |
-
</div>
|
| 151 |
-
"""
|
| 152 |
)
|
| 153 |
argilla_api_url = client.api_url
|
| 154 |
return gr.Markdown(
|
| 155 |
value=f"""
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
</div>
|
| 171 |
-
|
| 172 |
-
</div>
|
| 173 |
-
<p style="margin-top: 1em; color: #333;">
|
| 174 |
-
Unfamiliar with Argilla? Here are some docs to help you get started:
|
| 175 |
-
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
|
| 176 |
-
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
|
| 177 |
-
</p>
|
| 178 |
-
""",
|
| 179 |
visible=True,
|
| 180 |
)
|
| 181 |
|
|
|
|
| 129 |
client = get_argilla_client()
|
| 130 |
if client is None:
|
| 131 |
return gr.Markdown(
|
| 132 |
+
value="""
|
| 133 |
+
<div style="padding: 1em; background-color: var(--block-background-fill); border-color: var(--border-color-primary); border-width: 1px; border-radius: 5px;">
|
| 134 |
+
<h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
|
| 135 |
+
<p style="margin-top: 0.5em;">
|
| 136 |
+
The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
|
| 137 |
+
<a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" class="lg primary svelte-cmf5ev" style="color: white !important; margin-top: 0.5em; text-decoration: none;">
|
| 138 |
+
Open in Hub
|
| 139 |
+
</a>
|
| 140 |
+
</p>
|
| 141 |
+
<p style="margin-top: 1em; color: #333;">
|
| 142 |
+
By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
|
| 143 |
+
Unfamiliar with Argilla? Here are some docs to help you get started:
|
| 144 |
+
<br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
|
| 145 |
+
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
|
| 146 |
+
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
|
| 147 |
+
</p>
|
| 148 |
</div>
|
| 149 |
+
""",
|
| 150 |
+
visible=True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
)
|
| 152 |
argilla_api_url = client.api_url
|
| 153 |
return gr.Markdown(
|
| 154 |
value=f"""
|
| 155 |
+
<div style="padding: 1em; background-color: var(--block-background-fill); border-color: var(--border-color-primary); border-width: 1px; border-radius: 5px;">
|
| 156 |
+
<h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
|
| 157 |
+
<p style="margin-top: 0.5em;">
|
| 158 |
+
The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
|
| 159 |
+
<div style="display: flex; gap: 10px;">
|
| 160 |
+
<a href="{argilla_api_url}" target="_blank" class="lg primary svelte-cmf5ev" style="color: white !important; margin-top: 0.5em; text-decoration: none;">
|
| 161 |
+
Open in Argilla
|
| 162 |
+
</a>
|
| 163 |
+
<a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" class="lg secondary svelte-cmf5ev" style="color: black !important; margin-top: 0.5em; text-decoration: none;">
|
| 164 |
+
Open in Hub
|
| 165 |
+
</a>
|
| 166 |
+
</div>
|
| 167 |
+
</p>
|
| 168 |
+
<p style="margin-top: 1em; color: #333;">
|
| 169 |
+
Unfamiliar with Argilla? Here are some docs to help you get started:
|
| 170 |
+
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
|
| 171 |
+
<br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
|
| 172 |
+
</p>
|
| 173 |
</div>
|
| 174 |
+
""",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
visible=True,
|
| 176 |
)
|
| 177 |
|
src/synthetic_dataset_generator/apps/eval.py
CHANGED
|
@@ -713,7 +713,7 @@ with gr.Blocks() as app:
|
|
| 713 |
with gr.Column(scale=3):
|
| 714 |
success_message = gr.Markdown(visible=True)
|
| 715 |
with gr.Accordion(
|
| 716 |
-
"
|
| 717 |
open=False,
|
| 718 |
visible=False,
|
| 719 |
) as pipeline_code_ui:
|
|
|
|
| 713 |
with gr.Column(scale=3):
|
| 714 |
success_message = gr.Markdown(visible=True)
|
| 715 |
with gr.Accordion(
|
| 716 |
+
"Customize your pipeline with distilabel",
|
| 717 |
open=False,
|
| 718 |
visible=False,
|
| 719 |
) as pipeline_code_ui:
|
src/synthetic_dataset_generator/apps/sft.py
CHANGED
|
@@ -381,15 +381,13 @@ with gr.Blocks() as app:
|
|
| 381 |
"Create",
|
| 382 |
variant="primary",
|
| 383 |
)
|
| 384 |
-
with gr.Column(scale=
|
| 385 |
examples = gr.Examples(
|
| 386 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
| 387 |
inputs=[dataset_description],
|
| 388 |
cache_examples=False,
|
| 389 |
label="Examples",
|
| 390 |
)
|
| 391 |
-
with gr.Column(scale=1):
|
| 392 |
-
pass
|
| 393 |
|
| 394 |
gr.HTML(value="<hr>")
|
| 395 |
gr.Markdown(value="## 2. Configure your dataset")
|
|
@@ -437,12 +435,12 @@ with gr.Blocks() as app:
|
|
| 437 |
scale=1,
|
| 438 |
)
|
| 439 |
temperature = gr.Slider(
|
|
|
|
| 440 |
minimum=0.1,
|
| 441 |
maximum=1,
|
| 442 |
value=0.8,
|
| 443 |
step=0.1,
|
| 444 |
interactive=True,
|
| 445 |
-
show_label=False,
|
| 446 |
)
|
| 447 |
private = gr.Checkbox(
|
| 448 |
label="Private dataset",
|
|
@@ -456,7 +454,7 @@ with gr.Blocks() as app:
|
|
| 456 |
with gr.Column(scale=3):
|
| 457 |
success_message = gr.Markdown(visible=True)
|
| 458 |
with gr.Accordion(
|
| 459 |
-
"
|
| 460 |
open=False,
|
| 461 |
visible=False,
|
| 462 |
) as pipeline_code_ui:
|
|
|
|
| 381 |
"Create",
|
| 382 |
variant="primary",
|
| 383 |
)
|
| 384 |
+
with gr.Column(scale=3):
|
| 385 |
examples = gr.Examples(
|
| 386 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
| 387 |
inputs=[dataset_description],
|
| 388 |
cache_examples=False,
|
| 389 |
label="Examples",
|
| 390 |
)
|
|
|
|
|
|
|
| 391 |
|
| 392 |
gr.HTML(value="<hr>")
|
| 393 |
gr.Markdown(value="## 2. Configure your dataset")
|
|
|
|
| 435 |
scale=1,
|
| 436 |
)
|
| 437 |
temperature = gr.Slider(
|
| 438 |
+
label="Temperature",
|
| 439 |
minimum=0.1,
|
| 440 |
maximum=1,
|
| 441 |
value=0.8,
|
| 442 |
step=0.1,
|
| 443 |
interactive=True,
|
|
|
|
| 444 |
)
|
| 445 |
private = gr.Checkbox(
|
| 446 |
label="Private dataset",
|
|
|
|
| 454 |
with gr.Column(scale=3):
|
| 455 |
success_message = gr.Markdown(visible=True)
|
| 456 |
with gr.Accordion(
|
| 457 |
+
"Customize your pipeline with distilabel",
|
| 458 |
open=False,
|
| 459 |
visible=False,
|
| 460 |
) as pipeline_code_ui:
|
src/synthetic_dataset_generator/apps/textcat.py
CHANGED
|
@@ -355,15 +355,13 @@ with gr.Blocks() as app:
|
|
| 355 |
"Create",
|
| 356 |
variant="primary",
|
| 357 |
)
|
| 358 |
-
with gr.Column(scale=
|
| 359 |
examples = gr.Examples(
|
| 360 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
| 361 |
inputs=[dataset_description],
|
| 362 |
cache_examples=False,
|
| 363 |
label="Examples",
|
| 364 |
)
|
| 365 |
-
with gr.Column(scale=1):
|
| 366 |
-
pass
|
| 367 |
|
| 368 |
gr.HTML("<hr>")
|
| 369 |
gr.Markdown("## 2. Configure your dataset")
|
|
@@ -441,12 +439,12 @@ with gr.Blocks() as app:
|
|
| 441 |
scale=1,
|
| 442 |
)
|
| 443 |
temperature = gr.Slider(
|
|
|
|
| 444 |
minimum=0.1,
|
| 445 |
maximum=1,
|
| 446 |
value=0.8,
|
| 447 |
step=0.1,
|
| 448 |
interactive=True,
|
| 449 |
-
show_label=False,
|
| 450 |
)
|
| 451 |
private = gr.Checkbox(
|
| 452 |
label="Private dataset",
|
|
@@ -458,7 +456,7 @@ with gr.Blocks() as app:
|
|
| 458 |
with gr.Column(scale=3):
|
| 459 |
success_message = gr.Markdown(visible=True)
|
| 460 |
with gr.Accordion(
|
| 461 |
-
"
|
| 462 |
open=False,
|
| 463 |
visible=False,
|
| 464 |
) as pipeline_code_ui:
|
|
|
|
| 355 |
"Create",
|
| 356 |
variant="primary",
|
| 357 |
)
|
| 358 |
+
with gr.Column(scale=3):
|
| 359 |
examples = gr.Examples(
|
| 360 |
examples=DEFAULT_DATASET_DESCRIPTIONS,
|
| 361 |
inputs=[dataset_description],
|
| 362 |
cache_examples=False,
|
| 363 |
label="Examples",
|
| 364 |
)
|
|
|
|
|
|
|
| 365 |
|
| 366 |
gr.HTML("<hr>")
|
| 367 |
gr.Markdown("## 2. Configure your dataset")
|
|
|
|
| 439 |
scale=1,
|
| 440 |
)
|
| 441 |
temperature = gr.Slider(
|
| 442 |
+
label="Temperature",
|
| 443 |
minimum=0.1,
|
| 444 |
maximum=1,
|
| 445 |
value=0.8,
|
| 446 |
step=0.1,
|
| 447 |
interactive=True,
|
|
|
|
| 448 |
)
|
| 449 |
private = gr.Checkbox(
|
| 450 |
label="Private dataset",
|
|
|
|
| 456 |
with gr.Column(scale=3):
|
| 457 |
success_message = gr.Markdown(visible=True)
|
| 458 |
with gr.Accordion(
|
| 459 |
+
"Customize your pipeline with distilabel",
|
| 460 |
open=False,
|
| 461 |
visible=False,
|
| 462 |
) as pipeline_code_ui:
|
src/synthetic_dataset_generator/utils.py
CHANGED
|
@@ -28,7 +28,7 @@ def list_orgs(oauth_token: Union[OAuthToken, None] = None):
|
|
| 28 |
if data["auth"]["type"] == "oauth":
|
| 29 |
organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
|
| 30 |
elif data["auth"]["type"] == "access_token":
|
| 31 |
-
organizations = [org["name"] for org in data["orgs"]]
|
| 32 |
else:
|
| 33 |
organizations = [
|
| 34 |
entry["entity"]["name"]
|
|
|
|
| 28 |
if data["auth"]["type"] == "oauth":
|
| 29 |
organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
|
| 30 |
elif data["auth"]["type"] == "access_token":
|
| 31 |
+
organizations = [data["name"]] + [org["name"] for org in data["orgs"]]
|
| 32 |
else:
|
| 33 |
organizations = [
|
| 34 |
entry["entity"]["name"]
|