add full generation
README.md
CHANGED
@@ -1,12 +1,17 @@
 ---
 title: Dataset Rewriter
-emoji:
+emoji: ✍️✨
 colorFrom: purple
 colorTo: green
 sdk: gradio
 sdk_version: 4.42.0
 app_file: app.py
 pinned: false
+hf_oauth: true
+hf_oauth_scopes:
+- read-repos
+- write-repos
+- manage-repos
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
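With hf_oauth enabled, Gradio on Spaces passes the visitor's OAuth token to any event handler that declares a gr.OAuthToken-typed parameter; that is how rewrite_full_dataset in app.py below receives oauth_token. A minimal sketch of the mechanism, assuming a standalone Space (the handler and labels here are invented for illustration):

# Sketch: consuming the OAuth token that `hf_oauth: true` enables.
# `show_user` and the UI labels are invented, not part of this Space.
from typing import Optional

import gradio as gr
from huggingface_hub import whoami

def show_user(oauth_token: Optional[gr.OAuthToken]) -> str:
    if oauth_token is None:
        return "not logged in"
    # whoami() resolves a raw token string to the account that owns it
    return whoami(token=oauth_token.token)["name"]

with gr.Blocks() as demo:
    gr.LoginButton()
    user = gr.Textbox(label="Logged in as")
    gr.Button("Check").click(show_user, inputs=None, outputs=user)

demo.launch()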
app.py
CHANGED
@@ -4,13 +4,13 @@ import time
 from itertools import count, islice
 from multiprocessing.pool import ThreadPool
 from queue import Queue, Empty
-from typing import Any, Callable, Iterable, Iterator, TypeVar
+from typing import Any, Callable, Iterable, Iterator, Optional, TypeVar
 
 import gradio as gr
 import ijson
 import pandas as pd
 import requests
-from datasets import Features, Value, Sequence
+from datasets import Dataset, Features, Value, Sequence
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from huggingface_hub import InferenceClient
 
@@ -20,45 +20,83 @@ from utils import StringIteratorIO
 model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN_INFERENCE_API"))
 
+save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
 session = requests.Session()
 empty_dataframe = pd.DataFrame({"1": [], "2": [], "3": []})
 
+NAMESPACE = "lhoestq"
+
 NUM_ROWS_PREVIEW = 3
-REWRITE_DATASET = (
+MAX_NUM_ROWS_TO_REWRITE = 10
+REWRITE_DATASET_PREVIEW = (
     "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
-    "They want you to rewrite the dataset and apply this
+    "They want you to rewrite the dataset and apply this instruction, which can be about transforming, translating or filtering the rows: {prompt}. "
     "The first rows of the dataset are below in JSON format:\n\n{rows}\n\n"
-    "
-    "Try to keep some of the text or meaning intact, and apply the requested
+    "Apply the instruction to those rows from the '{dataset}' dataset and output the resulting rows using the same JSON format. "
+    "Try to keep some of the text or meaning intact, and apply the requested instruction '{prompt}'."
+)
+REWRITE_DATASET = (
+    "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
+    "They want you to rewrite the dataset and apply this instruction, which can be about transforming, translating or filtering the rows: {prompt}. "
+    "Here is an example:\n\nOriginal rows:\n{input_preview_rows}\n\nResulting rows:\n{output_preview_rows}\n\n"
+    "The rows of the dataset are below in JSON format:\n\n{rows}\n\n"
+    "Apply the instruction to those rows from the '{dataset}' dataset and output the resulting rows using the same JSON format. "
+    "Try to keep some of the text or meaning intact, and apply the requested instruction '{prompt}'."
+)
+FIND_NEW_NAME = (
+    "You are a helpful assistant specialized in transforming English sentences for machine learning practitioners. "
+    "Your job is to take input sentences like 'Take this dataset and apply the instruction xxx' and rephrase them as 'The dataset should be yyy'. "
+    "You should use adjectives and exactly follow the output formula 'The dataset should be yyy'. "
+    "Here is your first job: rephrase the sentence 'Take this dataset and apply the instruction \"{prompt}\"'"
 )
 
-
-
-
-
-
-)
+css = """
+.settings {
+    background: transparent;
+}
+.settings button span {
+    color: var(--body-text-color-subdued);
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    dataset_info_json = gr.JSON(visible=False)
     with gr.Row():
-        with gr.Column(scale=
-
-
-
-                search_type="dataset",
+        with gr.Column(scale=10):
+            gr.Markdown(
+                "# 🤗 WIP Dataset ReWriter ✍️✨\n\n"
+                "Adjust, translate or completely transform existing datasets.\n\n"
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with gr.Row():
+        with gr.Column(scale=3):
+            dataset_search = HuggingfaceHubSearch(
+                label="Hub Dataset ID",
+                placeholder="Search for dataset id on Huggingface",
+                search_type="dataset",
+            )
+            subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
+            split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
+
+            gr.Markdown("### Input")
+            pretty_input_preview = gr.DataFrame(interactive=False)
+
+            gr.Markdown("### ReWrite")
+            with gr.Group():
+                input_prompt = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
+                with gr.Accordion("(Advanced) Edit columns", open=False):
+                    output_format_dataframe = gr.DataFrame(col_count=(2, "fixed"), headers=["column", "type"])
+            rewrite_preview_button = gr.Button("Preview Results", variant="primary")
+            pretty_output_preview = gr.DataFrame(interactive=False)
+            rewrite_full_dataset_button = gr.Button("ReWrite Full Dataset", interactive=False)
+            full_dataset_generation_label = gr.Label(visible=False, show_label=False)
+            full_dataset_generation_success_markdown = gr.Markdown("")
+        with gr.Column(scale=4, min_width="200px"):
+            with gr.Accordion("Settings", open=False, elem_classes="settings"):
+                gr.Markdown("Save datasets to your account")
+                gr.LoginButton()
+                select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Select user or organization", visible=False)
+                gr.Markdown("Save datasets as public or private datasets")
+                visibility_radio = gr.Radio(["public", "private"], value="public", container=False, interactive=False)
 
 
 ############
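The FIND_NEW_NAME prompt above asks the model for a reply of the form "The dataset should be yyy"; find_new_name in the next hunk turns that reply into a repository-name suffix. A toy trace of that string handling (the model reply here is invented):

# Toy trace of the slug logic in find_new_name below; the reply is made up.
dataset = "stanfordnlp/imdb"
out = "The dataset should be translated to French."  # hypothetical model output
suffix = out.split("should be", 1)[1].replace(" ", "-").replace(".", "").replace(",", "")
print(dataset.split("/")[-1] + suffix)  # prints: imdb-translated-to-French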
@@ -110,19 +148,42 @@ with gr.Blocks() as demo:
             break
 
 
-    def
+    def stream_rewrite_dataset_preview_row_by_row(dataset: str, rows: list[dict[str, str]], prompt: str, format: str) -> Iterator[dict[str, str]]:
+        prompt = prompt[:1000] if prompt.strip() else ""
+        messages = [{"role": "user", "content": REWRITE_DATASET_PREVIEW.format(
+            dataset=dataset,
+            rows=json.dumps({"data": rows}),
+            prompt=prompt,
+        )}]
+        response_format = {"type": "json", "value": {"properties": {"data": {"type": "array", "items": format}}, "required": ["data"]}}
+        print("streaming preview")
+        yield from ijson.items(StringIteratorIO(stream_reponse(messages, response_format=response_format)), "data.item", buf_size=4)
+        print("done")
+
+
+    def stream_rewrite_dataset_row_by_row(dataset: str, rows: list[dict[str, str]], prompt: str, format: str, input_preview_rows: list[dict[str, str]], output_preview_rows: list[dict[str, str]]) -> Iterator[dict[str, str]]:
         prompt = prompt[:1000] if prompt.strip() else ""
         messages = [{"role": "user", "content": REWRITE_DATASET.format(
             dataset=dataset,
             rows=json.dumps({"data": rows}),
             prompt=prompt,
+            input_preview_rows=json.dumps({"data": input_preview_rows}),
+            output_preview_rows=json.dumps({"data": output_preview_rows}),
         )}]
-        response_format = {"type": "json", "value": {"properties": {"data": {"type": "array", "
-        print("
-        yield from
+        response_format = {"type": "json", "value": {"properties": {"data": {"type": "array", "items": format}}, "required": ["data"]}}
+        print("streaming results")
+        yield from ijson.items(StringIteratorIO(stream_reponse(messages, response_format=response_format)), "data.item", buf_size=4)
         print("done")
 
 
+    def find_new_name(dataset: str, prompt: str) -> str:
+        messages = [{"role": "user", "content": FIND_NEW_NAME.format(prompt=prompt)}]
+        out = "".join(stream_reponse(messages))
+        if "should be" in out:
+            return dataset.split("/")[-1] + out.split("should be", 1)[1].replace(" ", "-").replace(".", "").replace(",", "")
+        else:
+            return dataset.split("/")[-1] + prompt.replace(" ", "-")
+
     def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
         for i, result in enumerate(func(**kwargs)):
             queue.put(result)
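Two details in this hunk carry the main trick: response_format asks the Inference API to constrain generation to a JSON schema built from the dataset's columns, and ijson.items(..., "data.item", buf_size=4) parses the response while it is still streaming, so each rewritten row is yielded as soon as it is complete instead of after the whole generation finishes. A minimal, self-contained sketch of the parsing half; ChunkReader is a simplified stand-in for utils.StringIteratorIO and the chunk list stands in for stream_reponse (both are assumptions, not the repo's code):

# Sketch: incremental JSON parsing over a stream of text chunks.
import ijson

class ChunkReader:
    # file-like wrapper over an iterator of str chunks
    def __init__(self, chunks):
        self._chunks = iter(chunks)
        self._buf = ""

    def read(self, size=-1):
        # accumulate chunks until `size` characters are available or the stream ends
        while size < 0 or len(self._buf) < size:
            try:
                self._buf += next(self._chunks)
            except StopIteration:
                break
        if size < 0:
            out, self._buf = self._buf, ""
        else:
            out, self._buf = self._buf[:size], self._buf[size:]
        return out

# stands in for the LLM emitting the JSON document piecewise
chunks = ['{"data": [{"text": "hel', 'lo"}, {"text": "world"}', ']}']

# a small buf_size forces frequent reads, so rows surface before the stream ends
for row in ijson.items(ChunkReader(chunks), "data.item", buf_size=4):
    print(row)  # {'text': 'hello'}, then {'text': 'world'}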
@@ -195,6 +256,7 @@ with gr.Blocks() as demo:
         split = default_split if default_split in splits else splits[0]
         dict_format = features_to_format(Features.from_dict(info_resp["dataset_info"][subset]["features"]))
         return subset, split, {
+            dataset_info_json: info_resp["dataset_info"][subset],
             subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
             split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
             output_format_dataframe: pd.DataFrame([{"column": col, "type": json.dumps(format_type)} for col, format_type in dict_format["properties"].items()])
@@ -213,29 +275,60 @@ with gr.Blocks() as demo:
         }
 
 
-    @dataset_search.change(inputs=[dataset_search], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe])
+    @dataset_search.change(inputs=[dataset_search], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe, dataset_info_json])
     def show_input_from_dataset_search(dataset: str) -> dict:
         return _show_input_preview(dataset, default_subset="default", default_split="train")
 
-    @subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe])
+    @subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe, dataset_info_json])
     def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
         return _show_input_preview(dataset, default_subset=subset, default_split="train")
 
-    @split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe])
+    @split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe, dataset_info_json])
     def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
         return _show_input_preview(dataset, default_subset=subset, default_split=split)
+
+
+    @input_prompt.change(outputs=[rewrite_full_dataset_button])
+    def disable_rewrite_full_dataset() -> dict:
+        return {rewrite_full_dataset_button: gr.Button(interactive=False)}
 
 
-    @
-    def
+    @rewrite_preview_button.click(inputs=[dataset_search, pretty_input_preview, input_prompt, output_format_dataframe], outputs=[pretty_output_preview, rewrite_full_dataset_button, full_dataset_generation_label])
+    def rewrite_preview(dataset: str, pretty_input_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame) -> Iterator[pd.DataFrame]:
         rows = [{k: json.loads(v) for k, v in row.items()} for row in pretty_input_preview_df.to_dict(orient="records")]
         format = output_format_df.to_dict(orient="records")
         format = {"properties": {x["column"]: json.loads(x["type"]) for x in format}, "required": [x["column"] for x in format]}
         output_rows = []
-        print(f"ReWriting {dataset} with
-
-
-
+        print(f"ReWriting {dataset} preview with instruction '{prompt}'")
+        yield {rewrite_full_dataset_button: gr.Button(interactive=False), full_dataset_generation_label: gr.Label(visible=False)}
+        for row in stream_rewrite_dataset_preview_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=format):
+            output_rows.append({k: json.dumps(row[k], ensure_ascii=False) for k in output_format_df["column"]})
+            yield {pretty_output_preview: gr.DataFrame(pd.DataFrame(output_rows))}
+        yield {rewrite_full_dataset_button: gr.Button(interactive=True)}
+
+
+    @rewrite_full_dataset_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, pretty_input_preview, pretty_output_preview, input_prompt, output_format_dataframe, dataset_info_json, select_namespace_dropdown], outputs=[full_dataset_generation_label, full_dataset_generation_success_markdown])
+    def rewrite_full_dataset(dataset: str, subset: str, split: str, pretty_input_preview_df: pd.DataFrame, pretty_output_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame, dataset_info: dict[str, Any], namespace: str, oauth_token: Optional[gr.OAuthToken]) -> Iterator[pd.DataFrame]:
+        input_preview_rows = [{k: json.loads(v) for k, v in row.items()} for row in pretty_input_preview_df.to_dict(orient="records")]
+        output_preview_rows = [{k: json.loads(v) for k, v in row.items()} for row in pretty_output_preview_df.to_dict(orient="records")]
+        format = output_format_df.to_dict(orient="records")
+        format = {"properties": {x["column"]: json.loads(x["type"]) for x in format}, "required": [x["column"] for x in format]}
+        output_rows = []
+        num_examples = dataset_info["splits"][split]["num_examples"]
+        total = min(num_examples, MAX_NUM_ROWS_TO_REWRITE)
+        print(f"ReWriting {dataset} (full dataset) with instruction '{prompt}'")
+        yield {full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": 0.}, visible=True)}
+        for rows in batched(islice(stream_rows(dataset=dataset, subset=subset, split=split), total), n=10):
+            for row in stream_rewrite_dataset_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=format, input_preview_rows=input_preview_rows, output_preview_rows=output_preview_rows):
+                print(row)
+                output_rows.append({k: json.dumps(row[k], ensure_ascii=False) for k in output_format_df["column"]})
+                yield {full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": len(output_rows) / total})}
+        # repo_id = namespace + "/" + find_new_name(dataset, prompt)
+        # yield {full_dataset_generation_label: gr.Label({f"✅ ReWriting {dataset}": len(output_rows) / total, f"⚙️ Saving to {repo_id}": 0.})}
+        # token = oauth_token.token if oauth_token else save_dataset_hf_token
+        # Dataset.from_list(output_rows).push_to_hub(repo_id, config_name=subset, split=split, token=token)
+        # yield {full_dataset_generation_label: gr.Label({f"✅ ReWriting {dataset}": len(output_rows) / total, f"✅ Saving to {repo_id}": 1.})}
+        # yield {full_dataset_generation_success_markdown: f"# Open the ReWritten dataset in a new tab: [{repo_id}](https://huggingface.co/datasets/{repo_id})"}
 
 
 demo.launch()
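The save step at the end of rewrite_full_dataset is still commented out in this commit, which only streams progress into the label. For reference, a minimal sketch of what that push would do once enabled; the rows and repo id are placeholders, not values from the commit:

# Sketch of the commented-out save step; repo_id and rows are placeholders.
from datasets import Dataset

output_rows = [{"text": "Super film !"}, {"text": "Intrigue terrible."}]
repo_id = "my-namespace/imdb-translated-to-French"  # hypothetical

# config_name and split mirror the arguments of the commented-out call above;
# token=None falls back to the token saved by `huggingface-cli login`
Dataset.from_list(output_rows).push_to_hub(repo_id, config_name="default", split="train", token=None)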