lhoestq HF staff committed on
Commit
bd6dcb9
·
1 Parent(s): 91400d0

add full generation

Browse files
Files changed (2) hide show
  1. README.md +6 -1
  2. app.py +137 -44
README.md CHANGED
@@ -1,12 +1,17 @@
1
  ---
2
  title: Dataset Rewriter
3
- emoji: 🏃
4
  colorFrom: purple
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.42.0
8
  app_file: app.py
9
  pinned: false
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Dataset Rewriter
3
+ emoji: ✍️✨
4
  colorFrom: purple
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.42.0
8
  app_file: app.py
9
  pinned: false
10
+ hf_oauth: true
11
+ hf_oauth_scopes:
12
+ - read-repos
13
+ - write-repos
14
+ - manage-repos
15
  ---
16
 
17
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -4,13 +4,13 @@ import time
4
  from itertools import count, islice
5
  from multiprocessing.pool import ThreadPool
6
  from queue import Queue, Empty
7
- from typing import Any, Callable, Iterable, Iterator, TypeVar
8
 
9
  import gradio as gr
10
  import ijson
11
  import pandas as pd
12
  import requests
13
- from datasets import Features, Value, Sequence
14
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
15
  from huggingface_hub import InferenceClient
16
 
@@ -20,45 +20,83 @@ from utils import StringIteratorIO
20
  model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
21
  client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN_INFERENCE_API"))
22
 
 
23
  session = requests.Session()
24
  empty_dataframe = pd.DataFrame({"1": [], "2": [], "3": []})
25
 
 
 
26
  NUM_ROWS_PREVIEW = 3
27
- REWRITE_DATASET = (
 
28
  "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
29
- "They want you to rewrite the dataset and apply this transformation: {prompt}."
30
  "The first rows of the dataset are below in JSON format:\n\n{rows}\n\n"
31
- "Rewrite those rows from the '{dataset}' dataset using the same JSON format. "
32
- "Try to keep some of the text or meaning intact, and apply the requested transformation '{prompt}'."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  )
34
 
35
-
36
- with gr.Blocks() as demo:
37
- gr.Markdown(
38
- "# 🤗 WIP Dataset ReWriter ✍️✨\n\n"
39
- "Adjust, translate or transform completely existing datasets.\n\n"
40
- )
 
 
 
 
 
41
  with gr.Row():
42
- with gr.Column(scale=3):
43
- dataset_search = HuggingfaceHubSearch(
44
- label="Hub Dataset ID",
45
- placeholder="Search for dataset id on Huggingface",
46
- search_type="dataset",
47
  )
48
- subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
49
- split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
50
-
51
- gr.Markdown("### Input")
52
- pretty_input_preview = gr.DataFrame(interactive=False)
53
-
54
- gr.Markdown("### ReWrite")
55
- with gr.Group():
56
- input_prompt = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
57
- with gr.Accordion("(Advanced) Edit columns", open=False):
58
- output_format_dataframe = gr.DataFrame(col_count=(2, "fixed"), headers=["column", "type"])
59
- rewrite_button = gr.Button("ReWrite Dataset", variant="primary")
60
- pretty_output_preview = gr.DataFrame(interactive=False)
61
- save_button = gr.Button("ReWrite Full Dataset", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  ############
@@ -110,19 +148,42 @@ with gr.Blocks() as demo:
110
  break
111
 
112
 
113
- def stream_rewrite_dataset_row_by_row(dataset: str, rows: list[dict[str, str]], prompt: str, format: str) -> Iterator[dict[str, str]]:
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  prompt = prompt[:1000] if prompt.strip() else ""
115
  messages = [{"role": "user", "content": REWRITE_DATASET.format(
116
  dataset=dataset,
117
  rows=json.dumps({"data": rows}),
118
  prompt=prompt,
 
 
119
  )}]
120
- response_format = {"type": "json", "value": {"properties": {"data": {"type": "array", "maxItems": len(rows), "minItems": len(rows), "items": format}}, "required": ["data"]}}
121
- print("go")
122
- yield from islice(ijson.items(StringIteratorIO(stream_reponse(messages, response_format=response_format)), "data.item", buf_size=4), len(rows))
123
  print("done")
124
 
125
 
 
 
 
 
 
 
 
 
126
  def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
127
  for i, result in enumerate(func(**kwargs)):
128
  queue.put(result)
@@ -195,6 +256,7 @@ with gr.Blocks() as demo:
195
  split = default_split if default_split in splits else splits[0]
196
  dict_format = features_to_format(Features.from_dict(info_resp["dataset_info"][subset]["features"]))
197
  return subset, split, {
 
198
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
199
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
200
  output_format_dataframe: pd.DataFrame([{"column": col, "type": json.dumps(format_type)} for col, format_type in dict_format["properties"].items()])
@@ -213,29 +275,60 @@ with gr.Blocks() as demo:
213
  }
214
 
215
 
216
- @dataset_search.change(inputs=[dataset_search], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe])
217
  def show_input_from_dataset_search(dataset: str) -> dict:
218
  return _show_input_preview(dataset, default_subset="default", default_split="train")
219
 
220
- @subset_dropdown.change(inputs=[dataset_search, subset_dropdown], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe])
221
  def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
222
  return _show_input_preview(dataset, default_subset=subset, default_split="train")
223
 
224
- @split_dropdown.change(inputs=[dataset_search, subset_dropdown, split_dropdown], outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe])
225
  def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
226
  return _show_input_preview(dataset, default_subset=subset, default_split=split)
 
 
 
 
 
227
 
228
 
229
- @rewrite_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, pretty_input_preview, input_prompt, output_format_dataframe], outputs=[pretty_output_preview])
230
- def rewrite(dataset: str, subset: str, split: str, pretty_input_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame) -> Iterator[pd.DataFrame]:
231
  rows = [{k: json.loads(v) for k, v in row.items()} for row in pretty_input_preview_df.to_dict(orient="records")]
232
  format = output_format_df.to_dict(orient="records")
233
  format = {"properties": {x["column"]: json.loads(x["type"]) for x in format}, "required": [x["column"] for x in format]}
234
  output_rows = []
235
- print(f"ReWriting {dataset} with instructions '{prompt}'")
236
- for row in stream_rewrite_dataset_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=format):
237
- output_rows.append({k: json.dumps(row[k], ensure_ascii=False) for k in pretty_input_preview_df.columns})
238
- yield pd.DataFrame(output_rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
 
241
  demo.launch()
 
4
  from itertools import count, islice
5
  from multiprocessing.pool import ThreadPool
6
  from queue import Queue, Empty
7
+ from typing import Any, Callable, Iterable, Iterator, Optional, TypeVar
8
 
9
  import gradio as gr
10
  import ijson
11
  import pandas as pd
12
  import requests
13
+ from datasets import Dataset, Features, Value, Sequence
14
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
15
  from huggingface_hub import InferenceClient
16
 
 
20
  model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
21
  client = InferenceClient(model_id, token=os.environ.get("HF_TOKEN_INFERENCE_API"))
22
 
23
+ save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
24
  session = requests.Session()
25
  empty_dataframe = pd.DataFrame({"1": [], "2": [], "3": []})
26
 
27
+ NAMESPACE = "lhoestq"
28
+
29
  NUM_ROWS_PREVIEW = 3
30
+ MAX_NUM_ROWS_TO_REWRITE = 10
31
# Prompt template used to rewrite only the preview rows.
# Placeholders: {dataset} (dataset id), {prompt} (user instruction), {rows} (JSON-encoded input rows).
REWRITE_DATASET_PREVIEW = (
    "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
    # Trailing space after "{prompt}." keeps the next sentence from being glued to the instruction.
    "They want you to rewrite the dataset and apply this instruction, which can be about transforming, translating or filtering the rows: {prompt}. "
    "The first rows of the dataset are below in JSON format:\n\n{rows}\n\n"
    "Apply the instruction to those rows from the '{dataset}' dataset and output the resulting rows using the same JSON format. "
    "Try to keep some of the text or meaning intact, and apply the requested instruction '{prompt}'."
)
38
# Prompt template used to rewrite a batch of rows of the full dataset.
# Placeholders: {dataset}, {prompt}, {rows} (JSON-encoded batch), plus
# {input_preview_rows} / {output_preview_rows}, which are shown to the model as a worked example.
REWRITE_DATASET = (
    "A Machine Learning practitioner is looking for a dataset similar to '{dataset}' but slightly different. "
    # Trailing space after "{prompt}." keeps the next sentence from being glued to the instruction.
    "They want you to rewrite the dataset and apply this instruction, which can be about transforming, translating or filtering the rows: {prompt}. "
    # "\n\nResulting rows" replaces the invalid escape "\R", which emitted a literal backslash.
    "Here is an example:\n\nOriginal rows:\n{input_preview_rows}\n\nResulting rows:\n{output_preview_rows}\n\n"
    "The rows of the dataset are below in JSON format:\n\n{rows}\n\n"
    "Apply the instruction to those rows from the '{dataset}' dataset and output the resulting rows using the same JSON format. "
    "Try to keep some of the text or meaning intact, and apply the requested instruction '{prompt}'."
)
46
# Prompt template asking the model to rephrase the user instruction as
# "The dataset should be yyy". Placeholder: {prompt} (user instruction).
FIND_NEW_NAME = (
    "You are a helpful assistant specialized in transforming english sentences for machine learning practitioners. "
    "Your job is to take input sentences like 'Take this dataset and apply the instruction xxx' and rephrase them as 'The dataset should be yyy'. "
    "You should use adjectives and exactly follow the output formula 'The dataset should be yyy'. "
    "Here is your first job: rephrase the sentence 'Take this dataset and apply the instruction \"{prompt}\"'"
)
52
 
53
+ css = """
54
+ .settings {
55
+ background: transparent;
56
+ }
57
+ .settings button span {
58
+ color: var(--body-text-color-subdued);
59
+ }
60
+ """
61
+
62
+ with gr.Blocks(css=css) as demo:
63
+ dataset_info_json = gr.JSON(visible=False)
64
  with gr.Row():
65
+ with gr.Column(scale=10):
66
+ gr.Markdown(
67
+ "# 🤗 WIP Dataset ReWriter ✍️✨\n\n"
68
+ "Adjust, translate or transform completely existing datasets.\n\n"
 
69
  )
70
+ with gr.Row():
71
+ with gr.Column(scale=3):
72
+ dataset_search = HuggingfaceHubSearch(
73
+ label="Hub Dataset ID",
74
+ placeholder="Search for dataset id on Huggingface",
75
+ search_type="dataset",
76
+ )
77
+ subset_dropdown = gr.Dropdown(info="Subset", show_label=False, visible=False)
78
+ split_dropdown = gr.Dropdown(info="Split", show_label=False, visible=False)
79
+
80
+ gr.Markdown("### Input")
81
+ pretty_input_preview = gr.DataFrame(interactive=False)
82
+
83
+ gr.Markdown("### ReWrite")
84
+ with gr.Group():
85
+ input_prompt = gr.Textbox(label="Enter the adjustment or transformation to apply to the dataset:")
86
+ with gr.Accordion("(Advanced) Edit columns", open=False):
87
+ output_format_dataframe = gr.DataFrame(col_count=(2, "fixed"), headers=["column", "type"])
88
+ rewrite_preview_button = gr.Button("Preview Results", variant="primary")
89
+ pretty_output_preview = gr.DataFrame(interactive=False)
90
+ rewrite_full_dataset_button = gr.Button("ReWrite Full Dataset", interactive=False)
91
+ full_dataset_generation_label = gr.Label(visible=False, show_label=False)
92
+ full_dataset_generation_success_markdown = gr.Markdown("")
93
+ with gr.Column(scale=4, min_width="200px"):
94
+ with gr.Accordion("Settings", open=False, elem_classes="settings"):
95
+ gr.Markdown("Save datasets to your account")
96
+ gr.LoginButton()
97
+ select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Select user or organization", visible=False)
98
+ gr.Markdown("Save datasets as public or private datasets")
99
+ visibility_radio = gr.Radio(["public", "private"], value="public", container=False, interactive=False)
100
 
101
 
102
  ############
 
148
  break
149
 
150
 
151
def stream_rewrite_dataset_preview_row_by_row(dataset: str, rows: list[dict[str, str]], prompt: str, format: str) -> Iterator[dict[str, str]]:
    """Yield the rewritten preview rows one at a time as the response streams in.

    The REWRITE_DATASET_PREVIEW template is filled with the dataset name, the
    JSON-encoded ``rows`` (wrapped as ``{"data": rows}``) and the prompt. The
    prompt is cut to 1000 characters; a whitespace-only prompt becomes "".
    The request asks for a JSON object with a required "data" array whose
    items follow ``format``, and each item under "data" is parsed by ijson
    and yielded.
    """
    if prompt.strip():
        prompt = prompt[:1000]
    else:
        prompt = ""
    user_content = REWRITE_DATASET_PREVIEW.format(
        dataset=dataset,
        rows=json.dumps({"data": rows}),
        prompt=prompt,
    )
    messages = [{"role": "user", "content": user_content}]
    data_schema = {"type": "array", "items": format}
    response_format = {"type": "json", "value": {"properties": {"data": data_schema}, "required": ["data"]}}
    print("streaming preview")
    json_stream = StringIteratorIO(stream_reponse(messages, response_format=response_format))
    yield from ijson.items(json_stream, "data.item", buf_size=4)
    print("done")
162
+
163
+
164
def stream_rewrite_dataset_row_by_row(dataset: str, rows: list[dict[str, str]], prompt: str, format: str, input_preview_rows: list[dict[str, str]], output_preview_rows: list[dict[str, str]]) -> Iterator[dict[str, str]]:
    """Yield the rewritten rows of one batch as the response streams in.

    The REWRITE_DATASET template is filled with the dataset name, the
    JSON-encoded batch ``rows``, the prompt, and the JSON-encoded input and
    output preview rows; every row list is wrapped as ``{"data": ...}``. The
    prompt is cut to 1000 characters; a whitespace-only prompt becomes "".
    The request asks for a JSON object with a required "data" array whose
    items follow ``format``, and each item under "data" is parsed by ijson
    and yielded.
    """
    if prompt.strip():
        prompt = prompt[:1000]
    else:
        prompt = ""
    user_content = REWRITE_DATASET.format(
        dataset=dataset,
        rows=json.dumps({"data": rows}),
        prompt=prompt,
        input_preview_rows=json.dumps({"data": input_preview_rows}),
        output_preview_rows=json.dumps({"data": output_preview_rows}),
    )
    messages = [{"role": "user", "content": user_content}]
    data_schema = {"type": "array", "items": format}
    response_format = {"type": "json", "value": {"properties": {"data": data_schema}, "required": ["data"]}}
    print("streaming results")
    json_stream = StringIteratorIO(stream_reponse(messages, response_format=response_format))
    yield from ijson.items(json_stream, "data.item", buf_size=4)
    print("done")
177
 
178
 
179
def find_new_name(dataset: str, prompt: str) -> str:
    """Build a name for the rewritten dataset from the original name and the instruction.

    The model is asked, via the FIND_NEW_NAME template, to rephrase the
    instruction as "The dataset should be yyy". Whatever follows "should be"
    becomes the suffix; if the model does not follow the formula, the raw
    prompt is used instead.

    Args:
        dataset: Source dataset id; only the part after the last "/" is kept.
        prompt: The user instruction applied to the dataset.

    Returns:
        "<dataset name>-<suffix>", or just the dataset name when the suffix
        is empty after sanitising.
    """
    import re  # local import: only this helper needs it

    def _slug(text: str) -> str:
        # Collapse every run of characters other than ASCII letters, digits
        # and "_" into a single "-", then drop leading and trailing "-".
        # This also removes newlines, quotes and punctuation from the model output.
        return re.sub(r"[^A-Za-z0-9_]+", "-", text).strip("-")

    base_name = dataset.split("/")[-1]
    messages = [{"role": "user", "content": FIND_NEW_NAME.format(prompt=prompt)}]
    out = "".join(stream_reponse(messages))
    description = out.split("should be", 1)[1] if "should be" in out else prompt
    suffix = _slug(description)
    # Both branches now share the same "-" separator between name and suffix.
    return f"{base_name}-{suffix}" if suffix else base_name
186
+
187
  def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
188
  for i, result in enumerate(func(**kwargs)):
189
  queue.put(result)
 
256
  split = default_split if default_split in splits else splits[0]
257
  dict_format = features_to_format(Features.from_dict(info_resp["dataset_info"][subset]["features"]))
258
  return subset, split, {
259
+ dataset_info_json: info_resp["dataset_info"][subset],
260
  subset_dropdown: gr.Dropdown(value=subset, choices=subsets, visible=len(subsets) > 1),
261
  split_dropdown: gr.Dropdown(value=split, choices=splits, visible=len(splits) > 1),
262
  output_format_dataframe: pd.DataFrame([{"column": col, "type": json.dumps(format_type)} for col, format_type in dict_format["properties"].items()])
 
275
  }
276
 
277
 
278
@dataset_search.change(
    inputs=[dataset_search],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe, dataset_info_json],
)
def show_input_from_dataset_search(dataset: str) -> dict:
    """Refresh the input preview when the searched dataset changes.

    Starts from the "default" subset and the "train" split.
    """
    preview_updates = _show_input_preview(dataset, default_subset="default", default_split="train")
    return preview_updates
281
 
282
@subset_dropdown.change(
    inputs=[dataset_search, subset_dropdown],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe, dataset_info_json],
)
def show_input_from_subset_dropdown(dataset: str, subset: str) -> dict:
    """Refresh the input preview when the subset selection changes.

    Keeps the chosen subset and starts from the "train" split.
    """
    preview_updates = _show_input_preview(dataset, default_subset=subset, default_split="train")
    return preview_updates
285
 
286
@split_dropdown.change(
    inputs=[dataset_search, subset_dropdown, split_dropdown],
    outputs=[pretty_input_preview, subset_dropdown, split_dropdown, output_format_dataframe, dataset_info_json],
)
def show_input_from_split_dropdown(dataset: str, subset: str, split: str) -> dict:
    """Refresh the input preview when the split selection changes.

    Keeps both the chosen subset and the chosen split.
    """
    preview_updates = _show_input_preview(dataset, default_subset=subset, default_split=split)
    return preview_updates
289
+
290
+
291
@input_prompt.change(outputs=[rewrite_full_dataset_button])
def disable_rewrite_full_dataset() -> dict:
    """Make the "ReWrite Full Dataset" button non-interactive whenever the prompt text changes."""
    disabled_button = gr.Button(interactive=False)
    return {rewrite_full_dataset_button: disabled_button}
294
 
295
 
296
@rewrite_preview_button.click(
    inputs=[dataset_search, pretty_input_preview, input_prompt, output_format_dataframe],
    outputs=[pretty_output_preview, rewrite_full_dataset_button, full_dataset_generation_label],
)
def rewrite_preview(dataset: str, pretty_input_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame) -> Iterator[pd.DataFrame]:
    """Rewrite the preview rows and stream them into the output preview table.

    Yields component-update dicts. The first one disables the full-rewrite
    button and hides the generation label. Then one update follows per
    rewritten row, carrying the output table built so far. The last one
    re-enables the full-rewrite button.
    """
    # Every cell of the input preview holds a JSON-encoded value: decode them all.
    rows = []
    for record in pretty_input_preview_df.to_dict(orient="records"):
        rows.append({column: json.loads(cell) for column, cell in record.items()})

    # Build the per-row schema from the (column, type) table; every column is required.
    column_specs = output_format_df.to_dict(orient="records")
    properties = {spec["column"]: json.loads(spec["type"]) for spec in column_specs}
    required = [spec["column"] for spec in column_specs]
    row_schema = {"properties": properties, "required": required}

    output_rows = []
    print(f"ReWriting {dataset} preview with instruction '{prompt}'")
    yield {rewrite_full_dataset_button: gr.Button(interactive=False), full_dataset_generation_label: gr.Label(visible=False)}
    for row in stream_rewrite_dataset_preview_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=row_schema):
        encoded_row = {}
        for column in output_format_df["column"]:
            # Cells are re-encoded as JSON text, keeping non-ASCII characters readable.
            encoded_row[column] = json.dumps(row[column], ensure_ascii=False)
        output_rows.append(encoded_row)
        yield {pretty_output_preview: gr.DataFrame(pd.DataFrame(output_rows))}
    yield {rewrite_full_dataset_button: gr.Button(interactive=True)}
308
+
309
+
310
@rewrite_full_dataset_button.click(inputs=[dataset_search, subset_dropdown, split_dropdown, pretty_input_preview, pretty_output_preview, input_prompt, output_format_dataframe, dataset_info_json, select_namespace_dropdown], outputs=[full_dataset_generation_label, full_dataset_generation_success_markdown])
def rewrite_full_dataset(dataset: str, subset: str, split: str, pretty_input_preview_df: pd.DataFrame, pretty_output_preview_df: pd.DataFrame, prompt: str, output_format_df: pd.DataFrame, dataset_info: dict[str, Any], namespace: str, oauth_token: Optional[gr.OAuthToken]) -> Iterator[dict]:
    """Rewrite up to MAX_NUM_ROWS_TO_REWRITE rows of the selected split and stream progress.

    The input and output preview tables are decoded from JSON and passed to
    the model as a worked example. Rows are read with ``stream_rows``,
    grouped in batches of 10 and rewritten batch by batch.

    Yields component-update dicts for the generation label: one that shows
    it at 0 progress, then one per rewritten row with the updated fraction.

    ``namespace`` and ``oauth_token`` are unused while the upload step at the
    end of the function stays commented out.
    """
    input_preview_rows = [{k: json.loads(v) for k, v in row.items()} for row in pretty_input_preview_df.to_dict(orient="records")]
    output_preview_rows = [{k: json.loads(v) for k, v in row.items()} for row in pretty_output_preview_df.to_dict(orient="records")]
    format = output_format_df.to_dict(orient="records")
    format = {"properties": {x["column"]: json.loads(x["type"]) for x in format}, "required": [x["column"] for x in format]}
    output_rows = []
    num_examples = dataset_info["splits"][split]["num_examples"]
    total = min(num_examples, MAX_NUM_ROWS_TO_REWRITE)
    print(f"ReWriting {dataset} (full dataset) with instruction '{prompt}'")
    yield {full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": 0.}, visible=True)}
    for rows in batched(islice(stream_rows(dataset=dataset, subset=subset, split=split), total), n=10):
        for row in stream_rewrite_dataset_row_by_row(dataset=dataset, rows=rows, prompt=prompt, format=format, input_preview_rows=input_preview_rows, output_preview_rows=output_preview_rows):
            print(row)
            output_rows.append({k: json.dumps(row[k], ensure_ascii=False) for k in output_format_df["column"]})
            # The response schema does not bound the number of returned rows, so the
            # model may produce more rows than requested: cap the fraction at 1.
            # `total` is > 0 here, otherwise islice would have produced no batch.
            progress = min(1.0, len(output_rows) / total)
            yield {full_dataset_generation_label: gr.Label({f"⚙️ ReWriting {dataset}": progress})}
    # TODO: re-enable saving the rewritten rows to the Hub.
    # repo_id = namespace + "/" + find_new_name(dataset, prompt)
    # yield {full_dataset_generation_label: gr.Label({f"✅ ReWriting {dataset}": len(output_rows) / total, f"⚙️ Saving to {repo_id}": 0.})}
    # token = oauth_token.token if oauth_token else save_dataset_hf_token
    # Dataset.from_list(output_rows).push_to_hub(repo_id, config_name=subset, split=split, token=token)
    # yield {full_dataset_generation_label: gr.Label({f"✅ ReWriting {dataset}": len(output_rows) / total, f"✅ Saving to {repo_id}": 1.})}
    # yield {full_dataset_generation_success_markdown: f"# Open the ReWriten dataset in a new tab: [{repo_id}](https://huggingface.co/datasets/{repo_id})"}
332
 
333
 
334
  demo.launch()