import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi
from httpx import Client
"""
TODOs:
- Handle errors
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- How to handle configs and splits? -> Got from /compatible-libraries
- Let user choose the framework
- Use an LLM to suggest commands
- Add commands for auto training
- Improve logs
- Enable 'generate notebook' button only if dataset is available and supports library
"""
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
headers = {"Accept": "application/json", "Content-Type": "application/json"}
client = Client(headers=headers)
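

# Ask the datasets-server which libraries can load this dataset; the response
# includes ready-made loading code snippets per library. Assumed (abridged)
# shape, inferred from how it is used in generate_notebook below:
# {"libraries": [{"library": "pandas", "loading_codes": [{"code": "..."}]}, ...]}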
def get_compatible_libraries(dataset: str):
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
    )
    return resp.json()
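

# Build a Jupyter notebook with one code cell per command and write it to disk.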
def create_notebook_file(cell_commands, notebook_name):
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
    with open(notebook_name, "w") as f:
        nbf.write(nb, f)
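

# Upload the generated notebook to the dataset repo on the Hub and return an
# HTML link pointing at the uploaded file.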
def push_notebook(file_path, dataset_id, token):
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=notebook_name,
        repo_id=dataset_id,
        repo_type="dataset",
    )
    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
    html = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>'
    return gr.HTML(value=html, visible=True)
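

# Build the notebook for a dataset: fetch the pandas loading code from the
# datasets-server and write the .ipynb file offered for download. If the
# dataset is not pandas-compatible, keep the download link and auth row hidden.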
def generate_notebook(dataset_id):
    first_code = f"import pandas as pd\n\ndf = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')"
    try:
        libraries = get_compatible_libraries(dataset_id)["libraries"]
    except Exception as err:
        print(f"Error: {err}")
        # Hide the outputs if the compatible-libraries lookup fails
        return gr.File(visible=False), gr.Row(visible=False)
    if pandas_library := next(
        (element for element in libraries if element["library"] == "pandas"), None
    ):
        first_code = pandas_library["loading_codes"][0]["code"]
    else:
        return gr.File(visible=False), gr.Row(visible=False)
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
    ]
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)
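

# Gradio UI: search for a dataset, preview it with the embedded viewer,
# generate the notebook, and optionally push it to the dataset repo on the Hub.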
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )
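
    # Re-render the embedded dataset viewer whenever the selected dataset changes.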
    @gr.render(inputs=dataset_name)
    def embed(name):
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
            src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
            frameborder="0"
            width="100%"
            height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)
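
    # Auth section, hidden until a notebook has been generated: asks for a
    # write token so the notebook can be pushed to the dataset repository.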
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            auth_title = gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)
    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)
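
    # Generating the notebook reveals the download link and the auth section.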
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )
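
    # Show the push button only once a non-empty token has been entered.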
    def auth(token):
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )
demo.launch()