import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi
from httpx import Client
"""
TODOs:
- Handle errors
- Add more commands to the notebook
- Parametrize the commands (Move to another file)
- How to handle configs and splits? -> Got from /compatible-libraries
- Let user choose the framework
- Use an LLM to suggest commands
- Add commands for auto training
- Improve logs
- Enable 'generate notebook' button only if dataset is available and supports library
"""
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
headers = {"Accept": "application/json", "Content-Type": "application/json"}
client = Client(headers=headers)
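

# Ask the datasets-server which libraries can load this dataset; the response
# includes ready-made loading code snippets per library. Assumed (abridged)
# shape, inferred from how it is used in generate_notebook below:
# {"libraries": [{"library": "pandas", "loading_codes": [{"code": "..."}]}, ...]}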
def get_compatible_libraries(dataset: str):
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
    )
    return resp.json()
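

# Build a Jupyter notebook with one code cell per command and write it to disk.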
def create_notebook_file(cell_commands, notebook_name):
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
    with open(notebook_name, "w") as f:
        nbf.write(nb, f)
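

# Upload the generated notebook to the dataset repo on the Hub and return an
# HTML link pointing at the uploaded file.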
def push_notebook(file_path, dataset_id, token):
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=notebook_name,
        repo_id=dataset_id,
        repo_type="dataset",
    )
    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
    html = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>'
    return gr.HTML(value=html, visible=True)
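

# Build the notebook for a dataset: fetch the pandas loading code from the
# datasets-server and write the .ipynb file offered for download. If the
# dataset is not pandas-compatible, keep the download link and auth row hidden.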
def generate_notebook(dataset_id):
    first_code = f"import pandas as pd\n\ndf = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')"
    try:
        libraries = get_compatible_libraries(dataset_id)["libraries"]
    except Exception as err:
        print(f"Error: {err}")
        # Hide the outputs if the compatible-libraries lookup fails
        return gr.File(visible=False), gr.Row(visible=False)
    if pandas_library := next(
        (element for element in libraries if element["library"] == "pandas"), None
    ):
        first_code = pandas_library["loading_codes"][0]["code"]
    else:
        return gr.File(visible=False), gr.Row(visible=False)
    commands = [
        "!pip install pandas",
        first_code,
        "df.head()",
    ]
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)
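

# Gradio UI: search for a dataset, preview it with the embedded viewer,
# generate the notebook, and optionally push it to the dataset repo on the Hub.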
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )
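
    # Re-render the embedded dataset viewer whenever the selected dataset changes.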
    @gr.render(inputs=dataset_name)
    def embed(name):
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
            src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
            frameborder="0"
            width="100%"
            height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)
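
    # Auth section, hidden until a notebook has been generated: asks for a
    # write token so the notebook can be pushed to the dataset repository.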
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            auth_title = gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)
    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)
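
    # Generating the notebook reveals the download link and the auth section.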
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )
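
    # Show the push button only once a non-empty token has been entered.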
    def auth(token):
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )
demo.launch()