File size: 3,846 Bytes
421b068
 
 
 
 
7d529e0
734af25
 
 
 
 
 
 
 
 
 
 
 
 
421b068
 
7d529e0
421b068
7d529e0
421b068
7d529e0
421b068
 
7d529e0
6d0709a
 
 
 
 
 
 
 
7d529e0
6d0709a
7d529e0
 
 
6d0709a
 
7d529e0
 
6d0709a
421b068
7d529e0
 
6d0709a
7d529e0
421b068
 
 
 
6d0709a
421b068
7d529e0
421b068
 
 
7d529e0
 
 
 
 
421b068
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734af25
6d0709a
734af25
7d529e0
 
 
6d0709a
 
 
 
 
7d529e0
 
6d0709a
 
 
 
 
 
7d529e0
 
 
6d0709a
 
7d529e0
6d0709a
 
734af25
6d0709a
 
 
 
 
421b068
6d0709a
7d529e0
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi


"""
TODOs:
- Show auth and push button only after notebook creation
- Improve the link to the result notebook
- Handle errors
- Add more commands to the notebook
- Parametrize the commands
- How to handle configs and splits?
- Let user choose the framework
- Improve logs
"""


def create_notebook_file(cell_commands, notebook_name="generated_notebook.ipynb"):
    """Write a Jupyter notebook containing one code cell per command.

    Args:
        cell_commands: Iterable of source strings; each one becomes a code
            cell, in the given order.
        notebook_name: Path of the notebook file to write. The file is opened
            in ``"w"`` mode, so an existing file at that path is overwritten.
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]

    # Pin the encoding: without it the platform's locale encoding is used,
    # which can fail on (or mangle) non-ASCII cell content.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)

    print(f"Notebook '{notebook_name}' created successfully.")


def push_notebook(file_path, dataset_id, token):
    """Upload a notebook file to a dataset repo on the Hugging Face Hub.

    Args:
        file_path: Local path of the notebook file to upload.
        dataset_id: Id of the target dataset repository.
        token: Hugging Face access token used to build the ``HfApi`` client.

    Returns:
        An HTML anchor snippet ("See notebook") linking to the uploaded file
        on the ``main`` branch of the dataset repo.
    """
    # One name shared by the upload target and the returned link, so the two
    # cannot drift apart (the link previously pointed at a different file
    # name than the one actually uploaded).
    path_in_repo = "dataset_analysis.ipynb"

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=path_in_repo,
        repo_id=dataset_id,
        repo_type="dataset",
    )
    # TODO: Handle permission error
    print("Notebook uploaded to Huggingface Hub.")
    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{path_in_repo}"
    return f'<a target="_blank" href="{link}"  style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">See notebook</a>'


def generate_notebook(dataset_id):
    """Create a starter analysis notebook for a Hub dataset.

    The notebook installs pandas, loads the dataset's parquet file into a
    DataFrame and displays its first rows.

    Args:
        dataset_id: Id of the dataset on the Hugging Face Hub. Any ``/`` in it
            is replaced by ``-`` to build the local file name.

    Returns:
        The file name of the notebook written to the current directory.
    """
    # TODO: Get first config and split? or generate a dataframe per each split maybe?
    # NOTE: the parquet path below is hard-coded to a single "train" shard
    # under data/; datasets laid out differently will fail when the notebook
    # is executed.
    commands = [
        "!pip install pandas",
        "import pandas as pd",
        f"df = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')",
        "df.head()",
    ]
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    # Nothing is uploaded here; the notebook is only written locally, so the
    # log message must not claim an upload happened.
    print(f"Notebook for dataset '{dataset_id}' generated.")
    return notebook_name


# Gradio UI: pick a dataset, preview it, generate a notebook for it and,
# once a token has been entered, push that notebook to the dataset repo.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset auto analyst creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        """Render the Hub dataset viewer for ``name``, or a placeholder."""
        if not name:
            return gr.Markdown("### No dataset provided")
        # Embeds the viewer for the "default" config and "train" split.
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="600px"
        ></iframe>
            """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook", visible=True)

    # The generated notebook's file name is fed into this File component; the
    # same component is later used as the upload source for the push step.
    download_link = gr.File(label="Download notebook")
    generate_btn.click(
        generate_notebook, inputs=[dataset_name], outputs=[download_link]
    )
    with gr.Row() as auth_page:
        with gr.Column():
            auth_title = gr.Markdown(
                "Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    def auth(token):
        """Show the push button only while the token box is non-empty.

        ``push_btn`` is created further down; it is looked up when this
        callback runs, not when the function is defined.
        """
        return {
            auth_error: gr.Markdown(value="", visible=False),
            # push_btn is a Button, so update it with a Button (not a Row).
            push_btn: gr.Button(visible=bool(token)),
        }

    push_btn = gr.Button("Push notebook to hub", visible=False)
    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )
    output_lbl = gr.HTML(value="")

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=[output_lbl],
    )
demo.launch()