File size: 3,794 Bytes
421b068
 
 
 
 
7d529e0
734af25
 
 
 
 
 
 
 
 
 
 
421b068
 
7d529e0
421b068
7d529e0
421b068
7d529e0
421b068
 
7d529e0
6d0709a
2d53b10
6d0709a
 
 
2d53b10
6d0709a
 
 
 
2d53b10
 
 
 
6d0709a
7d529e0
 
421b068
7d529e0
 
6d0709a
7d529e0
421b068
 
 
2d53b10
421b068
7d529e0
421b068
 
 
7d529e0
 
 
 
 
421b068
 
 
 
 
 
 
 
 
 
2d53b10
421b068
2d53b10
421b068
 
2d53b10
 
 
6d0709a
 
2d53b10
6d0709a
7d529e0
 
6d0709a
 
 
2d53b10
 
 
 
 
 
 
 
 
6d0709a
 
 
7d529e0
2d53b10
7d529e0
6d0709a
 
2d53b10
6d0709a
 
 
 
 
 
 
 
7d529e0
 
 
2d53b10
7d529e0
2d53b10
7d529e0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
import nbformat as nbf
from huggingface_hub import HfApi


"""
TODOs:
- Handle errors
- Add more commands to the notebook
- Parametrize the commands
- How to handle configs and splits?
- Let user choose the framework
- Improve logs
"""


def create_notebook_file(cell_commands, notebook_name="generated_notebook.ipynb"):
    """Write a Jupyter notebook containing one code cell per command.

    Args:
        cell_commands: Iterable of source strings; each one becomes a code
            cell, in the order given.
        notebook_name: Path of the notebook file to create. An existing file
            at that path is overwritten.
    """
    nb = nbf.v4.new_notebook()
    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]

    # Pin the encoding: without it the file is written in the platform's
    # locale encoding, which can fail (or produce a non-UTF-8 notebook) as
    # soon as a cell contains non-ASCII text.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)

    print(f"Notebook '{notebook_name}' created successfully.")


def push_notebook(file_path, dataset_id, token):
    """Upload a generated notebook to a dataset repo on the Hub.

    The file is always stored in the repo as ``dataset_analysis.ipynb``.

    Args:
        file_path: Local path of the notebook file to upload.
        dataset_id: Target dataset repository id (``owner/name``).
        token: Hugging Face access token used to authenticate the upload.

    Returns:
        A visible ``gr.HTML`` component holding a link to the uploaded
        notebook.

    Raises:
        gr.Error: If there is no notebook to upload or the upload fails, so
            the UI shows a readable message instead of an opaque failure.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not file_path:
        raise gr.Error("No notebook to upload. Generate a notebook first.")

    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
    except Exception as err:
        # UI boundary: surface the failure (bad token, missing permissions,
        # unknown repo, network error, ...) to the user, keeping the cause.
        raise gr.Error(f"Could not upload notebook to the Hub: {err}") from err
    print("Notebook uploaded to Huggingface Hub.")

    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
    # dataset_id is user-supplied; escape it before placing it inside an
    # HTML attribute. Well-formed ids are unaffected.
    safe_link = escape(link, quote=True)
    anchor = f'<a target="_blank" href="{safe_link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>'

    return gr.HTML(value=anchor, visible=True)


def generate_notebook(dataset_id):
    """Create a starter analysis notebook for a Hub dataset.

    The notebook installs pandas, loads the dataset's train parquet file
    through the ``hf://`` filesystem and displays the first rows.

    Args:
        dataset_id: Dataset repository id (``owner/name``).

    Returns:
        A tuple ``(file, row)``: a visible ``gr.File`` pointing at the
        generated notebook, and a ``gr.Row`` update that makes the
        authentication row visible.

    Raises:
        gr.Error: If no dataset id was provided.
    """
    if not dataset_id:
        # An empty id would otherwise produce a file named ".ipynb" that
        # reads from a malformed "hf://datasets//..." path.
        raise gr.Error("Please select a dataset first.")

    commands = [
        "!pip install pandas",
        "import pandas as pd",
        # NOTE: assumes the dataset ships a single train shard at this exact
        # path; datasets with other configs/splits/shard counts will not load.
        f"df = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')",
        "df.head()",
    ]
    # "/" is not valid inside a file name, so flatten "owner/name".
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    # Return component instances to update the outputs; the per-component
    # `.update()` classmethod no longer exists in Gradio 4, which this file
    # already requires for `gr.render`.
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)


# UI definition. Components appear on the page in the order they are created
# below, so statement order inside this block is significant.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset auto analyst creator 🕵️")
    # Search box used to pick the dataset; its value is the dataset id that
    # is passed to the handlers below.
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        """Render a preview of the selected dataset.

        Shows a placeholder message when no dataset is selected; otherwise
        embeds the Hub dataset viewer for the ``default`` config and
        ``train`` split in an iframe.
        """
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
          frameborder="0"
          width="100%"
          height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    # Hidden until a notebook has been generated (see generate_notebook).
    download_link = gr.File(label="Download notebook", visible=False)
    # Token entry row; hidden initially and listed as an output of the
    # generate handler, which is what reveals it.
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            auth_title = gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    # Hidden until a token has been entered (see auth below).
    push_btn = gr.Button("Push notebook to hub", visible=False)
    # Receives the "See notebook" link returned by push_notebook.
    output_lbl = gr.HTML(value="", visible=False)

    # Generate the notebook for the selected dataset, then expose the
    # download component and the token row.
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        """Toggle the push button depending on whether a token was typed.

        Returns a dict keyed by output component. In both branches the
        error message is reset to empty and hidden; only the push button's
        visibility differs. The token itself is not validated here.
        """
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    # Re-evaluate button visibility on every change of the token field.
    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    # Upload the generated file (taken from the download component) to the
    # selected dataset repo using the entered token.
    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

# Start the Gradio app.
demo.launch()