File size: 4,803 Bytes
2171b06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
import requests
import streamlit as st

# Page header. The emoji are written as real Unicode characters; the previous
# text contained mis-decoded UTF-8 byte sequences that rendered as garbage.
st.title("The Stack Bot 🤖")

# Introductory Markdown shown under the title.
intro = """
The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode), 
such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.

We show information about existing programming languages and models trained on them. If you trained a model on The Stack, let us know so we feature your model! 🚀
"""
st.markdown(intro, unsafe_allow_html=True)

# NOTE(review): `st.cache` is deprecated in newer Streamlit releases in favour
# of `st.cache_data` -- confirm the targeted Streamlit version before migrating.
@st.cache()
def load_languages():
    """Read and parse ``languages.json`` from the current working directory.

    Returns:
        The decoded JSON document. The rest of this script indexes it by
        language name.
    """
    # Pin the encoding: without it, open() uses the locale default, which is
    # not UTF-8 on every platform.
    with open("languages.json", "r", encoding="utf-8") as f:
        return json.load(f)

def how_to_load(language):
    """Render a Markdown code snippet that loads one language subset of The Stack.

    Args:
        language: Name of the language subset; interpolated into the
            ``data_dir`` path of the displayed snippet.

    Returns:
        None. The snippet is written to the page with ``st.markdown``.
    """
    # The snippet is meant to be copy/pasted, so it has to be valid Python:
    # ``language`` is already substituted here, hence the displayed string is a
    # plain (non-f) literal with both quotes present.
    text = f"""
    ```python
    from datasets import load_dataset
    dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")
    ```
    """
    st.markdown(text)

def load_model(values, language):
    """Show the model trained on ``language``, or a call for contributions.

    Args:
        values: Metadata for the selected language. ``values["model"]`` is
            always read; when it is truthy, ``values["scores"]`` is iterated
            with ``.items()`` and each pair is written as ``key: value``.
        language: Language name used in the displayed messages.

    Returns:
        None. Everything is written to the page through Streamlit calls.
    """
    model = values["model"]

    if not model:
        # NOTE(review): "[email protected]" looks like an e-mail obfuscation
        # placeholder rather than a real address -- confirm the intended
        # contact and restore it.
        text = f"""No model available for {language.capitalize()}. If you trained a model on this language, let us know at [email protected] to feature your model!\n\
        You can also train your own model on The Stack using the instructions below 🚀"""
        st.write(text)
        if st.button("Fine-tune your own model", key=4):
            st.write("Code available at [GitHub link] + add preview + example of time & required hardware estimation")
        return

    text = f"""{model} is a model that was trained on the {language} from The Stack. Here's how to use it:"""
    # The model id must appear as a quoted string in the displayed snippet;
    # unquoted, the snippet is not valid Python.
    code = f"""
    ```python
    from transformers import AutoModelForCausalLM, AutoTokenizer

    device = "cuda" # for GPU usage or "cpu" for CPU usage

    tokenizer = AutoTokenizer.from_pretrained("{model}")
    model = AutoModelForCausalLM.from_pretrained("{model}", trust_remote_code=True).to(device)

    inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
    outputs = model.generate(inputs)
    print(tokenizer.decode(outputs[0]))
    ```
    """
    st.write(text)
    st.markdown(code)
    st.write("The scores of this model are the following:")
    for key, value in values["scores"].items():
        st.write(f"{key}: {value}")

def generate_code(
    demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0, timeout=60
):
    """Ask a hosted demo Space to complete ``gen_prompt`` and return the text.

    Args:
        demo: Identifier of the Space; lower-cased and inserted into the
            ``https://hf.space/embed/.../+/api/predict/`` URL.
        gen_prompt: Prompt sent as the first element of the request payload.
        max_new_tokens: Second element of the request payload.
        temperature: Third element of the request payload.
        seed: Fourth element of the request payload.
        timeout: Seconds to wait for the HTTP request before giving up, so an
            unresponsive Space cannot block the app indefinitely.

    Returns:
        The first element of the ``"data"`` list in the JSON response, or an
        empty string if the request fails or the response does not have the
        expected shape.
    """
    # call space using its API endpoint
    try:
        url = f"https://hf.space/embed/{demo.lower()}/+/api/predict/"
        r = requests.post(
            url=url,
            json={"data": [gen_prompt, max_new_tokens, temperature, seed]},
            timeout=timeout,
        )
        generated_text = r.json()["data"][0]
    except (
        requests.RequestException,  # connection errors, timeouts, bad JSON
        ValueError,  # body is not valid JSON (older requests versions)
        LookupError,  # "data" missing or empty
        TypeError,  # payload is not the expected dict/list shape
        AttributeError,  # demo is not a string
    ):
        # Best effort: the caller treats an empty string as "generation
        # failed". Unlike a bare ``except``, this lets KeyboardInterrupt and
        # other BaseException-derived control flow propagate.
        generated_text = ""
    return generated_text

def init_nested_buttons():
    """Seed the toggle flags in session state and draw the models button.

    Both the "Models trained on dataset" and "Generate code" flags are set to
    ``False`` when absent from ``st.session_state``. The "Models trained on
    dataset" button is then rendered, and a click inverts its flag.
    """
    models_flag = "Models trained on dataset"

    # Give every flag a default so later reads never hit a missing key.
    for flag_name in (models_flag, "Generate code"):
        if flag_name not in st.session_state:
            st.session_state[flag_name] = False

    clicked = st.button(models_flag)
    if clicked:
        current = st.session_state[models_flag]
        st.session_state[models_flag] = not current


# ---- Page body: language picker, dataset info, model info and live demo ----
languages = load_languages()

col1, col2 = st.columns([1, 2])
with col1:
    selected_language = st.selectbox("Languages of The Stack", list(languages), key=1)

# Metadata entry of the chosen language, looked up once and reused below.
language_info = languages[selected_language]

st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
# how_to_load renders the snippet itself and returns nothing.
how_to_load(selected_language)
if st.button("More info about the dataset", key=2):
    st.write(f"The dataset contains {language_info['num_examples']} examples.")
    # we can add some stats about files

init_nested_buttons()
if st.session_state["Models trained on dataset"]:
    load_model(language_info, selected_language)

    gradio_demo = language_info["gradio_demo"]
    if language_info["model"] and gradio_demo:
        # Links carry an explicit scheme; without it Markdown resolves
        # "hf.co/..." as a path relative to the app.
        st.write(f"Here's a demo to try the model, for more flexibility you can use the original at [Gradio demo](https://hf.co/{gradio_demo})")
        gen_prompt = st.text_area(
            "Generate code with prompt:",
            value="# print hello world",
            height=100,
        ).strip()

        # The button toggles a session flag, so the generated output stays on
        # screen across reruns until the button is pressed again.
        if st.button("Generate code"):
            st.session_state["Generate code"] = not st.session_state["Generate code"]
        if st.session_state["Generate code"]:
            generated_text = generate_code(
                demo=gradio_demo,
                gen_prompt=gen_prompt,
            )
            if not generated_text:
                st.write(f"Error: could not generate code. Make sure the Gradio demo at https://hf.co/{gradio_demo} works.")
            else:
                st.code(generated_text)