File size: 4,600 Bytes
2171b06
 
 
 
72c2877
 
 
 
 
 
a9e00bb
2171b06
 
 
 
 
 
 
 
 
72c2877
2171b06
 
 
 
 
 
 
72c2877
 
 
 
 
2171b06
 
 
 
 
 
 
8d9fca0
7659c19
2171b06
 
 
72c2877
2171b06
f6c8688
2171b06
 
 
 
7659c19
 
2171b06
72c2877
2171b06
 
 
 
f6c8688
2171b06
 
 
 
 
 
a9e00bb
 
 
 
 
 
 
 
 
 
2171b06
 
 
 
7659c19
 
72c2877
2171b06
7659c19
2171b06
 
 
7659c19
 
2171b06
 
 
7659c19
 
 
2171b06
 
 
8d9fca0
2171b06
 
72c2877
2171b06
 
 
 
72c2877
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import json
import requests
import streamlit as st

# Page-level configuration: use the full browser width for the app.
st.set_page_config(layout="wide")

# The sidebar shows a table of contents kept in a Markdown file next to the app.
# Read it as UTF-8 explicitly so the result does not depend on the host locale.
with open("utils/table_contents.md", "r", encoding="utf-8") as f:
    contents = f.read()

st.sidebar.markdown(contents)

# The emoji was previously stored as mis-decoded bytes ("πŸ’¬"); use the real character.
st.title("The Stack Bot 💬")

# Short introduction rendered right below the title.
intro = """
The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode), 
such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
"""
st.markdown(intro, unsafe_allow_html=True)

# NOTE(review): newer Streamlit releases deprecate `st.cache` in favour of
# `st.cache_data` -- confirm the pinned Streamlit version before switching.
@st.cache()
def load_languages():
    """Load the per-language metadata from ``utils/languages.json``.

    Returns:
        The object decoded from the JSON file. The rest of this script indexes
        it by language name and reads the ``"num_examples"``, ``"model"`` and
        ``"gradio_demo"`` entries of each value.
    """
    # JSON text is UTF-8; be explicit so a non-UTF-8 platform default encoding
    # cannot corrupt or reject non-ASCII content.
    with open("utils/languages.json", "r", encoding="utf-8") as f:
        return json.load(f)

def how_to_load(language):
    """Render a Markdown code snippet showing how to load one subset of The Stack.

    Args:
        language: Name of the language subset; it is interpolated into the
            ``data_dir="data/<language>"`` argument of the displayed snippet.

    Returns:
        None. The snippet is written to the page with ``st.markdown``.
    """
    # The literal below is kept exactly as rendered before; only its name changed.
    snippet = f"""
    ```python
    from datasets import load_dataset

    dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")

    # print first element
    print(dataset[0])
    ```
    """
    st.markdown(snippet)

def load_model(values, language):
    """Render the "model" section for the selected language.

    When ``values["model"]`` is falsy, a notice is shown that no model exists
    for the language, together with a button that reveals a placeholder about
    fine-tuning. Otherwise a link to the model and a usage snippet are shown.

    Args:
        values: Metadata mapping of the selected language; only the ``"model"``
            entry is read here.
        language: Name of the selected language, capitalized for display.

    Returns:
        None. Everything is written to the page through Streamlit calls.

    Raises:
        KeyError: If ``values`` has no ``"model"`` entry.
    """
    model = values["model"]
    if not model:
        # Built from adjacent literals instead of backslash continuations inside
        # a triple-quoted string: continuation lines there keep their source
        # indentation, which produced a duplicated "in in" and an 8-space
        # indented second paragraph that Markdown treats as an indented code
        # block. The trailing emoji was also stored as mis-decoded bytes.
        text = (
            f"No model is available for {language.capitalize()}. "
            "If you trained a model on this language, let us know "
            "in the [Community tab](https://huggingface.co/spaces/loubnabnl/the-stack-bot/discussions) "
            "to feature your model!\n\n"
            "You can also train your own model on The Stack using the instructions below 🚀"
        )
        st.write(text)
        if st.button("Fine-tune your own model", key=4):
            st.write("Code available at [GitHub link] + add preview")
    else:
        text = f"""[{model}](https://huggingface.co/{model}) is a model trained on the {language.capitalize()} subset of The Stack. Here's how to use it:"""
        # Usage snippet shown to the user; `{model}` is the checkpoint name.
        code = f"""
        ```python
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("{model}")
        model = AutoModelForCausalLM.from_pretrained("{model}", trust_remote_code=True)

        inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt")
        outputs = model.generate(inputs)
        print(tokenizer.decode(outputs[0]))
        ```
        """
        st.markdown(text)
        st.markdown(code)

def generate_code(
    demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0, timeout=120
):
    """Request a code completion from a demo's ``/run/predict/`` HTTP endpoint.

    Args:
        demo: Base URL of the demo; ``/run/predict/`` is appended to it.
        gen_prompt: Prompt text sent as the first element of the payload.
        max_new_tokens: Second payload element, forwarded unchanged.
        temperature: Third payload element, forwarded unchanged.
        seed: Fourth payload element, forwarded unchanged.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled demo cannot block the page indefinitely. Pass ``None`` to
            wait without limit.

    Returns:
        The first element of the ``"data"`` list in the JSON response, or an
        empty string when the request fails or the response does not have the
        expected shape. Callers treat the empty value as "generation failed".
    """
    # Tolerate a trailing slash on the configured URL (avoids "//run/predict/").
    url = f"{str(demo).rstrip('/')}/run/predict/"
    payload = {"data": [gen_prompt, max_new_tokens, temperature, seed]}
    try:
        r = requests.post(url=url, json=payload, timeout=timeout)
        # Surface HTTP error statuses instead of trying to parse an error page.
        r.raise_for_status()
        return r.json()["data"][0]
    except (
        requests.RequestException,  # connection, timeout, HTTP status, bad URL
        ValueError,  # response body is not valid JSON
        KeyError,  # no "data" entry in the response
        IndexError,  # "data" is empty
        TypeError,  # response JSON is not subscriptable as expected
    ):
        # Deliberately best-effort: the caller shows an error message when the
        # result is empty. Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt/SystemExit or unrelated programming errors.
        return ""

languages = load_languages()

# --- Dataset section -------------------------------------------------------
# Header emoji were previously stored as mis-decoded bytes; use the real characters.
st.header("Languages of The Stack 📑")
st.markdown("The Stack contains over 6TB of permissively-licensed source code files covering 358 programming languages. Select one to get started:")
# The selector sits in the narrower left column; the right column stays empty.
col1, col2 = st.columns([1, 1.5])
with col1:
    selected_language = st.selectbox("Programming Language", list(languages), label_visibility="collapsed", key=1)

# Metadata of the selected language, looked up once and reused below.
language_info = languages[selected_language]

st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
# how_to_load renders the snippet itself and returns nothing, so there is no
# result worth binding to a name.
how_to_load(selected_language)

with st.expander("More info about the dataset"):
    st.write(f"The dataset contains {language_info['num_examples']} examples.")
    # we can add some stats about files

# --- Model section ---------------------------------------------------------
st.header("Models trained on The Stack 🤖")
st.write("Here we show models trained on the language you select as part of BigCode project.")
with st.expander(f"Models trained on {selected_language.capitalize()}"):
    load_model(language_info, selected_language)

    demo_url = language_info["gradio_demo"]
    # The interactive demo is offered only when both a model and a demo URL exist.
    if language_info["model"] and demo_url:
        st.write(f"Here's a demo to try it, for more flexibility you can use the original [Gradio demo]({demo_url}).")
        gen_prompt = st.text_area(
            "Generate code with prompt:",
            value="# Implement a function to print hello world",
            height=100,
        ).strip()

        if st.button("Generate code"):
            with st.spinner("Generating code..."):
                generated_text = generate_code(
                    demo=demo_url,
                    gen_prompt=gen_prompt,
                )
                # generate_code returns an empty string when the call fails.
                if not generated_text:
                    st.markdown(f"Error: could not generate code. Make sure the Gradio demo at [{demo_url}]({demo_url}) works.")
                else:
                    st.code(generated_text)