loubnabnl HF Staff committed on
Commit
2171b06
·
1 Parent(s): 4ebeff6
Files changed (2)
  1. app.py +121 -0
  2. languages.json +20 -0
app.py ADDED
@@ -0,0 +1,121 @@
+ import json
+ import requests
+ import streamlit as st
+
+ st.title("The Stack Bot 🤖")
+
+ intro = """
+ The Stack Bot helps you get started with the tools developed by [BigCode](https://huggingface.co/bigcode),
+ such as the [The Stack](https://huggingface.co/bigcode/the-stack) dataset and the [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
+
+ We show information about the programming languages in the dataset and the models trained on them. If you trained a model on The Stack, let us know so we can feature your model! 🚀
+ """
+ st.markdown(intro, unsafe_allow_html=True)
+
+ @st.cache()
+ def load_languages():
+     with open("languages.json", "r") as f:
+         languages = json.load(f)
+     return languages
+
+ def how_to_load(language):
+     text = f"""
+ ```python
+ from datasets import load_dataset
+ dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")
+ ```
+ """
+     st.markdown(text)
+
+ def load_model(values, language):
+     model = values["model"]
+     if not model:
+         text = f"""No model available for {language.capitalize()}. If you trained a model on this language, let us know at [email protected] so we can feature your model!\n\
+ You can also train your own model on The Stack using the instructions below 🚀"""
+         st.write(text)
+         if st.button("Fine-tune your own model", key=4):
+             st.write("Code available at [GitHub link] + add preview + example of time & required hardware estimation")
+     else:
+         text = f"""{model} is a model that was trained on the {language} subset of The Stack. Here's how to use it:"""
+         code = f"""
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ device = "cuda"  # for GPU usage or "cpu" for CPU usage
+
+ tokenizer = AutoTokenizer.from_pretrained("{model}")
+ model = AutoModelForCausalLM.from_pretrained("{model}", trust_remote_code=True).to(device)
+
+ inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
+ outputs = model.generate(inputs)
+ print(tokenizer.decode(outputs[0]))
+ ```
+ """
+         st.write(text)
+         st.markdown(code)
+         st.write("The scores of this model are the following:")
+         for key, value in values["scores"].items():
+             st.write(f"{key}: {value}")
+
+ def generate_code(
+     demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0
+ ):
+     # call the Space using its API endpoint
+     try:
+         url = f"https://hf.space/embed/{demo.lower()}/+/api/predict/"
+         r = requests.post(
+             url=url, json={"data": [gen_prompt, max_new_tokens, temperature, seed]}
+         )
+         generated_text = r.json()["data"][0]
+     except Exception:
+         generated_text = ""
+     return generated_text
+
+ def init_nested_buttons():
+     if "Models trained on dataset" not in st.session_state:
+         st.session_state["Models trained on dataset"] = False
+
+     if "Generate code" not in st.session_state:
+         st.session_state["Generate code"] = False
+
+     if st.button("Models trained on dataset"):
+         st.session_state["Models trained on dataset"] = not st.session_state["Models trained on dataset"]
+
+
+ languages = load_languages()
+
+ col1, col2 = st.columns([1, 2])
+ with col1:
+     selected_language = st.selectbox("Languages of The Stack", list(languages.keys()), key=1)
+
+ st.write(f"Here's how you can load the {selected_language.capitalize()} subset of The Stack:")
+ how_to_load(selected_language)
+ if st.button("More info about the dataset", key=2):
+     st.write(f"The dataset contains {languages[selected_language]['num_examples']} examples.")
+     # we can add some stats about files
+
+ init_nested_buttons()
+ if st.session_state["Models trained on dataset"]:
+     load_model(languages[selected_language], selected_language)
+
+     if languages[selected_language]["model"] and languages[selected_language]["gradio_demo"]:
+         st.write(f"Here's a demo to try the model. For more flexibility, you can use the original [Gradio demo](https://hf.co/spaces/{languages[selected_language]['gradio_demo']}).")
+         gen_prompt = st.text_area(
+             "Generate code with prompt:",
+             value="# print hello world",
+             height=100,
+         ).strip()
+
+         if st.button("Generate code"):
+             st.session_state["Generate code"] = not st.session_state["Generate code"]
+         if st.session_state["Generate code"]:
+             generated_text = generate_code(
+                 demo=languages[selected_language]["gradio_demo"],
+                 gen_prompt=gen_prompt,
+             )
+             if not generated_text:
+                 st.write(f"Error: could not generate code. Make sure the Gradio demo at hf.co/spaces/{languages[selected_language]['gradio_demo']} works.")
+             else:
+                 st.code(generated_text)
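
If you want to smoke-test the Space endpoint that `generate_code` calls without running the Streamlit app, a minimal sketch might look like the one below. The URL shape and payload order mirror `generate_code` above, and the demo id and prompt come from the files in this commit; the `timeout` and `raise_for_status()` calls are additions for easier debugging, and the `{"data": [...]}` response shape is assumed to match what the function parses.

```python
# Standalone check of the Space endpoint used by generate_code above.
import requests

demo = "bigcode/santacoder-demo"  # a gradio_demo value from languages.json
url = f"https://hf.space/embed/{demo.lower()}/+/api/predict/"
# payload order matches generate_code: prompt, max_new_tokens, temperature, seed
payload = {"data": ["# print hello world", 40, 0.2, 0]}

r = requests.post(url=url, json=payload, timeout=30)
r.raise_for_status()
print(r.json()["data"][0])  # generated text, per the parsing in generate_code
```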
languages.json ADDED
@@ -0,0 +1,20 @@
+ {"python": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
+ "java": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
+ "javascript": {"num_examples": 10, "model": "bigcode/santacoder", "scores": {"HumanEval-pass@1": 10, "HumanEval-pass@10": 20, "HumanEval-pass@100": 40}, "gradio_demo": "bigcode/santacoder-demo"},
+ "typescript": {"num_examples": 10, "model": ""},
+ "go": {"num_examples": 10, "model": ""},
+ "php": {"num_examples": 10, "model": ""},
+ "c": {"num_examples": 10, "model": ""},
+ "c-sharp": {"num_examples": 10, "model": ""},
+ "ruby": {"num_examples": 10, "model": ""},
+ "swift": {"num_examples": 10, "model": ""},
+ "scala": {"num_examples": 10, "model": ""},
+ "r": {"num_examples": 10, "model": ""},
+ "julia": {"num_examples": 10, "model": ""},
+ "perl": {"num_examples": 10, "model": ""},
+ "racket": {"num_examples": 10, "model": ""},
+ "haskell": {"num_examples": 10, "model": ""},
+ "erlang": {"num_examples": 10, "model": ""},
+ "elixir": {"num_examples": 10, "model": ""},
+ "ocaml": {"num_examples": 10, "model": ""}
+ }
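
As a sanity check on this schema, a small sketch (not part of the commit) could verify that every entry carries the fields `app.py` actually reads: `num_examples` and `model` everywhere, plus `scores` and `gradio_demo` whenever a model is set, since `load_model` and the demo section access those keys.

```python
# Validate languages.json against the fields app.py reads.
import json

with open("languages.json") as f:
    languages = json.load(f)

for name, values in languages.items():
    assert "num_examples" in values, f"{name}: missing num_examples"
    assert "model" in values, f"{name}: missing model"
    if values["model"]:
        # entries with a model also feed load_model() and the Gradio demo
        assert "scores" in values, f"{name}: model set but no scores"
        assert "gradio_demo" in values, f"{name}: model set but no gradio_demo"
print(f"{len(languages)} language entries look consistent")
```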