Spaces:
Runtime error
Runtime error
File size: 4,600 Bytes
2171b06 72c2877 a9e00bb 2171b06 72c2877 2171b06 72c2877 2171b06 8d9fca0 7659c19 2171b06 72c2877 2171b06 f6c8688 2171b06 7659c19 2171b06 72c2877 2171b06 f6c8688 2171b06 a9e00bb 2171b06 7659c19 72c2877 2171b06 7659c19 2171b06 7659c19 2171b06 7659c19 2171b06 8d9fca0 2171b06 72c2877 2171b06 72c2877 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import json
import requests
import streamlit as st
# Page-level configuration and the static header rendered on every run.
st.set_page_config(layout="wide")

# The sidebar content lives in a Markdown file. Read it as UTF-8 explicitly so
# the result does not depend on the platform's default locale encoding.
with open("utils/table_contents.md", "r", encoding="utf-8") as f:
    contents = f.read()
st.sidebar.markdown(contents)

# NOTE(review): the trailing "π¬" looks like a mis-decoded emoji - confirm
# against the original file before changing it.
st.title("The Stack Bot π¬")
intro = """
The Stack Bot is a tool to help you get started with tools developed in [BigCode](https://huggingface.co/bigcode),
such as [The Stack](https://huggingface.co/bigcode/the-stack) dataset and [SantaCoder](https://huggingface.co/bigcode/santacoder) model.
"""
st.markdown(intro, unsafe_allow_html=True)
# NOTE(review): `st.cache` is reported as deprecated by recent Streamlit
# releases in favour of `st.cache_data`; confirm the pinned Streamlit version
# before switching decorators.
@st.cache()
def load_languages():
    """Load the per-language metadata from ``utils/languages.json``.

    Returns:
        The parsed JSON content. Callers in this file treat it as a mapping
        keyed by language name whose values expose the ``"num_examples"``,
        ``"model"`` and ``"gradio_demo"`` entries.
    """
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open("utils/languages.json", "r", encoding="utf-8") as f:
        return json.load(f)
def how_to_load(language):
    """Render a Markdown snippet showing how to load one language subset.

    Args:
        language: Language name interpolated into the ``data_dir`` argument
            of the displayed ``load_dataset`` call.

    Returns:
        None. The snippet is written to the page with ``st.markdown``.
    """
    snippet = f"""
```python
from datasets import load_dataset
dataset = load_dataset("bigcode/the-stack", data_dir="data/{language}", split="train")
# print first element
print(dataset[0])
```
"""
    st.markdown(snippet)
def load_model(values, language):
    """Show the model trained on ``language``, or invite a contribution.

    Args:
        values: Metadata mapping for the selected language; only
            ``values["model"]`` is read here.
        language: Language name; it is capitalized for display.

    Returns:
        None. Everything is rendered through Streamlit calls.
    """
    model = values["model"]

    if not model:
        # The original message read "let us know in in the ..." because the
        # word was repeated across a backslash line continuation.
        # NOTE(review): the trailing "π" looks like a mis-decoded emoji -
        # confirm against the original file.
        text = (
            f"No model is available for {language.capitalize()}. "
            "If you trained a model on this language, let us know in the "
            "[Community tab](https://huggingface.co/spaces/loubnabnl/the-stack-bot/discussions) "
            "to feature your model!\n\n"
            "You can also train your own model on The Stack using the instructions below π"
        )
        st.write(text)
        if st.button("Fine-tune your own model", key=4):
            st.write("Code available at [GitHub link] + add preview")
        return

    text = f"""[{model}](https://huggingface.co/{model}) is a model trained on the {language.capitalize()} subset of The Stack. Here's how to use it:"""
    code = f"""
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("{model}")
model = AutoModelForCausalLM.from_pretrained("{model}", trust_remote_code=True)
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt")
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))
```
"""
    st.markdown(text)
    st.markdown(code)
def generate_code(
    demo, gen_prompt, max_new_tokens=40, temperature=0.2, seed=0, timeout=60
):
    """Request a completion from a Gradio demo through its HTTP endpoint.

    Args:
        demo: Base URL of the Gradio demo; ``/run/predict/`` is appended.
        gen_prompt: Prompt text, sent as the first element of ``"data"``.
        max_new_tokens: Sent as the second element of ``"data"``.
        temperature: Sent as the third element of ``"data"``.
        seed: Sent as the fourth element of ``"data"``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first element of the ``"data"`` list in the JSON response, or an
        empty string when no demo URL is given, the request fails, or the
        response does not have the expected shape.
    """
    # Without a base URL there is nothing to call; skip the network round trip.
    if not demo:
        return ""

    # call space using its API endpoint
    url = f"{demo}/run/predict/"
    payload = {"data": [gen_prompt, max_new_tokens, temperature, seed]}
    try:
        # A timeout keeps the app from hanging forever on an unresponsive demo.
        r = requests.post(url=url, json=payload, timeout=timeout)
        r.raise_for_status()
        return r.json()["data"][0]
    except (
        requests.RequestException,  # connection, timeout, HTTP status errors
        ValueError,  # body is not valid JSON
        KeyError,  # no "data" entry in the response
        IndexError,  # "data" is an empty list
        TypeError,  # response JSON is not subscriptable as expected
    ):
        # Best effort: callers treat an empty string as "generation failed".
        # Only expected failures are caught so KeyboardInterrupt/SystemExit
        # are no longer swallowed, as they were by the bare ``except``.
        return ""
# Main page flow: pick a language, show how to load its subset, then show the
# model trained on it and (when available) a small generation demo.
# NOTE(review): the source had lost its indentation; the nesting of the
# `with` / `if` blocks below was reconstructed from the code's meaning -
# confirm the layout against the original file.
languages = load_languages()

# NOTE(review): the trailing "π" and "π€" in the two headers look like
# mis-decoded emoji - confirm against the original file.
st.header("Languages of The Stack π")
st.markdown("The Stack contains over 6TB of permissively-licensed source code files covering 358 programming languages. Select one to get started:")
col1, col2 = st.columns([1, 1.5])
with col1:
    selected_language = st.selectbox(
        "Programming Language",
        list(languages.keys()),
        label_visibility="collapsed",
        key=1,
    )

# Look the selected entry up once instead of re-indexing on every use.
language_info = languages[selected_language]
language_title = selected_language.capitalize()

st.write(f"Here's how you can load the {language_title} subset of The Stack:")
# how_to_load renders the snippet itself and returns None, so nothing is kept.
how_to_load(selected_language)
with st.expander("More info about the dataset"):
    st.write(f"The dataset contains {language_info['num_examples']} examples.")
    # we can add some stats about files

st.header("Models trained on The Stack π€")
st.write("Here we show models trained on the language you select as part of BigCode project.")
with st.expander(f"Models trained on {language_title}"):
    load_model(language_info, selected_language)

    # "gradio_demo" is only read when a model exists, as in the original
    # short-circuit condition.
    if language_info["model"] and language_info["gradio_demo"]:
        demo_url = language_info["gradio_demo"]
        st.write(f"Here's a demo to try it, for more flexibility you can use the original [Gradio demo]({demo_url}).")
        gen_prompt = st.text_area(
            "Generate code with prompt:",
            value="# Implement a function to print hello world",
            height=100,
        ).strip()
        if st.button("Generate code"):
            with st.spinner("Generating code..."):
                generated_text = generate_code(
                    demo=demo_url,
                    gen_prompt=gen_prompt,
                )
            # generate_code returns an empty string on any failure.
            if not generated_text:
                st.markdown(f"Error: could not generate code. Make sure the Gradio demo at [{demo_url}]({demo_url}) works.")
            else:
                st.code(generated_text)