|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
|
|
|
|
# File-extension code -> full programming-language name.  Used to build the
# checkbox labels and the table column headers shown in the UI.
extension_to_language = {
    "clj": "Clojure",
    "cpp": "C++",
    "cs": "C#",
    "d": "D",
    "dart": "Dart",
    "elixir": "Elixir",
    "go": "Go",
    "hs": "Haskell",
    "java": "Java",
    "jl": "Julia",
    "js": "JavaScript",
    "lua": "Lua",
    "ml": "OCaml",
    "php": "PHP",
    "pl": "Perl",
    "r": "R",
    "rb": "Ruby",
    "rkt": "Racket",
    "rs": "Rust",
    "scala": "Scala",
    "sh": "Shell",
    "swift": "Swift",
    "ts": "TypeScript"
}
|
|
|
|
|
# Raw pass@k results.  The code below reads the 'Dataset' and 'Estimate'
# columns; presumably one row per (model, language) evaluation — confirm
# against the CSV producer.
df = pd.read_csv('passk.csv')
|
|
|
|
|
def extract_info(dataset):
    """Split a hyphen-separated dataset id into its language and model parts.

    The second hyphen-separated field is the language code; everything
    between it and the final two fields forms the model name (which may
    itself contain hyphens).  Returns a Series with 'Language' and
    'Model' entries so it can be assigned to two DataFrame columns at once.
    """
    fields = dataset.split('-')
    return pd.Series({
        'Language': fields[1],
        'Model': '-'.join(fields[2:-2]),
    })
|
|
|
|
|
# Derive per-row 'Language' and 'Model' columns from each dataset identifier.
df[['Language', 'Model']] = df['Dataset'].apply(extract_info)
|
|
|
|
|
# Raw model identifier (as it appears in dataset names) -> human-readable
# name shown in the results table.
model_to_friendly = {
    "starcoder2_15b": "StarCoder2-15B",
    "deepseekcoder_v2lite_base": "DeepSeekCoder2-Lite-Base"
}


def get_friendly_name(model):
    """Return the display name for *model*, falling back to the raw id."""
    if model in model_to_friendly:
        return model_to_friendly[model]
    return model
|
|
|
|
|
# Wide results table: one row per model, one column per language code,
# cells hold the pass@k 'Estimate'.  df.pivot raises if any
# (Model, Language) pair is duplicated, so the CSV must be deduplicated.
pivot = df.pivot(index='Model', columns='Language', values='Estimate')

# Stable, alphabetically sorted axes used everywhere in the UI.
languages = sorted(pivot.columns)
models = sorted(pivot.index)
|
|
|
|
|
def update_table(selected_languages):
    """Build the display DataFrame for the given language codes.

    Parameters
    ----------
    selected_languages : list[str]
        Language extension codes (a subset of ``pivot``'s columns), in the
        order the columns should appear.  May be empty.

    Returns
    -------
    pd.DataFrame
        One row per model.  First column is the friendly model name; the
        remaining columns are pass@k estimates formatted to two decimals,
        with "-" where no result exists.
    """
    # With nothing selected, still return the model column so the UI table
    # keeps its rows instead of going blank.
    if not selected_languages:
        return pd.DataFrame({'Model': [get_friendly_name(model) for model in models]})

    # Missing scores become "-" so the numeric formatter below skips them.
    display_data = pivot[selected_languages].replace(np.nan, "-")

    # Format numeric cells to two decimals.  Per-column Series.map is used
    # instead of DataFrame.applymap, which is deprecated in pandas 2.1
    # (renamed to DataFrame.map) and removed later; Series.map works on
    # both old and new pandas.
    def _fmt(x):
        return f"{x:.2f}" if isinstance(x, (int, float)) else x

    display_data = display_data.apply(lambda col: col.map(_fmt))

    # Prepend the friendly model name and drop the index so model ids do
    # not leak into the rendered table as row labels.
    display_data.insert(0, 'Model', [get_friendly_name(model) for model in display_data.index])
    display_data = display_data.reset_index(drop=True)

    # Show full language names as headers, falling back to the raw code.
    display_data.columns = ['Model'] + [extension_to_language.get(lang, lang) for lang in selected_languages]

    return display_data
|
|
|
|
|
def get_initial_table():
    """Initial table contents: results for every available language."""
    return update_table(languages)
|
|
|
|
|
# Assemble the Gradio UI: an intro blurb, a language checkbox group, and a
# results table that is rebuilt whenever the selection changes.
with gr.Blocks() as app:
    gr.Markdown("""
    # MultiPL-E Results

    [MultiPL-E](https://huggingface.co/datasets/nuprl/MultiPL-E) is a dataset for
    evaluating large language models for code generation that supports several
    programming languages. It takes the OpenAI HumanEval and the Mostly Basic
    Python Programs (MBPP) benchmarks and uses little compilers to translate them
    to other languages. It is easy to add support for new languages and benchmarks.

    This table shows how some recent Code LLMs perform on MultiPL-HumanEval.

    We use the MultiPL-E 3.0 problems, which incorporates several fixes and
    supports several new programming languages.

    """)

    with gr.Row():
        # One checkbox per language, labelled "FullName (code)".  All
        # languages are checked by default.
        language_checkboxes = gr.CheckboxGroup(
            choices=[f"{extension_to_language[lang]} ({lang})" for lang in languages],
            label="Select Languages",
            value=[f"{extension_to_language[lang]} ({lang})" for lang in languages]
        )

    # Results table; the callable value populates it with every language
    # on first render.
    table = gr.Dataframe(
        value=get_initial_table,
        headers=['Model'] + [extension_to_language.get(lang, lang) for lang in languages],
        type="pandas"
    )

    def update_table_wrapper(selected_languages):
        """Map checkbox labels back to language codes and rebuild the table."""
        # Labels look like "C++ (cpp)": take the text after the last '('
        # and strip the closing paren to recover the raw code.
        selected_codes = [lang.split('(')[-1].strip(')') for lang in selected_languages]
        return update_table(selected_codes)

    # Re-render the table whenever the checkbox selection changes.
    language_checkboxes.change(update_table_wrapper, inputs=[language_checkboxes], outputs=[table])
|
|
|
|
|
# Start the Gradio server only when this file is executed directly.
if __name__ == "__main__":
    app.launch()