import os
import sys

# Make the bundled leaderboard evaluation code importable, then run relative to this file.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))

import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# Leaderboard-local modules: page text/formatting helpers and the scoring function.
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
from evaluation.eval import eval_score

# Hugging Face Hub token, read from the environment.
TOKEN = os.environ.get("TOKEN", None)

OWNER = "osunlp"
DATA_DATASET = f"{OWNER}/TravelBench"
EVAL_DATASET = f"{OWNER}/TravelBenchEval"

api = HfApi()

YEAR_VERSION = "2024"

# Local folder where freshly scored submissions are written before being uploaded.
os.makedirs("scored", exist_ok=True)

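# Note: when this app is run outside its hosted deployment, TOKEN is assumed to be set to a
# Hub token with write access to the datasets above (the deployment setup is not part of this file).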

# Current leaderboard scores; force_redownload keeps the tables fresh on every start.
eval_results = load_dataset(EVAL_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)


def get_dataframe_from_results(eval_results, split):
    """Build a leaderboard table for one split: drop emails, sort by Final Pass Rate, show rates as percentages."""
    local_df = eval_results[split]
    local_df = local_df.remove_columns(["Mail"])
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Final Pass Rate"], ascending=False)
    numeric_cols = [c for c in local_df.column_names if "Rate" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    return df

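# The tables built below expose the columns written by add_new_eval (Model, Planning Strategy,
# Organization, Delivery Rate, the Commonsense/Hard Constraint Micro/Macro Pass Rates, and
# Final Pass Rate); every "Rate" column is rescaled to a percentage by the helper above.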

eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")


def load_line_json_data(filename):
    """Read a JSON Lines (.jsonl) file and return one dict per line."""
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.read().strip().split('\n'):
            unit = json.loads(line)
            data.append(unit)
    return data


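# Submission flow of add_new_eval, summarized: validate the form, archive the raw .jsonl upload,
# score it with eval_score, upload the scored copy, append a row to the scores dataset, and push
# the updated dataset back to the Hub.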
def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    model: str,
    planning_strategy: str,
    organization: str,
    mail: str,
    path_to_file: str,
):
    # Basic form validation before touching the Hub.
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Archive the raw submission file in the evaluation dataset repo.
    api.upload_file(
        repo_id=EVAL_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # Score the submission and keep a local copy of the result.
    file_path = path_to_file.name
    result = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)
    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl", "w") as scored_file:
        scored_file.write(json.dumps(result) + "\n")

    # Upload the scored results next to the raw submission.
    api.upload_file(
        repo_id=EVAL_DATASET,
        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl",
        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )

    # Append the new leaderboard row and push the updated scores back to the Hub.
    eval_entry = {
        "Model": model,
        "Planning Strategy": planning_strategy,
        "Organization": organization,
        "Mail": mail,
        "Delivery Rate": result['Delivery Rate'],
        "Commonsense Constraint Micro Pass Rate": result['Commonsense Constraint Micro Pass Rate'],
        "Commonsense Constraint Macro Pass Rate": result['Commonsense Constraint Macro Pass Rate'],
        "Hard Constraint Micro Pass Rate": result['Hard Constraint Micro Pass Rate'],
        "Hard Constraint Macro Pass Rate": result['Hard Constraint Macro Pass Rate'],
        "Final Pass Rate": result['Final Pass Rate']
    }

    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)

    print(eval_results)

    eval_results.push_to_hub(EVAL_DATASET, config_name='scores', token=TOKEN)

    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed.")


def refresh():
    """Re-download the scores dataset and rebuild both leaderboard tables."""
    eval_results = load_dataset(EVAL_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
    eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
    return eval_dataframe_val, eval_dataframe_test

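# Note: BackgroundScheduler is imported above but never used in this file. A common pattern in
# hosted leaderboards (shown here only as a hedged sketch, not part of this app) is to restart
# the hosting Space periodically so cached data stays fresh, e.g.:
#
#     def restart_space():  # hypothetical helper; the Space repo_id below is an assumption
#         api.restart_space(repo_id=f"{OWNER}/leaderboard", token=TOKEN)
#
#     scheduler = BackgroundScheduler()
#     scheduler.add_job(restart_space, "interval", seconds=3600)
#     scheduler.start()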

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val, interactive=False,
        )
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, interactive=False,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new file for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
                model = gr.Textbox(label="Foundation Model")
                planning_strategy = gr.Textbox(label="Planning Strategy")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(label="Contact email")
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # Inputs are passed positionally, so their order must match add_new_eval's signature.
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                eval_mode,
                model,
                planning_strategy,
                organization,
                mail,
                file_output,
            ],
            submission_result,
        )

demo.launch(debug=True)