Spaces:
Running
Running
| import json | |
| import pandas as pd | |
| import gradio as gr | |
| from content import * | |
| from css import * | |
def model_hyperlink(link, model_name):
    """Build an HTML anchor for ``model_name`` that opens ``link`` in a new tab.

    Both arguments are interpolated into the markup exactly as given; no
    escaping is applied.

    Args:
        link: Value placed in the anchor's ``href`` attribute.
        model_name: Text shown as the anchor's label.

    Returns:
        The ``<a>`` element as a string, styled with a dotted underline.
    """
    anchor_style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
# Header of the trailing rank column shared by the leaderboard tables.
NONE_COL = "Ranking"

# Column headers of the web-traversal agent table, in display order.
AGENT_COLS = [
    "Method",
    "Model",
    "SS Easy",
    "SS Medium",
    "SS Hard",
    "MS Easy",
    "MS Medium",
    "MS Hard",
    "Overall",
    NONE_COL,
]

# One Gradio datatype per entry of AGENT_COLS: two text columns, then numbers.
AGENT_TYPES = ["str", "str"] + ["number"] * (len(AGENT_COLS) - 2)

# Maps the "model" identifier found in the result files to its display name.
model_name_adic = {
    "qwen-plus": "Qwen-Plus",
    "qwen2.5-72b-instruct": "Qwen2.5-72B",
    "qwen2.5-7b-instruct": "Qwen2.5-7B",
    "qwen2.5-14b-instruct": "Qwen2.5-14B",
    "qwen2.5-32b-instruct": "Qwen2.5-32B",
    "gpt-4o": "GPT-4o",
}

# Maps the "method" identifier found in the result files to its display name.
method_name_adic = {
    "reflexion": "Reflexion",
    "react": "React",
    "seeker": "WebWalker",
}

# Maps the "system" identifier found in the RAG result file to its display name.
rag_name_adic = {
    "kimi": "Kimi",
    "mindsearch": "MindSearch",
    # The key spelling "navie" is kept on purpose: it is the lookup key used
    # for the result file; only the displayed label is corrected.
    "navie": "Naive RAG",
    "o1": "o1",
    "tongyi": "Tongyi",
    "wenxin": "ERNIE",
    "gemini": "Gemini",
    "gemini_search": "Gemini w/ Search",
    "doubao": "Doubao",
}
# --- Web-traversal agent leaderboard ---------------------------------------
# Parse the results file once. Blank lines (for example a trailing newline at
# the end of the file) are skipped instead of crashing json.loads.
with open("agents_result.jsonl", "r", encoding="utf-8") as f:
    agent_items = [json.loads(line) for line in f if line.strip()]

# [method, model, overall] triples ordered from lowest to highest overall score.
agent_ranking = sorted(
    (
        [method_name_adic[item["method"]], model_name_adic[item["model"]], item["overall"]]
        for item in agent_items
    ),
    key=lambda entry: entry[2],
)

# Position of each (method, model) pair in that ascending order: 0 is the
# lowest overall score and the largest value is the best one. A tuple key is
# used so two different pairs can never produce the same key.
# NOTE(review): the displayed "Ranking" therefore grows with quality rather
# than starting at 1 for the best entry - confirm this is the intended display.
ranking_dict = {
    (method, model): position
    for position, (method, model, _score) in enumerate(agent_ranking)
}

# Result-file keys of the score columns, in the order they appear in AGENT_COLS.
agent_score_keys = ("ss_easy", "ss_medium", "ss_hard", "ms_easy", "ms_medium", "ms_hard", "overall")

agent_rows = []
for item in agent_items:
    method = method_name_adic[item["method"]]
    model = model_name_adic[item["model"]]
    agent_rows.append(
        [
            method,
            model,
            # Scores are scaled by 100 and rendered with two decimals.
            *(f"{item[key] * 100:.2f}" for key in agent_score_keys),
            ranking_dict[(method, model)],
        ]
    )

agent_df = pd.DataFrame.from_records(agent_rows, columns=AGENT_COLS)
# Highest rank value (best overall score) first.
agent_df = agent_df.sort_values(by=[NONE_COL], ascending=False)
agent_df = agent_df[AGENT_COLS]
# --- RAG-system leaderboard -------------------------------------------------
# Column headers of the RAG table, in display order.
RAG_COLS = [
    "System",
    "SS Easy",
    "SS Medium",
    "SS Hard",
    "MS Easy",
    "MS Medium",
    "MS Hard",
    "Overall",
    NONE_COL,
]

# One Gradio datatype per entry of RAG_COLS: one text column, then numbers.
RAG_TYPES = ["str"] + ["number"] * (len(RAG_COLS) - 1)

# Parse the results file once. Blank lines (for example a trailing newline at
# the end of the file) are skipped instead of crashing json.loads.
with open("rag_result.jsonl", "r", encoding="utf-8") as f:
    rag_items = [json.loads(line) for line in f if line.strip()]

# [system, overall] pairs ordered from lowest to highest overall score.
rag_ranking = sorted(
    ([rag_name_adic[item["system"]], item["overall"]] for item in rag_items),
    key=lambda entry: entry[1],
)

# Position of each system in that ascending order: 0 is the lowest overall
# score and the largest value is the best one.
ranking_dict = {system: position for position, (system, _score) in enumerate(rag_ranking)}

# Result-file keys of the score columns, in the order they appear in RAG_COLS.
rag_score_keys = ("ss_easy", "ss_medium", "ss_hard", "ms_easy", "ms_medium", "ms_hard", "overall")

rag_rows = []
for item in rag_items:
    system = rag_name_adic[item["system"]]
    rag_rows.append(
        [
            system,
            # Scores are scaled by 100 and rendered with two decimals.
            *(f"{item[key] * 100:.2f}" for key in rag_score_keys),
            ranking_dict[system],
        ]
    )

rag_df = pd.DataFrame.from_records(rag_rows, columns=RAG_COLS)
# Highest rank value (best overall score) first.
rag_df = rag_df.sort_values(by=[NONE_COL], ascending=False)
rag_df = rag_df[RAG_COLS]
# --- Deep Search Agent leaderboard -----------------------------------------
# Parse the results file once. Blank lines (for example a trailing newline at
# the end of the file) are skipped instead of crashing json.loads.
with open("deepsearch_result.jsonl", "r", encoding="utf-8") as f:
    deep_search_items = [json.loads(line) for line in f if line.strip()]

# [method, model, overall] triples ordered from lowest to highest overall score.
deep_search_ranking = sorted(
    ([item["method"], item["model"], item["overall"]] for item in deep_search_items),
    key=lambda entry: entry[2],
)

# Rank keyed by the raw overall score: 0 is the lowest score. Entries with an
# identical score share one rank (the position of the last of them).
ranking_dict = {
    score: position
    for position, (_method, _model, score) in enumerate(deep_search_ranking)
}

deep_search_columns = ["Organisation", "Method", "Backbone", "Overall", "Link", NONE_COL]

deep_search_df = pd.DataFrame.from_records(
    [
        [
            item["org"],
            item["method"],
            item["model"],
            # Score is scaled by 100 and rendered with two decimals.
            f"{item['overall'] * 100:.2f}",
            item["link"],
            ranking_dict[item["overall"]],
        ]
        for item in deep_search_items
    ],
    columns=deep_search_columns,
)

# "Overall" holds formatted strings, so sorting on it compares text and would
# place "9.50" above "10.20". Order by the numeric rank column instead; the
# stable sort keeps tied rows in file order.
deep_search_df = deep_search_df.sort_values(by=[NONE_COL], ascending=False, kind="stable")
deep_search_df = deep_search_df[deep_search_columns]
# --- Page layout -------------------------------------------------------------
# Title and introduction, one tab per leaderboard table, then the footnote,
# credits and citation.

# Datatypes for the deep search table, derived from its own columns: only the
# score and rank columns are numeric, every other column is text.
deep_search_types = [
    "number" if column in ("Overall", NONE_COL) else "str"
    for column in deep_search_df.columns
]

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")
    gr.Markdown("## Leaderboard")
    with gr.Group():
        with gr.Tab("Results: Deep Search Agent 🤖🔎"):
            leaderboard_table_test = gr.components.Dataframe(
                value=deep_search_df,
                datatype=deep_search_types,
                interactive=False,
                column_widths=["10%", "18%", "18%", "10%"],
            )
        with gr.Tab("Results: Web Traversal Agent 🤖️"):
            leaderboard_table_test = gr.components.Dataframe(
                value=agent_df,
                # Trimmed so the list never has more entries than the table has columns.
                datatype=AGENT_TYPES[: len(agent_df.columns)],
                interactive=False,
                column_widths=["20%"] * len(agent_df.columns),
            )
        with gr.Tab("Results: RAG-system 🔍"):
            leaderboard_table_val = gr.components.Dataframe(
                value=rag_df,
                # Trimmed so the list never has more entries than the table has columns.
                datatype=RAG_TYPES[: len(rag_df.columns)],
                interactive=False,
                column_widths=["20%"],
            )
    gr.Markdown("SS denotes single-source, and MS denotes multi-source. Easy, Medium, and Hard denote the difficulty level of the question.")
    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

# NOTE(review): share=True asks Gradio for a public share link - confirm it is
# still wanted in the environment this app is hosted in.
demo.launch(share=True)