Spaces:
Running
Running
add deep search benchmark
Browse files- app.py +28 -1
- deepsearch_result.jsonl +16 -0
app.py
CHANGED
@@ -3,6 +3,8 @@ import pandas as pd
|
|
3 |
import gradio as gr
|
4 |
from content import *
|
5 |
from css import *
|
|
|
|
|
6 |
|
7 |
NONE_COL = "Ranking"
|
8 |
|
@@ -90,6 +92,26 @@ rag_df = pd.DataFrame.from_records(rag_df, columns=RAG_COLS)
|
|
90 |
rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
|
91 |
rag_df = rag_df[RAG_COLS]
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
demo = gr.Blocks(css=CUSTOM_CSS)
|
94 |
with demo:
|
95 |
gr.HTML(TITLE)
|
@@ -97,7 +119,12 @@ with demo:
|
|
97 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
98 |
gr.Markdown("## Leaderboard")
|
99 |
with gr.Group():
|
100 |
-
with gr.Tab("Results: Agent
|
|
|
|
|
|
|
|
|
|
|
101 |
leaderboard_table_test = gr.components.Dataframe(
|
102 |
value=agent_df, datatype=AGENT_TYPES, interactive=False,
|
103 |
column_widths = ["20%"] * len(agent_df.columns)
|
|
|
3 |
import gradio as gr
|
4 |
from content import *
|
5 |
from css import *
|
6 |
+
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens *link* in a new tab, labelled *model_name*.

    Both values are HTML-escaped before being interpolated, so characters
    such as ``"``, ``&`` or ``<`` in the data cannot terminate the ``href``
    attribute or inject markup into the rendered table cell.

    Args:
        link: Target URL of the anchor.
        model_name: Visible text of the anchor.

    Returns:
        The anchor element as an HTML string.
    """
    # Local import so this block does not depend on the file-level import list.
    import html

    safe_link = html.escape(str(link), quote=True)
    safe_name = html.escape(str(model_name))
    return f'<a target="_blank" href="{safe_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{safe_name}</a>'
|
8 |
|
9 |
NONE_COL = "Ranking"
|
10 |
|
|
|
92 |
rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
|
93 |
rag_df = rag_df[RAG_COLS]
|
94 |
|
95 |
+
# Build the "Deep Search Agent" leaderboard table from deepsearch_result.jsonl.
# Each non-blank line of the file is one JSON record with the keys
# "org", "link", "method", "model" and "overall".
# The file is parsed once; blank lines are skipped so that a trailing newline
# or an empty separator line does not raise a JSONDecodeError.
with open("deepsearch_result.jsonl", "r", encoding="utf-8") as f:
    deep_search_items = [json.loads(line) for line in f if line.strip()]

# [method, model, overall] triples ordered from lowest to highest score.
deep_search_ranking = sorted(
    ([item["method"], item["model"], item["overall"]] for item in deep_search_items),
    key=lambda x: x[2],
    reverse=False,
)

# Map each score to its position in the ascending order above. Entries with
# the same score share one rank (the highest position among the tied entries).
ranking_dict = {score: i for i, (_, _, score) in enumerate(deep_search_ranking)}

deep_search_df = pd.DataFrame.from_records(
    [
        [
            item["org"],
            item["method"],
            item["model"],
            f"{item['overall'] * 100:.2f}",  # displayed as a percentage string
            item["link"],
            ranking_dict[item["overall"]],
        ]
        for item in deep_search_items
    ],
    columns=["Organisation", "Method", "Backbone", "Overall", "Link", NONE_COL],
)

# Sort on the numeric rank rather than on "Overall": that column holds
# formatted strings, so sorting it compares text ("9.50" > "71.47") instead
# of values. A stable sort keeps tied entries in file order.
deep_search_df = deep_search_df.sort_values(by=[NONE_COL], ascending=False, kind="stable")
deep_search_df = deep_search_df[["Organisation", "Method", "Backbone", "Overall", "Link", NONE_COL]]
|
113 |
+
|
114 |
+
|
115 |
demo = gr.Blocks(css=CUSTOM_CSS)
|
116 |
with demo:
|
117 |
gr.HTML(TITLE)
|
|
|
119 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
120 |
gr.Markdown("## Leaderboard")
|
121 |
with gr.Group():
|
122 |
+
with gr.Tab("Results: Deep Search Agent 🤖🔎"):
|
123 |
+
leaderboard_table_test = gr.components.Dataframe(
|
124 |
+
value=deep_search_df, datatype=AGENT_TYPES, interactive=False,
|
125 |
+
column_widths = ["10%", "18%", "18%", "10%"]
|
126 |
+
)
|
127 |
+
with gr.Tab("Results: Web Traversal Agent 🤖️"):
|
128 |
leaderboard_table_test = gr.components.Dataframe(
|
129 |
value=agent_df, datatype=AGENT_TYPES, interactive=False,
|
130 |
column_widths = ["20%"] * len(agent_df.columns)
|
deepsearch_result.jsonl
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"org": "RUC","link": "https://github.com/RUC-NLPIR/WebThinker","method": "WebThinker-Base", "model": "qwq-32B", "overall": 0.419}
|
2 |
+
{"org": "RUC","link": "https://github.com/RUC-NLPIR/WebThinker","method": "WebThinker-RL", "model": "qwq-32B", "overall": 0.465}
|
3 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwen2.5-7b-instruct", "overall": 0.36}
|
4 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwen2.5-32b-instruct", "overall": 0.384}
|
5 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwq-32b", "overall": 0.479}
|
6 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwen2.5-32b-instruct", "overall": 0.514}
|
7 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwq-32b", "overall": 0.497}
|
8 |
+
{"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwen2.5-72b-instruct", "overall": 0.522}
|
9 |
+
{"org": "Tencent","link": "https://github.com/TencentCloudADP/youtu-agent","method": "Youtu-agent", "model": "deepseek-v3.1", "overall": 0.7147}
|
10 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-SFT-v0.1", "model": "qwen3-8b", "overall": 0.413}
|
11 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-DPO-v0.1", "model": "qwen3-8b", "overall": 0.457}
|
12 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-SFT-v0.1", "model": "qwen3-32b", "overall": 0.457}
|
13 |
+
{"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-DPO-v0.1", "model": "qwen3-32b", "overall": 0.493}
|
14 |
+
{"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-SFT", "model": "qwen2.5-32b-instruct", "overall": 0.615}
|
15 |
+
{"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-RL", "model": "qwen2.5-32b-instruct", "overall": 0.630}
|
16 |
+
{"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-RL", "model": "qwen2.5-7b-instruct", "overall": 0.556}
|