callanwu committed on
Commit
13f10a4
·
1 Parent(s): 3c48f70

add deep search benchmark

Browse files
Files changed (2) hide show
  1. app.py +28 -1
  2. deepsearch_result.jsonl +16 -0
app.py CHANGED
@@ -3,6 +3,8 @@ import pandas as pd
3
  import gradio as gr
4
  from content import *
5
  from css import *
 
 
6
 
7
  NONE_COL = "Ranking"
8
 
@@ -90,6 +92,26 @@ rag_df = pd.DataFrame.from_records(rag_df, columns=RAG_COLS)
90
  rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
91
  rag_df = rag_df[RAG_COLS]
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  demo = gr.Blocks(css=CUSTOM_CSS)
94
  with demo:
95
  gr.HTML(TITLE)
@@ -97,7 +119,12 @@ with demo:
97
  gr.Markdown(HOW_TO, elem_classes="markdown-text")
98
  gr.Markdown("## Leaderboard")
99
  with gr.Group():
100
- with gr.Tab("Results: Agent 🤖️"):
 
 
 
 
 
101
  leaderboard_table_test = gr.components.Dataframe(
102
  value=agent_df, datatype=AGENT_TYPES, interactive=False,
103
  column_widths = ["20%"] * len(agent_df.columns)
 
3
  import gradio as gr
4
  from content import *
5
  from css import *
6
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens *link* in a new tab, labelled *model_name*.

    Both values are HTML-escaped before being interpolated, so a quote,
    ampersand or angle bracket in the data cannot break out of the ``href``
    attribute or inject markup. For values without such characters the
    returned string is identical to plain interpolation.

    Args:
        link: Target URL placed in the ``href`` attribute.
        model_name: Visible link text.

    Returns:
        The ``<a ...>...</a>`` markup as a string.
    """
    # Local import keeps this block self-contained; `html` is standard library.
    from html import escape

    safe_link = escape(str(link), quote=True)
    safe_name = escape(str(model_name), quote=False)
    return f'<a target="_blank" href="{safe_link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{safe_name}</a>'
8
 
9
  NONE_COL = "Ranking"
10
 
 
92
  rag_df = rag_df.sort_values(by=["Ranking"], ascending=False)
93
  rag_df = rag_df[RAG_COLS]
94
 
95
# Build the Deep Search leaderboard table from deepsearch_result.jsonl.
# Every non-blank line of that file is parsed as a JSON object; the keys read
# below are "org", "link", "method", "model" and "overall".
DEEP_SEARCH_COLS = ["Organisation", "Method", "Backbone", "Overall", "Link", NONE_COL]

# Read and parse the file once; blank lines (e.g. a trailing newline) are
# skipped instead of crashing json.loads.
with open("deepsearch_result.jsonl", "r", encoding="utf-8") as f:
    deep_search_items = [json.loads(line) for line in f if line.strip()]

# Ascending by score: an entry's position in this list is its rank value, so
# the best-scoring entry gets the largest number.
deep_search_ranking = sorted(
    ([item["method"], item["model"], item["overall"]] for item in deep_search_items),
    key=lambda x: x[2],
)

# Score -> rank value. Entries with an identical score share one rank value
# (the highest position among them), because later positions overwrite earlier.
ranking_dict = {score: i for i, (_method, _model, score) in enumerate(deep_search_ranking)}

deep_search_df = pd.DataFrame.from_records(
    [
        [
            item["org"],
            item["method"],
            item["model"],
            f"{item['overall'] * 100:.2f}",  # score shown as a percentage string
            item["link"],
            ranking_dict[item["overall"]],
        ]
        for item in deep_search_items
    ],
    columns=DEEP_SEARCH_COLS,
)
# Order by the numeric rank rather than the formatted "Overall" strings:
# string comparison is lexicographic and would place "9.50" above "71.47".
# mergesort is stable, so tied entries keep their file order.
deep_search_df = deep_search_df.sort_values(by=[NONE_COL], ascending=False, kind="mergesort")
deep_search_df = deep_search_df[DEEP_SEARCH_COLS]
113
+
114
+
115
  demo = gr.Blocks(css=CUSTOM_CSS)
116
  with demo:
117
  gr.HTML(TITLE)
 
119
  gr.Markdown(HOW_TO, elem_classes="markdown-text")
120
  gr.Markdown("## Leaderboard")
121
  with gr.Group():
122
+ with gr.Tab("Results: Deep Search Agent 🤖🔎"):
123
+ leaderboard_table_test = gr.components.Dataframe(
124
+ value=deep_search_df, datatype=AGENT_TYPES, interactive=False,
125
+ column_widths = ["10%", "18%", "18%", "10%"]
126
+ )
127
+ with gr.Tab("Results: Web Traversal Agent 🤖️"):
128
  leaderboard_table_test = gr.components.Dataframe(
129
  value=agent_df, datatype=AGENT_TYPES, interactive=False,
130
  column_widths = ["20%"] * len(agent_df.columns)
deepsearch_result.jsonl ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"org": "RUC","link": "https://github.com/RUC-NLPIR/WebThinker","method": "WebThinker-Base", "model": "qwq-32B", "overall": 0.419}
2
+ {"org": "RUC","link": "https://github.com/RUC-NLPIR/WebThinker","method": "WebThinker-RL", "model": "qwq-32B", "overall": 0.465}
3
+ {"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwen2.5-7b-instruct", "overall": 0.36}
4
+ {"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwen2.5-32b-instruct", "overall": 0.384}
5
+ {"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebDancer", "model": "qwq-32b", "overall": 0.479}
6
+ {"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwen2.5-32b-instruct", "overall": 0.514}
7
+ {"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwq-32b", "overall": 0.497}
8
+ {"org": "Alibaba","link": "https://github.com/Alibaba-NLP/WebAgent/","method": "WebShaper", "model": "qwen2.5-72b-instruct", "overall": 0.522}
9
+ {"org": "Tencent","link": "https://github.com/TencentCloudADP/youtu-agent","method": "Youtu-agent", "model": "deepseek-v3.1", "overall": 0.7147}
10
+ {"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-SFT-v0.1", "model": "qwen3-8b", "overall": 0.413}
11
+ {"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-DPO-v0.1", "model": "qwen3-8b", "overall": 0.457}
12
+ {"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-SFT-v0.1", "model": "qwen3-32b", "overall": 0.457}
13
+ {"org": "Miromind","link": "https://github.com/MiroMindAI/MiroThinker","method": "MiroThinker-DPO-v0.1", "model": "qwen3-32b", "overall": 0.493}
14
+ {"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-SFT", "model": "qwen2.5-32b-instruct", "overall": 0.615}
15
+ {"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-RL", "model": "qwen2.5-32b-instruct", "overall": 0.630}
16
+ {"org": "OPPO","link": "https://github.com/OPPO-PersonalAI/Agent_Foundation_Models","method": "AFM-RL", "model": "qwen2.5-7b-instruct", "overall": 0.556}