Commit 05dfa56 · Parent: 1b57635
feat: add Groq provider models and show provider info in UI
- Add GPT-OSS-20B, GPT-OSS-120B, and Llama-4-Scout-17B models via Groq provider
- Update models_registry.py to support Groq provider with chat.completions.create
- Add provider information to result dictionary in evaluator.py
- Display provider info in both Evaluate and Global Leaderboard tabs
- Group leaderboard by model_name and provider for accurate comparison (a standalone sketch of this aggregation follows the file list below)
- Enable comparison of the same model across different providers
- app.py +7 -4
- config/models.yaml +31 -1
- src/evaluator.py +1 -0
- src/models_registry.py +1 -1
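
The heart of the change is the two-key aggregation: grouping the leaderboard by both model_name and provider keeps one row per (model, provider) pair, so the same model served by different providers can be ranked side by side. A minimal standalone pandas sketch of that idea (the column names match the app.py diff below; the sample rows are invented for illustration):

```python
import pandas as pd

# Invented sample rows, shaped like the leaderboard records used in app.py
leaderboard = pd.DataFrame([
    {"model_name": "GPT-OSS-20B", "provider": "groq",     "composite_score": 0.82, "latency_ms": 420.0},
    {"model_name": "GPT-OSS-20B", "provider": "together", "composite_score": 0.79, "latency_ms": 910.0},
    {"model_name": "GPT-OSS-20B", "provider": "groq",     "composite_score": 0.86, "latency_ms": 450.0},
])

numeric_columns = ["composite_score", "latency_ms"]

# Grouping on (model_name, provider) keeps per-provider rows separate
agg = leaderboard.groupby(["model_name", "provider"])[numeric_columns].mean().reset_index()

# Combined display name, as in the diff: "GPT-OSS-20B (groq)"
agg["model_display"] = agg["model_name"] + " (" + agg["provider"] + ")"

agg = agg.sort_values("composite_score", ascending=False).reset_index(drop=True)
print(agg)
```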
app.py CHANGED

```diff
@@ -66,8 +66,11 @@ class LeaderboardManager:
         # Group by model and calculate averages
         numeric_columns = ['composite_score', 'correctness_exact', 'result_match_f1', 'exec_success', 'latency_ms']
 
-        # Calculate averages for numeric columns
-        model_aggregated = self.leaderboard.groupby('model_name')[numeric_columns].mean().reset_index()
+        # Calculate averages for numeric columns, keeping provider info
+        model_aggregated = self.leaderboard.groupby(['model_name', 'provider'])[numeric_columns].mean().reset_index()
+
+        # Create combined model name with provider
+        model_aggregated['model_display'] = model_aggregated['model_name'] + ' (' + model_aggregated['provider'] + ')'
 
         # Sort by composite score (descending) to get proper ranking
         model_aggregated = model_aggregated.sort_values('composite_score', ascending=False).reset_index(drop=True)
@@ -82,7 +85,7 @@ class LeaderboardManager:
         leaderboard_config = config_loader.get_leaderboard_config()
         column_mapping = {
             'Rank': 'rank',
-            'Model': 'model_name',
+            'Model': 'model_display',
             'Composite Score': 'composite_score',
             'Correctness': 'correctness_exact',
             'Result F1': 'result_match_f1',
@@ -243,7 +246,7 @@ def run_evaluation(dataset_name: str, dialect: str, case_selection: str,
         # Format for display using config
         results.append([
             len(results) + 1,  # Rank (1-based)
-            model_name,
+            f"{model_name} ({result['provider']})",  # Include provider in model name
             formatting["composite_score"].format(result['composite_score']),
             formatting["correctness_exact"].format(result['correctness_exact']),
             formatting["result_match_f1"].format(result['result_match_f1']),
```
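
One caveat on the second hunk: column_mapping now points 'Model' at the new model_display column, so whatever rendering code consumes the mapping must select that column from the aggregated frame. That selection step is not part of this diff; a hedged sketch of what it typically looks like, reusing agg from the sketch above (the rank assignment is an assumption, not shown in the commit):

```python
# Hypothetical rendering step; the real one lives in code this commit does not touch
column_mapping = {
    "Rank": "rank",
    "Model": "model_display",
    "Composite Score": "composite_score",
}

agg["rank"] = range(1, len(agg) + 1)  # agg is already sorted by composite_score
display_df = agg[list(column_mapping.values())].rename(
    columns={v: k for k, v in column_mapping.items()}
)
```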
config/models.yaml CHANGED

```diff
@@ -37,4 +37,34 @@ models:
       max_new_tokens: 256
       temperature: 0.1
       top_p: 0.9
-    description: "DeepSeek-R1 - DeepSeek's reasoning model via Together AI"
+    description: "DeepSeek-R1 - DeepSeek's reasoning model via Together AI"
+
+  # GPT-OSS-20B with Groq Provider
+  - name: "GPT-OSS-20B (Groq)"
+    provider: "groq"
+    model_id: "openai/gpt-oss-20b"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "GPT-OSS-20B - OpenAI's 20B parameter model via Groq"
+
+  # GPT-OSS-120B with Groq Provider
+  - name: "GPT-OSS-120B (Groq)"
+    provider: "groq"
+    model_id: "openai/gpt-oss-120b"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "GPT-OSS-120B - OpenAI's 120B parameter model via Groq"
+
+  # Llama-4-Scout-17B with Groq Provider
+  - name: "Llama-4-Scout-17B (Groq)"
+    provider: "groq"
+    model_id: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "Llama-4-Scout-17B - Meta's latest multimodal model via Groq"
```
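
For reference, a minimal sketch of consuming entries like these (PyYAML is assumed; the Space's actual loader lives in config code this commit does not touch):

```python
import yaml

with open("config/models.yaml") as f:
    config = yaml.safe_load(f)

# List the newly added Groq-served models and their generation params
for model in config["models"]:
    if model.get("provider") == "groq":
        print(f"{model['name']}: {model['model_id']} -> {model['params']}")
```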
src/evaluator.py CHANGED

```diff
@@ -359,6 +359,7 @@ class Evaluator:
 
         return {
             'model_name': model_name,
+            'provider': model_config.provider,
             'dataset_name': dataset_name,
             'case_id': case_id,
             'dialect': dialect,
```
src/models_registry.py CHANGED

```diff
@@ -86,7 +86,7 @@ class HuggingFaceInference:
         )
 
         # Use different methods based on provider capabilities
-        if provider == "nebius" or provider == "together":
+        if provider == "nebius" or provider == "together" or provider == "groq":
             # Nebius provider only supports conversational tasks, use chat completion
             completion = client.chat.completions.create(
                 model=model_id,
```
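
A side note on style: the growing or-chain could be written as a membership test, `if provider in ("nebius", "together", "groq"):`, and the comment beneath it still mentions only Nebius even though three providers now take this path. For context, a minimal sketch of the chat-completion call this branch makes, assuming a recent huggingface_hub release with Groq provider routing (the prompt is illustrative; InferenceClient picks up HF_TOKEN from the environment when no api_key is passed):

```python
from huggingface_hub import InferenceClient

# provider="groq" routes the request through Groq's OpenAI-compatible chat API
client = InferenceClient(provider="groq")

completion = client.chat.completions.create(
    model="openai/gpt-oss-20b",  # model_id from config/models.yaml
    messages=[{"role": "user", "content": "Count the rows in table `orders` (SQLite)."}],
    max_tokens=256,              # maps from max_new_tokens in the config
    temperature=0.1,
    top_p=0.9,
)
print(completion.choices[0].message.content)
```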