uparekh01151 committed
Commit 05dfa56 · 1 Parent(s): 1b57635

feat: add Groq provider models and show provider info in UI


- Add GPT-OSS-20B, GPT-OSS-120B, and Llama-4-Scout-17B models via Groq provider
- Update models_registry.py to support Groq provider with chat.completions.create
- Add provider information to result dictionary in evaluator.py
- Display provider info in both Evaluate and Global Leaderboard tabs
- Group leaderboard by model_name and provider for accurate comparison
- Enable comparison of same model across different providers

Files changed (4)
  1. app.py +7 -4
  2. config/models.yaml +31 -1
  3. src/evaluator.py +1 -0
  4. src/models_registry.py +1 -1
app.py CHANGED
@@ -66,8 +66,11 @@ class LeaderboardManager:
         # Group by model and calculate averages
         numeric_columns = ['composite_score', 'correctness_exact', 'result_match_f1', 'exec_success', 'latency_ms']

-        # Calculate averages for numeric columns
-        model_aggregated = self.leaderboard.groupby('model_name')[numeric_columns].mean().reset_index()
+        # Calculate averages for numeric columns, keeping provider info
+        model_aggregated = self.leaderboard.groupby(['model_name', 'provider'])[numeric_columns].mean().reset_index()
+
+        # Create combined model name with provider
+        model_aggregated['model_display'] = model_aggregated['model_name'] + ' (' + model_aggregated['provider'] + ')'

         # Sort by composite score (descending) to get proper ranking
         model_aggregated = model_aggregated.sort_values('composite_score', ascending=False).reset_index(drop=True)

@@ -82,7 +85,7 @@
         leaderboard_config = config_loader.get_leaderboard_config()
         column_mapping = {
             'Rank': 'rank',
-            'Model': 'model_name',
+            'Model': 'model_display',
             'Composite Score': 'composite_score',
             'Correctness': 'correctness_exact',
             'Result F1': 'result_match_f1',

@@ -243,7 +246,7 @@ def run_evaluation(dataset_name: str, dialect: str, case_selection: str,
        # Format for display using config
        results.append([
            len(results) + 1,  # Rank (1-based)
-            model_name,
+            f"{model_name} ({result['provider']})",  # Include provider in model name
            formatting["composite_score"].format(result['composite_score']),
            formatting["correctness_exact"].format(result['correctness_exact']),
            formatting["result_match_f1"].format(result['result_match_f1']),
config/models.yaml CHANGED
@@ -37,4 +37,34 @@ models:
       max_new_tokens: 256
       temperature: 0.1
       top_p: 0.9
-    description: "DeepSeek-R1 - DeepSeek's reasoning model via Together AI"
+    description: "DeepSeek-R1 - DeepSeek's reasoning model via Together AI"
+
+  # GPT-OSS-20B with Groq Provider
+  - name: "GPT-OSS-20B (Groq)"
+    provider: "groq"
+    model_id: "openai/gpt-oss-20b"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "GPT-OSS-20B - OpenAI's 20B parameter model via Groq"
+
+  # GPT-OSS-120B with Groq Provider
+  - name: "GPT-OSS-120B (Groq)"
+    provider: "groq"
+    model_id: "openai/gpt-oss-120b"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "GPT-OSS-120B - OpenAI's 120B parameter model via Groq"
+
+  # Llama-4-Scout-17B with Groq Provider
+  - name: "Llama-4-Scout-17B (Groq)"
+    provider: "groq"
+    model_id: "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+    params:
+      max_new_tokens: 256
+      temperature: 0.1
+      top_p: 0.9
+    description: "Llama-4-Scout-17B - Meta's latest multimodal model via Groq"
src/evaluator.py CHANGED
@@ -359,6 +359,7 @@ class Evaluator:

         return {
             'model_name': model_name,
+            'provider': model_config.provider,
             'dataset_name': dataset_name,
             'case_id': case_id,
             'dialect': dialect,
src/models_registry.py CHANGED
@@ -86,7 +86,7 @@ class HuggingFaceInference:
            )

        # Use different methods based on provider capabilities
-        if provider == "nebius" or provider == "together":
+        if provider == "nebius" or provider == "together" or provider == "groq":
            # Nebius provider only supports conversational tasks, use chat completion
            completion = client.chat.completions.create(
                model=model_id,
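All three providers are routed through the chat-completions path. A minimal sketch of that call, assuming the registry wraps huggingface_hub's InferenceClient with Groq routing enabled (a recent huggingface_hub release) and that the YAML's max_new_tokens is mapped to the chat API's max_tokens; the actual client setup in models_registry.py may differ:

```python
import os
from huggingface_hub import InferenceClient

# Provider selection mirrors the branch above ("nebius" / "together" / "groq").
client = InferenceClient(provider="groq", api_key=os.environ["HF_TOKEN"])

completion = client.chat.completions.create(
    model="openai/gpt-oss-20b",  # model_id from config/models.yaml
    messages=[{"role": "user", "content": "Write a SQL query that counts rows in users."}],
    max_tokens=256,              # assumed mapping from max_new_tokens
    temperature=0.1,
    top_p=0.9,
)
print(completion.choices[0].message.content)
```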