Presidentlin committed
Commit a9899bf · 1 Parent(s): be79a27
src/App.tsx CHANGED
@@ -9,7 +9,7 @@ import { PricingTable } from "@/components/PricingTable";
 import { BenchmarkTable } from "./components/BenchmarkTable";
 import { benchmarkData } from "./lib/benchmarks/index";
 import { BenchmarkComparisonSelector } from "./components/BenchmarkComparisonSelector";
-
+import { benchmarkMetricOrder } from "./lib/benchmarks/types";
 
 export interface FlattenedModel extends Model {
   provider: string;
@@ -125,32 +125,32 @@ const App: React.FC = () => {
 
   ]);
 
-const sortedBenchmarkedModels = useMemo(() => {
-if (!benchmarkSortConfig) return filteredBenchmarkedModels;
-
-return [...filteredBenchmarkedModels].sort((a, b) => {
-const key = benchmarkSortConfig.key;
-
-const isTopLevelKey = ["provider", "name", "inputPrice", "outputPrice"].includes(key);
-
-const aVal = isTopLevelKey
-? (a as any)[key]
-: a.benchmark?.[key] ?? -Infinity;
-const bVal = isTopLevelKey
-? (b as any)[key]
-: b.benchmark?.[key] ?? -Infinity;
-
-if (typeof aVal === "string" && typeof bVal === "string") {
-return benchmarkSortConfig.direction === "ascending"
-? aVal.localeCompare(bVal)
-: bVal.localeCompare(aVal);
-}
-
-return benchmarkSortConfig.direction === "ascending"
-? aVal - bVal
-: bVal - aVal;
-});
-}, [filteredBenchmarkedModels, benchmarkSortConfig]);
+  const sortedBenchmarkedModels = useMemo(() => {
+    if (!benchmarkSortConfig) return filteredBenchmarkedModels;
+
+    return [...filteredBenchmarkedModels].sort((a, b) => {
+      const key = benchmarkSortConfig.key;
+
+      const isTopLevelKey = ["provider", "name", "inputPrice", "outputPrice"].includes(key);
+
+      const aVal = isTopLevelKey
+        ? (a as any)[key]
+        : a.benchmark?.[key] ?? -Infinity;
+      const bVal = isTopLevelKey
+        ? (b as any)[key]
+        : b.benchmark?.[key] ?? -Infinity;
+
+      if (typeof aVal === "string" && typeof bVal === "string") {
+        return benchmarkSortConfig.direction === "ascending"
+          ? aVal.localeCompare(bVal)
+          : bVal.localeCompare(aVal);
+      }
+
+      return benchmarkSortConfig.direction === "ascending"
+        ? aVal - bVal
+        : bVal - aVal;
+    });
+  }, [filteredBenchmarkedModels, benchmarkSortConfig]);
 
 
   const pricingProviders = useMemo(() => {
@@ -305,7 +305,9 @@ const sortedBenchmarkedModels = useMemo(() => {
         {/* Benchmark Table */}
         <h3 className="text-lg font-semibold mt-12 mb-2">Select Benchmark Metrics to Compare</h3>
         <BenchmarkComparisonSelector
-          allMetrics={Array.from(new Set(benchmarkedModels.flatMap((m) => Object.keys(m.benchmark ?? {})))).sort()}
+          allMetrics={benchmarkMetricOrder.filter(
+            (metric) => benchmarkedModels.some((m) => m.benchmark?.[metric] !== undefined)
+          )}
           selected={benchmarkComparisonMetrics}
           onChange={(metric, checked) =>
             setBenchmarkComparisonMetrics((prev) =>
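For readers skimming the App.tsx hunks: the sortedBenchmarkedModels block appears unchanged apart from indentation. It sorts either on a top-level field (provider, name, inputPrice, outputPrice) or on a nested benchmark score, and a missing score falls back to -Infinity so unscored models land at the bottom of a descending sort. A minimal TypeScript sketch of that comparator follows; the sortModels helper and the sample models are illustrative only and do not exist in the repo.

// Sketch of the comparator used in sortedBenchmarkedModels (App.tsx).
// The standalone helper and sample data are hypothetical; only the
// comparison rules mirror the diff above.

type SortDirection = "ascending" | "descending";

interface SampleModel {
  provider: string;
  name: string;
  inputPrice: number;
  outputPrice: number;
  benchmark?: Record<string, number>;
}

function sortModels(models: SampleModel[], key: string, direction: SortDirection): SampleModel[] {
  const isTopLevelKey = ["provider", "name", "inputPrice", "outputPrice"].includes(key);

  return [...models].sort((a, b) => {
    // Top-level keys read directly off the model; benchmark metrics fall
    // back to -Infinity so models without a score sort last when descending.
    const aVal = isTopLevelKey ? (a as any)[key] : a.benchmark?.[key] ?? -Infinity;
    const bVal = isTopLevelKey ? (b as any)[key] : b.benchmark?.[key] ?? -Infinity;

    if (typeof aVal === "string" && typeof bVal === "string") {
      return direction === "ascending" ? aVal.localeCompare(bVal) : bVal.localeCompare(aVal);
    }
    return direction === "ascending" ? aVal - bVal : bVal - aVal;
  });
}

const demo: SampleModel[] = [
  { provider: "google", name: "A", inputPrice: 1, outputPrice: 2, benchmark: { mmmu: 81.7 } },
  { provider: "google", name: "B", inputPrice: 1, outputPrice: 2 }, // no scores yet
  { provider: "google", name: "C", inputPrice: 1, outputPrice: 2, benchmark: { mmmu: 76.7 } },
];

// Logs ["A", "C", "B"]: scored models first, the unscored one last.
console.log(sortModels(demo, "mmmu", "descending").map((m) => m.name));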
src/lib/benchmarks/google.ts CHANGED
@@ -61,7 +61,7 @@ export const googleBenchmarks: Benchmark[] = [
       humanitys_last_exam: 11.0,
       mrcr_v2_avg_128k: 74.0,
       mrcr_v2_pointwise_1m: 32.0,
-      vibe_eval_reka: 65.4,
+
     },
   },
   {
@@ -79,8 +79,8 @@ export const googleBenchmarks: Benchmark[] = [
       lbpp_v2: 51.1,
       bigcodebench: 44.2,
       mmmu: 76.7,
-      humanitys_last_exam: 12.1,
-      vibe_eval_reka: 62.0,
+      humanitys_last_exam: 12.1
+
     },
   },
   {
@@ -101,7 +101,7 @@ export const googleBenchmarks: Benchmark[] = [
       humanitys_last_exam: 5.1,
       mrcr_v2_avg_128k: 36.0,
       mrcr_v2_pointwise_1m: 6.0,
-      vibe_eval_reka: 56.4,
+
     },
   },
   {
@@ -120,7 +120,7 @@ export const googleBenchmarks: Benchmark[] = [
       swe_bench_verified: 63.2,
       simpleqa: 50.8,
       mmmu: 79.6,
-      vibe_eval_reka: 65.6,
+
       video_mme: 84.8,
       mrcr_v2_avg_128k: 93.0,
       mrcr_v2_pointwise_1m: 82.9,
@@ -143,7 +143,6 @@ export const googleBenchmarks: Benchmark[] = [
       swe_bench_verified: 63.8,
       simpleqa: 52.9,
       mmmu: 81.7,
-      vibe_eval_reka: 69.4,
       mrcr_v2_avg_128k: 94.5,
       mrcr_v2_pointwise_1m: 83.1,
       global_mmlu_lite: 89.8,
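The google.ts hunks only strip the vibe_eval_reka entries; every other score is left as-is. As a quick, hypothetical sanity check (the script below is not part of the repo, and the import path is assumed), one can confirm the metric no longer appears anywhere in the exported data without assuming anything about how the score records are nested:

// Hypothetical check script, not part of the repo: verify that the removed
// metric no longer appears anywhere in the Google benchmark entries.
import { googleBenchmarks } from "./src/lib/benchmarks/google";

const stillPresent = googleBenchmarks.some((entry) =>
  JSON.stringify(entry).includes('"vibe_eval_reka"')
);

console.log(stillPresent ? "vibe_eval_reka is still present" : "vibe_eval_reka fully removed");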
src/lib/benchmarks/types.ts CHANGED
@@ -1,30 +1,36 @@
 export type BenchmarkMetric =
-  | "aime_24"
-  | "aime_2025"
-  | "gpqa"
-  | "gpqa_diamond"
-  | "lcb"
+  // Most common and high-priority
+  | "simpleqa"
   | "mmlu_pro"
+  | "gpqa"
+  | "egoschema"
   | "loft"
-  | "simpleqa"
   | "mmmu"
-  | "egoschema"
-  | "livecodebench_v6"
-  | "bigcodebench"
-  | "lbpp_v2"
-  | "swe_bench_verified"
+  | "lcb"
+  | "aime_24"
+  | "aime_2025"
+  | "gpqa_diamond"
+
+  // Code benchmarks (frequent)
   | "humaneval"
   | "mbpp"
+  | "bigcodebench"
+  | "livecodebench_v6"
+  | "swe_bench_verified"
+  | "lbpp_v2"
+
+  // General reasoning & robustness
   | "bigbench_extra_hard"
   | "global_mmlu_lite"
-  // ADD THESE:
+
+  // Optional: less frequent but still potentially useful
   | "facts_grounding"
   | "humanitys_last_exam"
   | "mrcr_v2_avg_128k"
   | "mrcr_v2_pointwise_1m"
-  | "video_mme"
-  | "vibe_eval_reka";
+  | "video_mme";
 
+// Note: "vibe_eval_reka" is intentionally excluded for now.
 
 export interface Benchmark {
   model: string;
@@ -35,3 +41,36 @@ export interface Benchmark {
   source: string;
   version?: string;
 }
+
+export const benchmarkMetricOrder: BenchmarkMetric[] = [
+  // Most common and high-priority
+  "simpleqa",
+  "mmlu_pro",
+  "gpqa",
+  "egoschema",
+  "loft",
+  "mmmu",
+  "lcb",
+  "aime_24",
+  "aime_2025",
+  "gpqa_diamond",
+
+  // // Code benchmarks (frequent)
+  // "humaneval",
+  // "mbpp",
+  // "bigcodebench",
+  // "livecodebench_v6",
+  // "swe_bench_verified",
+  // "lbpp_v2",
+
+  // // General reasoning & robustness
+  // "bigbench_extra_hard",
+  // "global_mmlu_lite",
+
+  // // Optional: less frequent but still potentially useful
+  // "facts_grounding",
+  // "humanitys_last_exam",
+  // "mrcr_v2_avg_128k",
+  // "mrcr_v2_pointwise_1m",
+  // "video_mme",
+];
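Taken together with the App.tsx change, benchmarkMetricOrder is what pins down the selector's ordering: the curated list defines the order, and the filter from the App.tsx hunk keeps only the metrics that at least one model actually reports. A small sketch under assumed imports follows; the sample models and their scores are made up, and only the filter expression comes from the diff.

import { benchmarkMetricOrder, BenchmarkMetric } from "./src/lib/benchmarks/types";

// Illustrative model shape; the real FlattenedModel lives in App.tsx.
interface SampleModel {
  name: string;
  benchmark?: Partial<Record<BenchmarkMetric, number>>;
}

const models: SampleModel[] = [
  { name: "model-a", benchmark: { simpleqa: 52.9, mmmu: 81.7 } },
  { name: "model-b", benchmark: { mmmu: 76.7, aime_2025: 88.0 } },
  { name: "model-c" }, // no benchmark scores yet
];

// Same expression the commit wires into <BenchmarkComparisonSelector allMetrics={...}>:
// keep the curated order, but only offer metrics that some model actually defines.
const allMetrics = benchmarkMetricOrder.filter((metric) =>
  models.some((m) => m.benchmark?.[metric] !== undefined)
);

console.log(allMetrics); // ["simpleqa", "mmmu", "aime_2025"] given the samples above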