Presidentlin committed on
Commit
be79a27
·
1 Parent(s): f23b928
src/App.tsx CHANGED
@@ -125,30 +125,33 @@ const App: React.FC = () => {
125
 
126
  ]);
127
 
 
 
128
 
129
- const sortedBenchmarkedModels = useMemo(() => {
130
- if (!benchmarkSortConfig) return filteredBenchmarkedModels;
131
 
132
- return [...benchmarkedModels].sort((a, b) => {
133
- const key = benchmarkSortConfig.key;
134
- const aVal = key === "provider" || key === "name"
135
- ? (a as any)[key]?.toLowerCase?.() ?? ""
136
- : a.benchmark?.[key] ?? -Infinity;
137
- const bVal = key === "provider" || key === "name"
138
- ? (b as any)[key]?.toLowerCase?.() ?? ""
139
- : b.benchmark?.[key] ?? -Infinity;
140
 
141
- if (typeof aVal === "string" && typeof bVal === "string") {
142
- return benchmarkSortConfig.direction === "ascending"
143
- ? aVal.localeCompare(bVal)
144
- : bVal.localeCompare(aVal);
145
- }
 
146
 
 
147
  return benchmarkSortConfig.direction === "ascending"
148
- ? aVal - bVal
149
- : bVal - aVal;
150
- });
151
- }, [filteredBenchmarkedModels, benchmarkSortConfig]);
 
 
 
 
 
 
152
 
153
  const pricingProviders = useMemo(() => {
154
  const grouped: Record<string, FlattenedModel[]> = {};
 
125
 
126
  ]);
127
 
128
+ const sortedBenchmarkedModels = useMemo(() => {
129
+ if (!benchmarkSortConfig) return filteredBenchmarkedModels;
130
 
131
+ return [...filteredBenchmarkedModels].sort((a, b) => {
132
+ const key = benchmarkSortConfig.key;
133
 
134
+ const isTopLevelKey = ["provider", "name", "inputPrice", "outputPrice"].includes(key);
 
 
 
 
 
 
 
135
 
136
+ const aVal = isTopLevelKey
137
+ ? (a as any)[key]
138
+ : a.benchmark?.[key] ?? -Infinity;
139
+ const bVal = isTopLevelKey
140
+ ? (b as any)[key]
141
+ : b.benchmark?.[key] ?? -Infinity;
142
 
143
+ if (typeof aVal === "string" && typeof bVal === "string") {
144
  return benchmarkSortConfig.direction === "ascending"
145
+ ? aVal.localeCompare(bVal)
146
+ : bVal.localeCompare(aVal);
147
+ }
148
+
149
+ return benchmarkSortConfig.direction === "ascending"
150
+ ? aVal - bVal
151
+ : bVal - aVal;
152
+ });
153
+ }, [filteredBenchmarkedModels, benchmarkSortConfig]);
154
+
155
 
156
  const pricingProviders = useMemo(() => {
157
  const grouped: Record<string, FlattenedModel[]> = {};
src/lib/benchmarks/google.ts CHANGED
@@ -40,4 +40,115 @@ export const googleBenchmarks: Benchmark[] = [
40
  },
41
  source: "https://deepmind.google/models/gemini-diffusion/",
42
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  ];
 
40
  },
41
  source: "https://deepmind.google/models/gemini-diffusion/",
42
  },
43
+
44
+ {
45
+ model: "Gemini 2.5 Flash Preview (05-20)",
46
+ provider: "Google",
47
+ inputPrice: 0.15,
48
+ outputPrice: 3.5,
49
+ source: "https://ai.google.dev/gemini-api/docs/thinking",
50
+ benchmark: {
51
+ aime_2025: 72.0,
52
+ gpqa_diamond: 82.8,
53
+ simpleqa: 26.9,
54
+ global_mmlu_lite: 88.4,
55
+ swe_bench_verified: 60.4,
56
+ livecodebench_v6: 63.9,
57
+ mmmu: 79.7,
58
+ lbpp_v2: 61.9,
59
+ bigcodebench: 56.7,
60
+ facts_grounding: 85.3,
61
+ humanitys_last_exam: 11.0,
62
+ mrcr_v2_avg_128k: 74.0,
63
+ mrcr_v2_pointwise_1m: 32.0,
64
+ vibe_eval_reka: 65.4,
65
+ },
66
+ },
67
+ {
68
+ model: "Gemini 2.5 Flash Preview (04-17) Thinking",
69
+ provider: "Google",
70
+ inputPrice: 0.15,
71
+ outputPrice: 3.5,
72
+ source: "https://ai.google.dev/gemini-api/docs/thinking",
73
+ benchmark: {
74
+ aime_2025: 78.0,
75
+ gpqa_diamond: 78.3,
76
+ simpleqa: 29.7,
77
+ global_mmlu_lite: 88.4,
78
+ livecodebench_v6: 63.5,
79
+ lbpp_v2: 51.1,
80
+ bigcodebench: 44.2,
81
+ mmmu: 76.7,
82
+ humanitys_last_exam: 12.1,
83
+ vibe_eval_reka: 62.0,
84
+ },
85
+ },
86
+ {
87
+ model: "Gemini 2.0 Flash",
88
+ provider: "Google",
89
+ inputPrice: 0.1,
90
+ outputPrice: 0.4,
91
+ source: "https://ai.google.dev/gemini-api/docs/thinking",
92
+ benchmark: {
93
+ aime_2025: 27.5,
94
+ gpqa_diamond: 60.1,
95
+ simpleqa: 29.9,
96
+ global_mmlu_lite: 83.4,
97
+ livecodebench_v6: 34.5,
98
+ lbpp_v2: 22.2,
99
+ mmmu: 71.7,
100
+ facts_grounding: 84.6,
101
+ humanitys_last_exam: 5.1,
102
+ mrcr_v2_avg_128k: 36.0,
103
+ mrcr_v2_pointwise_1m: 6.0,
104
+ vibe_eval_reka: 56.4,
105
+ },
106
+ },
107
+ {
108
+ model: "Gemini 2.5 Pro Preview (05-06)",
109
+ provider: "Google",
110
+ inputPrice: 2.5,
111
+ outputPrice: 15.0,
112
+ source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
113
+ benchmark: {
114
+ humanitys_last_exam: 17.8,
115
+ gpqa_diamond: 83.0,
116
+ aime_2025: 83.0,
117
+ livecodebench_v6: 75.6,
118
+ lbpp_v2: 76.5,
119
+ bigcodebench: 72.7,
120
+ swe_bench_verified: 63.2,
121
+ simpleqa: 50.8,
122
+ mmmu: 79.6,
123
+ vibe_eval_reka: 65.6,
124
+ video_mme: 84.8,
125
+ mrcr_v2_avg_128k: 93.0,
126
+ mrcr_v2_pointwise_1m: 82.9,
127
+ global_mmlu_lite: 88.6,
128
+ },
129
+ },
130
+ {
131
+ model: "Gemini 2.5 Pro Experimental (03-25)",
132
+ provider: "Google",
133
+ inputPrice: 2.5,
134
+ outputPrice: 15.0,
135
+ source: "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
136
+ benchmark: {
137
+ humanitys_last_exam: 18.8,
138
+ gpqa_diamond: 84.0,
139
+ aime_2025: 86.7,
140
+ livecodebench_v6: 70.4,
141
+ lbpp_v2: 74.0,
142
+ bigcodebench: 68.6,
143
+ swe_bench_verified: 63.8,
144
+ simpleqa: 52.9,
145
+ mmmu: 81.7,
146
+ vibe_eval_reka: 69.4,
147
+ mrcr_v2_avg_128k: 94.5,
148
+ mrcr_v2_pointwise_1m: 83.1,
149
+ global_mmlu_lite: 89.8,
150
+ },
151
+ },
152
+
153
+
154
  ];
src/lib/benchmarks/types.ts CHANGED
@@ -16,13 +16,21 @@ export type BenchmarkMetric =
16
  | "humaneval"
17
  | "mbpp"
18
  | "bigbench_extra_hard"
19
- | "global_mmlu_lite";
 
 
 
 
 
 
 
 
20
 
21
  export interface Benchmark {
22
  model: string;
23
  provider: string;
24
  benchmark: Partial<Record<BenchmarkMetric, number>>;
25
- inputPrice: number;
26
  outputPrice: number;
27
  source: string;
28
  version?: string;
 
16
  | "humaneval"
17
  | "mbpp"
18
  | "bigbench_extra_hard"
19
+ | "global_mmlu_lite"
20
+ // ADD THESE:
21
+ | "facts_grounding"
22
+ | "humanitys_last_exam"
23
+ | "mrcr_v2_avg_128k"
24
+ | "mrcr_v2_pointwise_1m"
25
+ | "video_mme"
26
+ | "vibe_eval_reka";
27
+
28
 
29
  export interface Benchmark {
30
  model: string;
31
  provider: string;
32
  benchmark: Partial<Record<BenchmarkMetric, number>>;
33
+ inputPrice: number;
34
  outputPrice: number;
35
  source: string;
36
  version?: string;