Spaces:
Running
on
Zero
Running
on
Zero
Update eval.jsonl
Browse files- eval.jsonl +6 -6
eval.jsonl
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
{"model_id": "Gemini 2.5 Pro", "benchmark": "MMLU", "subject": "ALL", "accuracy": 88.00, "sample_count": 1000, "timestamp": "2025-06-25T14:17:00.000000"}
|
2 |
-
{"model_id": "ChatGPT 4.5", "benchmark": "MMLU", "subject": "ALL", "accuracy": 86.50, "sample_count": 1000, "timestamp": "2025-06-25T14:17:01.000000"}
|
3 |
-
{"model_id": "Llama
|
4 |
-
{"model_id": "Qwen3
|
5 |
-
{"model_id": "Mistral-Small-3.2-24B-Instruct-2506", "benchmark": "MMLU", "subject": "ALL", "accuracy": 84.80, "sample_count": 1000, "timestamp": "2025-06-25T14:17:04.000000"}
|
6 |
-
{"model_id": "Claude 4 Opus", "benchmark": "MMLU", "subject": "ALL", "accuracy": 89.10, "sample_count": 1000, "timestamp": "2025-06-25T14:17:05.000000"}
|
|
|
1 |
+
{"model_id": "Gemini 2.5 Pro(Not on hub)", "benchmark": "MMLU", "subject": "ALL", "accuracy": 88.00, "sample_count": 1000, "timestamp": "2025-06-25T14:17:00.000000"}
|
2 |
+
{"model_id": "ChatGPT 4.5(Not on hub)", "benchmark": "MMLU", "subject": "ALL", "accuracy": 86.50, "sample_count": 1000, "timestamp": "2025-06-25T14:17:01.000000"}
|
3 |
+
{"model_id": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "benchmark": "MMLU", "subject": "ALL", "accuracy": 85.00, "sample_count": 1000, "timestamp": "2025-06-25T14:17:02.000000"}
|
4 |
+
{"model_id": "Qwen/Qwen3-235B-A22B", "benchmark": "MMLU", "subject": "ALL", "accuracy": 87.20, "sample_count": 1000, "timestamp": "2025-06-25T14:17:03.000000"}
|
5 |
+
{"model_id": "Mistralai/Mistral-Small-3.2-24B-Instruct-2506", "benchmark": "MMLU", "subject": "ALL", "accuracy": 84.80, "sample_count": 1000, "timestamp": "2025-06-25T14:17:04.000000"}
|
6 |
+
{"model_id": "Claude 4 Opus(Not on hub)", "benchmark": "MMLU", "subject": "ALL", "accuracy": 89.10, "sample_count": 1000, "timestamp": "2025-06-25T14:17:05.000000"}
|