Update README.md
Browse files
README.md
CHANGED
@@ -7,49 +7,4 @@ sdk: static
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
-
|
11 |
-
No sorting
|
12 |
-
| Benchmark | Metric [%] | Gemma2-2b-it | QWEN-2-1.5b | GPT-3.5 Turbo | Claude3 Haiku | Llama3 8B | Gemma2 9B It | Gemma2 27B It |
|
13 |
-
| --------------------------- | ---------- | ------------:| -----------:| -------------:| -------------:| ---------:| ------------:| -------------:|
|
14 |
-
| AGREE | ACC | 43.86 | 37.16 | 46.7 | 65.7 | 32.5 | 55.02 | 59.58 |
|
15 |
-
| ANLI | ACC | 39.58 | 36.67 | 44.7 | 51.5 | 43 | 57.17 | 61.33 |
|
16 |
-
| | F1 | 25.39 | 34.53 | 41.9 | 50.8 | 39.2 | 57.32 | 61.32 |
|
17 |
-
| ANLI EN | ACC | 41.33 | 39.92 | 44.25 | 55.34 | 46.8 | 55 | |
|
18 |
-
| | F1 | 38.26 | 33.19 | 40.58 | 54.16 | 38.8 | 52.97 | |
|
19 |
-
| ARC Challenge | ACC | 52.79 | 44.66 | 73.1 | 76.8 | 64.3 | 84.56 | 88.14 |
|
20 |
-
| ARC Challenge EN | ACC | 72.37 | 69.75 | 82.92 | 77.95 | 78.5 | 90.61 | |
|
21 |
-
| ARC Easy | ACC | 69.94 | | 85.8 | 85.3 | 79.4 | 92.28 | 95.01 |
|
22 |
-
| ARC Easy EN | ACC | 87.15 | 85.59 | 93.1 | 89.1 | 91.4 | 96.63 | |
|
23 |
-
| Belebele | ACC | 69.02 | 55.42 | 80.3 | 88.2 | 76.5 | 90.38 | 89.16 |
|
24 |
-
| Belebele EN | ACC | 79.33 | 70.84 | 87.04 | 90.95 | 83.4 | 94.06 | |
|
25 |
-
| CTKFacts | ACC | 56.63 | 55.56 | 61.8 | 69.6 | 61.7 | 64.34 | 70.79 |
|
26 |
-
| | F1 | 45.9 | 48.98 | 47.7 | 62 | 51.6 | 59.05 | 68.44 |
|
27 |
-
| CTKFacts EN | ACC | 46.95 | 50 | 67.56 | 68.06 | 69 | 70.97 | |
|
28 |
-
| | F1 | 44.56 | 39.94 | 63.23 | 62.22 | 65 | 69.32 | |
|
29 |
-
| Czech News | ACC | 62.9 | 17.8 | 78.9 | 81.3 | 71.6 | 79.5 | 82.55 |
|
30 |
-
| | F1 | 60.5 | 14.83 | 78.5 | 81.3 | 70.7 | 78.9 | 81.64 |
|
31 |
-
| Facebook Comments | ACC | 70.7 | 52.8 | 71.5 | 75.8 | 66.8 | 75.2 | 75.24\* |
|
32 |
-
| | F1 | 69.73 | 46.79 | 69 | 74.1 | 64.1 | 74.51 | 73.78\* |
|
33 |
-
| GSM8K | ACC | 28.84 | 18.29 | 64.2 | 78.6 | 67.07\* | \- | 75.71\* |
|
34 |
-
| GSM8K EN | ACC | 66.89 | 54.59 | 83.14 | 88.98 | \- | 87.96 | |
|
35 |
-
| Klokánek | ACC | 23.51 | 21.29 | 29.3 | 24.5 | 21.8 | 29.6 | 28.93 |
|
36 |
-
| Mall Reviews | ACC | 62.1 | 48.87 | 59.8 | 57.7 | 59.5 | 60.93 | 61.03 |
|
37 |
-
| | F1 | 60.13 | 43.89 | 55.4 | 55.2 | 57.3 | 59.88 | 60.38 |
|
38 |
-
| MMLU | ACC | 43.62 | 39.86 | 58 | 67.3 | 46.8 | 54.49 | 68.07 |
|
39 |
-
| MMLU EN | ACC | 56.82 | 53.04 | 64.89 | 73.01 | 53.5 | 49.84 | |
|
40 |
-
| SNLI | ACC | 57.44 | 59.06 | 61.8 | 71.7 | 60.9 | 77.56 | |
|
41 |
-
| | F1 | 57.48 | 58.41 | 51.5 | 70.5 | 58.7 | 77.76 | |
|
42 |
-
| SNLI EN | ACC | 61.61 | 48.74 | 60.57 | 72.74 | 65.5 | 78.29 | |
|
43 |
-
| | F1 | 46.59 | 30.85 | 43.32 | 53.78 | 47.4 | 59.09 | |
|
44 |
-
| SQAD | EM Acc | 67.85 | 58.84 | 66.2 | 59.8 | 67.6 | 78.29 | |
|
45 |
-
| | BoW F1 | 79.18 | 68.83 | 83.5 | 76.3 | 82.5 | 88.77 | |
|
46 |
-
| SQuAD (Generation) | EM Acc | 34.05 | 16.9 | 37.3 | 36.3 | 36.6 | 52.52 | |
|
47 |
-
| | BoW F1 | 39.29 | 22.04 | 43 | 44.7 | 44.5 | 57.87 | |
|
48 |
-
| SQuAD (No-Answer Detection) | EM Acc | 56.2 | 45.75 | 52.4 | 60.3 | 56.8 | 64.88 | |
|
49 |
-
| | BoW F1 | 50.76 | 31.67 | 44.2 | 56.4 | 51.3 | 62.47 | |
|
50 |
-
| Subjectivity | ACC | 68.9 | 50.95 | 80.2 | 81.5 | 78.3 | 85.9 | |
|
51 |
-
| | F1 | 66.79 | 36.62 | 80.2 | 81.2 | 77.6 | 85.68 | |
|
52 |
-
| Subjectivity EN | ACC | 77.6 | 53.1 | 86.8 | 86.6 | 87.1 | 86 | |
|
53 |
-
| | F1 | 76.7 | 53.05 | 86.79 | 86.59 | 87.1 | 85.85 | |
|
54 |
-
| TruthfulQA | ACC | 45.81 | 31.65 | 53.5 | 65.8 | 41 | 62.62 | |
|
55 |
-
| TruthfulQA EN | ACC | 49.75 | 32.02 | 58.5 | 70.81 | 41.6 | 68.06 | |
|
|
|
7 |
pinned: false
|
8 |
---
|
9 |
|
10 |
+
Placeholder README
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|