atlas5301 committed
Commit ccfe614 · Parent: a8a1b3d

update format and adjust name

Changed files:
- app.py (+4 -4)
- pages/benchmark_viewer.py (+1 -1)
- pages/long_context.py (+10 -3)
- pages/{zero_context.py → zero_noise.py} (+6 -1)
app.py
CHANGED

@@ -13,13 +13,13 @@ def main():
 
     # Each "with" block corresponds to a content area for that tab.
     with tabs[0]:
-        from pages import zero_context
-        zero_context.show()
-
-    with tabs[1]:
         from pages import long_context
         long_context.show()
 
+    with tabs[1]:
+        from pages import zero_noise
+        zero_noise.show()
+
     with tabs[2]:
         from pages import benchmark_viewer
         benchmark_viewer.show()
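For orientation, here is a minimal sketch of what main() plausibly looks like after this hunk is applied, assuming a Streamlit st.tabs layout; the tab labels and surrounding boilerplate are assumptions, only the three with-blocks appear in the diff:

```python
import streamlit as st


def main():
    # Hypothetical tab labels; only the with-blocks below are taken from the commit.
    tabs = st.tabs(["Long Context", "Zero Noise", "Benchmark Viewer"])

    with tabs[0]:
        from pages import long_context
        long_context.show()

    with tabs[1]:
        from pages import zero_noise
        zero_noise.show()

    with tabs[2]:
        from pages import benchmark_viewer
        benchmark_viewer.show()


if __name__ == "__main__":
    main()
```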
pages/benchmark_viewer.py
CHANGED

@@ -18,7 +18,7 @@ def show():
     epsilon = 1e-6
     num_zero_acc = (df['accuracy'] <= 0).sum()
     if num_zero_acc > 0:
-        st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
+        # st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
         df.loc[df['accuracy'] <= 0, 'accuracy'] = epsilon
 
     df['log_accuracy'] = np.log(df['accuracy'])
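The change above only silences the warning; the epsilon clamp itself is unchanged. For reference, a standalone sketch of why the clamp has to run before np.log (the 'accuracy' column name and epsilon value come from the diff, the sample values are made up):

```python
import numpy as np
import pandas as pd

# Toy results frame with one zero-accuracy row, standing in for real benchmark output.
df = pd.DataFrame({"accuracy": [0.0, 0.25, 0.9]})

epsilon = 1e-6
num_zero_acc = (df["accuracy"] <= 0).sum()
if num_zero_acc > 0:
    # Without this clamp, np.log(0) would produce -inf and break downstream plots.
    df.loc[df["accuracy"] <= 0, "accuracy"] = epsilon

df["log_accuracy"] = np.log(df["accuracy"])
print(df)
```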
pages/long_context.py
CHANGED

@@ -39,9 +39,16 @@ def show():
     - 8K: 8,000 tokens
     - 16K: 16,000 tokens
     - 32K: 32,000 tokens
+
+    **Colors**:
+    - Yellow: reasoning model
+    - Green: linear attention hybrid model
+    - Blue: SSM hybrid model
 
     **Benchmark Details**:
-    - Evaluated on Symbolic, Medium, and Hard subtasks
-    - AUC
-    -
+    - Evaluated on Symbolic, Medium, and Hard subtasks.
+    - Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
+    - AUC is calculated using np.trapz function.
+    - AUC scores aggregated across context lengths.
+    - Larger context evaluations limited by compute constraints and model performance.
     """)
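The added details say AUC is computed with np.trapz over accuracy across context lengths. A rough sketch of that aggregation with placeholder numbers; whether the app normalizes by the context-length range is not shown in the diff, so the normalization below is purely illustrative:

```python
import numpy as np

# Hypothetical accuracies of one model at the evaluated context lengths.
context_lengths = np.array([8_000, 16_000, 32_000])
accuracy = np.array([0.82, 0.74, 0.61])

# np.trapz was renamed to np.trapezoid in NumPy 2.0; fall back gracefully.
trapz = getattr(np, "trapezoid", None) or np.trapz

# Trapezoidal area under the accuracy-vs-context-length curve,
# normalized by the spanned range so the score stays in [0, 1].
auc = trapz(accuracy, x=context_lengths)
normalized_auc = auc / (context_lengths[-1] - context_lengths[0])
print(f"AUC: {auc:.1f}, normalized: {normalized_auc:.3f}")
```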
pages/{zero_context.py → zero_noise.py}
RENAMED

@@ -40,8 +40,13 @@ def show():
 
     # You can leave your explanation/description below
     st.markdown("""
+    **Colors**:
+    - Yellow: reasoning model
+    - Green: linear attention hybrid model
+    - Blue: SSM-hybrid model
+
     **Benchmark Details**:
     - Evaluated on Symbolic, Medium, and Hard subtasks.
-    - Area Under Curve Metrics is Used to Compare between LLM Performance.
+    - Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
     - AUC is calculated using np.trapz function.
     """)