atlas5301 committed on
Commit ccfe614 · 1 Parent(s): a8a1b3d

update format and adjust name
app.py CHANGED

@@ -13,13 +13,13 @@ def main():
 
     # Each "with" block corresponds to a content area for that tab.
     with tabs[0]:
-        from pages import zero_context
-        zero_context.show()
-
-    with tabs[1]:
        from pages import long_context
        long_context.show()
 
+    with tabs[1]:
+        from pages import zero_noise
+        zero_noise.show()
+
     with tabs[2]:
        from pages import benchmark_viewer
        benchmark_viewer.show()
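After this hunk, the long-context page owns tabs[0] and the renamed zero_noise page owns tabs[1]. For orientation, a minimal sketch of the main() the hunk plugs into, assuming the tabs come from st.tabs (the tab labels and page config below are illustrative guesses, not part of the commit):

```python
# Minimal sketch of the surrounding main(); tab labels and page config are assumed.
import streamlit as st


def main():
    st.set_page_config(page_title="Benchmark Explorer", layout="wide")  # hypothetical

    # One tab per page; the order here must match the tabs[i] indices used below.
    tabs = st.tabs(["Long Context", "Zero Noise", "Benchmark Viewer"])

    # Each "with" block corresponds to a content area for that tab.
    with tabs[0]:
        from pages import long_context
        long_context.show()

    with tabs[1]:
        from pages import zero_noise
        zero_noise.show()

    with tabs[2]:
        from pages import benchmark_viewer
        benchmark_viewer.show()


if __name__ == "__main__":
    main()
```

Note that Streamlit executes all three with blocks on every rerun; the tabs only control which block is displayed.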
pages/benchmark_viewer.py CHANGED

@@ -18,7 +18,7 @@ def show():
     epsilon = 1e-6
     num_zero_acc = (df['accuracy'] <= 0).sum()
     if num_zero_acc > 0:
-        st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
+        # st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
        df.loc[df['accuracy'] <= 0, 'accuracy'] = epsilon
 
    df['log_accuracy'] = np.log(df['accuracy'])
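The hunk only silences the st.warning; the epsilon clamp after it still matters, because np.log(0) evaluates to -inf (and the log of a negative value to nan), which would break the log_accuracy column and anything plotted from it. A standalone sketch of that behavior with a made-up DataFrame:

```python
# Standalone sketch of the epsilon clamp kept by this hunk; the DataFrame is made up.
import numpy as np
import pandas as pd

df = pd.DataFrame({"model": ["a", "b", "c"], "accuracy": [0.85, 0.0, 0.42]})

epsilon = 1e-6
num_zero_acc = (df["accuracy"] <= 0).sum()
if num_zero_acc > 0:
    # Without this clamp, np.log(0) below would yield -inf for row "b".
    df.loc[df["accuracy"] <= 0, "accuracy"] = epsilon

df["log_accuracy"] = np.log(df["accuracy"])
# log_accuracy ≈ [-0.163, -13.816, -0.868]; the clamped row gets ln(1e-6) ≈ -13.8
# instead of -inf.
```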
pages/long_context.py CHANGED

@@ -39,9 +39,16 @@ def show():
     - 8K: 8,000 tokens
     - 16K: 16,000 tokens
     - 32K: 32,000 tokens
+
+    **Colors**:
+    - Yellow: reasoning model
+    - Green: linear attention hybrid model
+    - Blue: SSM hybrid model
 
     **Benchmark Details**:
-    - Evaluated on Symbolic, Medium, and Hard subtasks
-    - AUC scores aggregated across context lengths
-    - Larger context evaluations limited by compute constraints and model performance
+    - Evaluated on Symbolic, Medium, and Hard subtasks.
+    - Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
+    - AUC is calculated using np.trapz function.
+    - AUC scores aggregated across context lengths.
+    - Larger context evaluations limited by compute constraints and model performance.
     """)
pages/{zero_context.py → zero_noise.py} RENAMED

@@ -40,8 +40,13 @@ def show():
 
     # You can leave your explanation/description below
     st.markdown("""
+    **Colors**:
+    - Yellow: reasoning model
+    - Green: linear attention hybrid model
+    - Blue: SSM-hybrid model
+
     **Benchmark Details**:
     - Evaluated on Symbolic, Medium, and Hard subtasks.
-    - Area Under Curve Metrics is Used to Compare between LLM Performance.
+    - Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
     - AUC is calculated using np.trapz function.
     """)