atlas5301 committed
Commit ccfe614 · Parent: a8a1b3d

update format and adjust name

Changed files:
- app.py (+4 -4)
- pages/benchmark_viewer.py (+1 -1)
- pages/long_context.py (+10 -3)
- pages/{zero_context.py → zero_noise.py} (+6 -1)
app.py
CHANGED

@@ -13,13 +13,13 @@ def main():
 
     # Each "with" block corresponds to a content area for that tab.
     with tabs[0]:
-        from pages import zero_context
-        zero_context.show()
-
-    with tabs[1]:
         from pages import long_context
         long_context.show()
 
+    with tabs[1]:
+        from pages import zero_noise
+        zero_noise.show()
+
     with tabs[2]:
         from pages import benchmark_viewer
         benchmark_viewer.show()
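For orientation, here is a minimal sketch of what main() plausibly looks like after this hunk is applied, assuming a Streamlit st.tabs layout; the tab labels and surrounding boilerplate are assumptions, only the three with-blocks appear in the diff:

```python
import streamlit as st


def main():
    # Hypothetical tab labels; only the with-blocks below are taken from the commit.
    tabs = st.tabs(["Long Context", "Zero Noise", "Benchmark Viewer"])

    with tabs[0]:
        from pages import long_context
        long_context.show()

    with tabs[1]:
        from pages import zero_noise
        zero_noise.show()

    with tabs[2]:
        from pages import benchmark_viewer
        benchmark_viewer.show()


if __name__ == "__main__":
    main()
```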
pages/benchmark_viewer.py
CHANGED

@@ -18,7 +18,7 @@ def show():
     epsilon = 1e-6
     num_zero_acc = (df['accuracy'] <= 0).sum()
     if num_zero_acc > 0:
-        st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
+        # st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
         df.loc[df['accuracy'] <= 0, 'accuracy'] = epsilon
 
     df['log_accuracy'] = np.log(df['accuracy'])
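The change above only silences the warning; the epsilon clamp itself is unchanged. For reference, a standalone sketch of why the clamp has to run before np.log (the 'accuracy' column name and epsilon value come from the diff, the sample values are made up):

```python
import numpy as np
import pandas as pd

# Toy results frame with one zero-accuracy row, standing in for real benchmark output.
df = pd.DataFrame({"accuracy": [0.0, 0.25, 0.9]})

epsilon = 1e-6
num_zero_acc = (df["accuracy"] <= 0).sum()
if num_zero_acc > 0:
    # Without this clamp, np.log(0) would produce -inf and break downstream plots.
    df.loc[df["accuracy"] <= 0, "accuracy"] = epsilon

df["log_accuracy"] = np.log(df["accuracy"])
print(df)
```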
pages/long_context.py
CHANGED

@@ -39,9 +39,16 @@ def show():
     - 8K: 8,000 tokens
     - 16K: 16,000 tokens
     - 32K: 32,000 tokens
+
+    **Colors**:
+    - Yellow: reasoning model
+    - Green: linear attention hybrid model
+    - Blue: SSM hybrid model
 
     **Benchmark Details**:
-    - Evaluated on Symbolic, Medium, and Hard subtasks
-    - AUC
-    -
+    - Evaluated on Symbolic, Medium, and Hard subtasks.
+    - Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
+    - AUC is calculated using np.trapz function.
+    - AUC scores aggregated across context lengths.
+    - Larger context evaluations limited by compute constraints and model performance.
     """)
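The added details say AUC is computed with np.trapz over accuracy across context lengths. A rough sketch of that aggregation with placeholder numbers; whether the app normalizes by the context-length range is not shown in the diff, so the normalization below is purely illustrative:

```python
import numpy as np

# Hypothetical accuracies of one model at the evaluated context lengths.
context_lengths = np.array([8_000, 16_000, 32_000])
accuracy = np.array([0.82, 0.74, 0.61])

# np.trapz was renamed to np.trapezoid in NumPy 2.0; fall back gracefully.
trapz = getattr(np, "trapezoid", None) or np.trapz

# Trapezoidal area under the accuracy-vs-context-length curve,
# normalized by the spanned range so the score stays in [0, 1].
auc = trapz(accuracy, x=context_lengths)
normalized_auc = auc / (context_lengths[-1] - context_lengths[0])
print(f"AUC: {auc:.1f}, normalized: {normalized_auc:.3f}")
```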
pages/{zero_context.py → zero_noise.py}
RENAMED

@@ -40,8 +40,13 @@ def show():
 
     # You can leave your explanation/description below
     st.markdown("""
+    **Colors**:
+    - Yellow: reasoning model
+    - Green: linear attention hybrid model
+    - Blue: SSM-hybrid model
+
     **Benchmark Details**:
     - Evaluated on Symbolic, Medium, and Hard subtasks.
-    - Area Under Curve Metrics is Used to Compare between LLM Performance.
+    - Area Under Curve(AUC) Metrics is Used to Compare between LLM Performance.
     - AUC is calculated using np.trapz function.
     """)