Update
- app.py +120 -30
- metrics.py +79 -5
- puzzles_cleaned.csv +2 -2
app.py
CHANGED
@@ -19,7 +19,10 @@ app that displays the following:
 Note that not every model has a response for every puzzle.
 """
 import gradio as gr
-
+import pandas as pd
+import numpy as np
+from metrics import load_results, accuracy_by_model_and_time
+import metrics


 def get_model_response(prompt_id, model_name):
@@ -39,12 +42,18 @@ def display_puzzle(puzzle_id):
     puzzle = conn.sql(query).fetchone()
     return puzzle if puzzle else (None, None,None, None, None)

-def display_model_response(puzzle_id, model_name):
+def display_model_response(puzzle_id, model_name, show_thoughts):
     response = get_model_response(puzzle_id, model_name)
+    if response is None:
+        return "No response from this model."
     split_thoughts = response.split("</think>")
     if len(split_thoughts) > 1:
-
-
+        if show_thoughts:
+            return response.strip()
+        else:
+            return split_thoughts[-1].strip()
+    else:
+        return response.strip()


 conn = load_results()
@@ -108,36 +117,117 @@ model_columns = {
 valid_model_indices = list(model_columns.keys())
 default_model = model_columns[valid_model_indices[0]]

-def
-
-
-
-
-
-
-
-
-
-
-
-
+def summary_view():
+    accuracy_over_time = accuracy_by_model_and_time(conn).to_df()
+    accuracy_over_time["model"] = accuracy_over_time["model"].apply(lambda x: x.replace("completions-", ""))
+    # This hack so that Gradio doesn't render a year 2020 as "2,020.0".
+    accuracy_over_time["year"] = accuracy_over_time["year"].astype(str)
+    accuracy_over_time.rename(columns={"model": "Model", "year": "Year", "accuracy": "Accuracy"}, inplace=True)
+    gr.LinePlot(
+        accuracy_over_time,
+        x="Year",
+        y="Accuracy",
+        color="Model",
+        title="Model Accuracy Over Time",
+        y_lim=[0, 1],
+        x_label="Year",
+        y_label="Accuracy",
+    )
+
+
+def r1_accuracy_by_completion_length():
+    r1_completions = metrics.r1_accuracy_by_completion_length(conn).to_df()
+    r1_completions["length"] = r1_completions["length"] / 3.2
+    r1_completions.rename(columns={"length": "Response Length", "cumulative_correct": "Cumulative Correct"}, inplace=True)
+
+    gr.LinePlot(
+        r1_completions,
+        x="Response Length",
+        y="Cumulative Correct",
+        title="R1 Accuracy by Completion Length",
+        x_label="Max Response Length (tokens)",
+        y_label="# Correct Answers",
+        x_lim=[0, 32_768],
+    )
+
+def all_challenges_view():
+    # Using "markdown" as the datatype makes Gradio interpret newlines.
+    puzzle_list = gr.DataFrame(
+        value=relabelled_df,
+        datatype=["number", "str", "markdown", *["str"] * len(model_correct_columns)],
+        # headers=["ID", "Challenge", "Answer", *cleaned_model_names],
+    )
+    with gr.Row(scale=2):
+        model_name = gr.State(value=default_model)
+        challenge_id = gr.State(value=0)
+        show_thoughts = gr.State(value=False)
+        with gr.Column():
+            challenge = gr.Textbox(label="Challenge", interactive=False)
+            answer = gr.Textbox(label="Answer", interactive=False)
+            explanation = gr.Textbox(label="Explanation", interactive=False)
+            editors_note = gr.Textbox(label="Editor's Note", interactive=False)
+        with gr.Column():
+            gr.Checkbox(
+                label="Show Thoughts", value=False
+            ).change(
+                fn=lambda x: x, inputs=[show_thoughts], outputs=[show_thoughts]
+            )
+            model_response = gr.Textbox(label="Model Response", interactive=False)
             transcript = gr.Textbox(label="Transcript", interactive=False)
-
-
-
-
-
-
-
-
-
-
-
-
+
+    def select_table_item(evt: gr.SelectData):
+        model_index = evt.index[1]
+        challenge_id = evt.index[0]
+        model_name = model_columns[model_index] if model_index in valid_model_indices else default_model
+        return (model_name, challenge_id)
+
+    def update_puzzle(challenge_id: str, model_name: str, show_thoughts: bool):
+        return (*display_puzzle(challenge_id),
+                gr.Textbox(
+                    value=display_model_response(challenge_id, model_name, show_thoughts),
+                    label=model_name
+                ))
+
+    puzzle_list.select(
+        fn=select_table_item,
+        inputs=[],
+        outputs=[model_name, challenge_id]
+    )
+
+    model_name.change(
+        fn=update_puzzle,
+        inputs=[challenge_id, model_name, show_thoughts],
+        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
+    )
+
+    challenge_id.change(
+        fn=update_puzzle,
+        inputs=[challenge_id, model_name, show_thoughts],
+        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
+    )
+
+    show_thoughts.change(
+        fn=update_puzzle,
+        inputs=[challenge_id, model_name, show_thoughts],
+        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
+    )
+

-demo.launch()


+def create_interface():
+    with gr.Blocks() as demo:
+        with gr.Tabs():
+            with gr.TabItem("All Challenges"):
+                all_challenges_view()
+            with gr.TabItem("Accuracy by Model"):
+                gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
+            with gr.TabItem("Accuracy Over Time"):
+                summary_view()
+            with gr.TabItem("DeepSeek R1 Analysis"):
+                r1_accuracy_by_completion_length()
+    demo.launch()
+
 if __name__ == "__main__":
     create_interface()

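The all_challenges_view added above uses a common Gradio wiring pattern: selecting a cell in the DataFrame writes into gr.State holders, and .change listeners on those holders call a single update function that refreshes the output widgets. Below is a minimal, self-contained sketch of that pattern, not code from this Space; the toy table and names are invented for illustration, and it assumes a Gradio version in which gr.State exposes a .change event (the diff itself relies on this).

import gradio as gr
import pandas as pd

# Toy stand-in for the puzzles table (hypothetical data).
df = pd.DataFrame({"ID": [0, 1], "Challenge": ["2 + 2?", "3 * 3?"], "Answer": ["4", "9"]})

with gr.Blocks() as demo:
    table = gr.DataFrame(value=df)
    selected_row = gr.State(value=0)  # plays the role of challenge_id in the diff
    answer_box = gr.Textbox(label="Answer", interactive=False)

    def on_select(evt: gr.SelectData):
        # evt.index is (row, column); the row picks the challenge.
        return evt.index[0]

    # Selecting a cell updates the State; the State change re-renders the Textbox.
    table.select(fn=on_select, inputs=[], outputs=[selected_row])
    selected_row.change(fn=lambda i: df.loc[i, "Answer"], inputs=[selected_row], outputs=[answer_box])

if __name__ == "__main__":
    demo.launch()

Routing everything through State gives the diff a single update path (update_puzzle) no matter whether the model column, the challenge row, or the show-thoughts toggle changed.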
metrics.py
CHANGED
@@ -2,6 +2,7 @@ import re
 import duckdb
 import textwrap
 from typing import List, Tuple
+import argparse

 def _parse_answer(text: str) -> List[List[str]]:
     """
@@ -55,15 +56,81 @@ def _wrap_text(text: str, width: int) -> str:

 def load_results():
     conn = duckdb.connect(":memory:")
-    conn.execute("ATTACH DATABASE 'results.duckdb' AS results")
+    conn.execute("ATTACH DATABASE 'results.duckdb' AS results (READ_ONLY)")
     conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
     conn.create_function("check_answer", _check_answer)
     conn.create_function("clip_text", _clip_text)
     conn.create_function("wrap_text", _wrap_text)
     return conn

-def accuracy_by_model(conn):
+def r1_accuracy_by_completion_length(conn):
+    """
+    For the responses from the completions-r1 model:
+    1. We calculate completion length and correctness for each problem.
+    2. We sort by length.
+    3. We compute cumulative number of correct responses.
+    """
+    # Use CTEs
+    r1_completions = conn.sql("""
+    WITH LengthsAndCorrectness AS (
+        SELECT
+            LENGTH(results.completion) AS length,
+            CAST(check_answer(results.completion, challenges.answer) AS INT32) AS correct
+        FROM results.completions results JOIN challenges
+        ON results.prompt_id = challenges.ID
+        WHERE results.parent_dir = 'completions-r1'
+    )
+    SELECT
+        length,
+        COUNT(*) OVER (ORDER BY length) AS cumulative_correct
+    FROM LengthsAndCorrectness
+    """)
+    return r1_completions
+
+
+def accuracy_by_model_and_time(conn):
     model_accuracies = conn.sql("""
+    WITH ChallengesWithDates AS (
+        SELECT
+            ID,
+            answer,
+            EXTRACT(YEAR FROM CAST(date AS DATE)) AS year
+        FROM
+            challenges
+    ),
+    DateAnswerCheck AS (
+        SELECT
+            results.parent_dir AS model,
+            dates.year,
+            COUNT(*) AS total,
+            SUM(CAST(check_answer(results.completion, dates.answer) AS INTEGER)) AS correct
+        FROM
+            results.completions results
+        JOIN
+            ChallengesWithDates dates
+        ON
+            results.prompt_id = dates.ID
+        GROUP BY
+            results.parent_dir,
+            dates.year
+    )
+    SELECT
+        model,
+        year,
+        total,
+        correct,
+        ROUND(correct / total, 2) AS accuracy
+    FROM
+        DateAnswerCheck
+    ORDER BY
+        model,
+        year
+    """)
+
+    return model_accuracies
+
+def accuracy_by_model(conn):
+    return conn.sql("""
     WITH AnswerCheck AS (
         SELECT
             results.parent_dir AS model,
@@ -87,8 +154,15 @@ def accuracy_by_model(conn):
         AnswerCheck
     """)

-
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--by-model-and-time", action="store_true")
+    args = parser.parse_args()
+    conn = load_results()
+    if args.by_model_and_time:
+        print(accuracy_by_model_and_time(conn))
+    else:
+        print(accuracy_by_model(conn))

 if __name__ == "__main__":
-
-    accuracy_by_model(conn)
+    main()
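As a reading aid for the new r1_accuracy_by_completion_length: its docstring describes three steps (compute length and correctness per problem, sort by length, take a running total of correct answers), which produces a curve of how many problems are answered correctly within a given response-length budget. A toy pandas sketch of those three steps, with invented data, looks like this:

import pandas as pd

# Hypothetical (length, correct) pairs, one row per R1 completion.
rows = pd.DataFrame({
    "length": [120, 450, 300, 900],   # completion length
    "correct": [1, 0, 1, 1],          # 1 if check_answer accepted the completion
})

curve = rows.sort_values("length")                        # step 2: sort by length
curve["cumulative_correct"] = curve["correct"].cumsum()   # step 3: running count of correct answers
print(curve[["length", "cumulative_correct"]])

app.py then divides the length column by 3.2 before plotting, presumably to convert character counts into an approximate token count to match the "Max Response Length (tokens)" axis label.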
puzzles_cleaned.csv
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:257753179c4b2a5be8716ac03da2617c48d9037290cc39b4896ad55304e13337
+size 1119397