Spaces:

jannisborn
/

NumberTokenLoss

Running

App Files Community

jannisborn committed on Jun 9

Commit

be74e38

unverified ·

1 Parent(s): 0dc70d1

update

Browse files

Files changed (1) hide show

src/streamlit_app.py +425 -437

src/streamlit_app.py CHANGED Viewed

@@ -1,193 +1,392 @@
-import logging
-import time
 import altair as alt
-import numpy as np
 import pandas as pd
-import streamlit as st
 import streamlit_vertical_slider as svs
 import torch
-from scenarios import dirac, gauss, make_bimodal_scenarios
-logging.getLogger("streamlit.watcher.local_sources_watcher").setLevel(logging.ERROR)
-DEMO_INTERVAL = 1.5
-CE_SCALING = 0.25
-MAX_LOSS_PLOT = 6
-LAST_STEP = -1
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
-def compute_losses(probs: torch.Tensor, gt_token: str) -> tuple[float, float, float]:
-    """Compute CE, NTL-MAE, NTL-WAS losses for the given probability vector and ground truth token."""
-    ce_loss = CE_SCALING * -torch.log(
-        torch.clamp(probs[options.index(gt_token)], min=1e-9)
-    )
-    numeric_mass = probs[:10].sum()
-    if gt_token == "Text" or numeric_mass < 1e-6:
-        return ce_loss.item(), 0.0, 0.0
-    gt_numeric = int(gt_token)
-    token_vals = torch.arange(10, dtype=torch.float32)
-    mae = numeric_mass * abs(torch.dot(token_vals, probs[:10]) - gt_numeric)
-    was = numeric_mass * torch.dot(probs[:10], torch.abs(token_vals - gt_numeric))
-    return round(ce_loss.item(), 3), round(mae.item(), 3), round(was.item(), 3)
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
-if "running_demo" not in st.session_state:
     st.session_state.running_demo = False
-if "demo_step" not in st.session_state:
     st.session_state.demo_step = 0
-if "last_update_time" not in st.session_state:
     st.session_state.last_update_time = 0
-if "loss_container" not in st.session_state:
     st.session_state.loss_container = None
-if "previous_chart_html" not in st.session_state:
     st.session_state.previous_chart_html = ""
-if "active_scenarios" not in st.session_state:
-    # default if you want one to load on first show
-    st.session_state.active_scenarios = dirac
-if "loss_history" not in st.session_state:
-    st.session_state.loss_history = []
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
-        st.session_state[f"slider_{i}"] = 0
-if "ground_truth" not in st.session_state:
-    st.session_state["ground_truth"] = options[5]
-if "manual_ground_truth" not in st.session_state:
-    st.session_state["manual_ground_truth"] = options[5]
-if "demo_name" not in st.session_state:
-    st.session_state["demo_name"] = "Dirac"
-st.title("NTL -- The Number Token Loss 🚀")
-st.markdown(
-    """This is the interactive demo for our [ICML 2025](https://arxiv.org/abs/2411.02083) paper!🎉
-    ➡️ NTL augments cross-entropy to help LMs reason better with numbers 🧠
-    """
-)
-st.subheader("Demo 1 — NTL vs. Cross Entropy in 3 Scenarios")
 st.markdown("""
-1️⃣ Pick a ground truth token: a digit (0–9) or "Text" 📝 (simulates generic text tokens).
-2️⃣ Choose a demo:
-- **Dirac** ⚡: All probability mass on one token.
-- **Gaussian** 🌊: Soft bell-curve around the true number.
-- **Bimodal** 🎯: Two peaks moving away from the target.
-Watch how losses evolve as predictions get worse — and see how NTL shines compared to CE! 🌟
 """)
-if "ground_truth" not in st.session_state:
-    st.session_state["ground_truth"] = "4"
-gt = st.selectbox("Ground Truth Token", options=options, key="ground_truth")
 def apply_scenario(step_idx):
-    scenario = st.session_state.active_scenarios[step_idx]
     for i, val in enumerate(scenario["values"]):
         st.session_state[f"slider_{i}"] = val
-def start_dirac_demo():
-    st.session_state.loss_history = []
-    st.session_state.active_scenarios = dirac
-    st.session_state.demo_name = "Dirac"
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
-    apply_scenario(0)
-def start_gauss_demo():
-    st.session_state.loss_history = []
-    st.session_state.active_scenarios = gauss
-    st.session_state.demo_name = "Gauss"
-    st.session_state.running_demo = True
-    st.session_state.demo_step = 0
-    st.session_state.last_update_time = time.time()
-    apply_scenario(0)
-def start_bimodal_demo():
-    st.session_state.loss_history = []
-    gt = st.session_state["ground_truth"]
-    st.session_state.active_scenarios = make_bimodal_scenarios(gt, options)
-    st.session_state.demo_name = f"Bimodal (GT={gt})"
-    st.session_state.running_demo = True
-    st.session_state.demo_step = 0
-    st.session_state.last_update_time = time.time()
-    apply_scenario(0)
 def stop_demo():
     st.session_state.running_demo = False
 # --- Demo State Advancement Logic ---
 # This block handles advancing the demo. If it advances, it updates session state
 # and then reruns. This ensures widgets are drawn with the new state in the next run.
 if st.session_state.running_demo:
-    scenario = st.session_state.active_scenarios
     current_time = time.time()
-    if current_time - st.session_state.last_update_time > DEMO_INTERVAL:
-        # if we haven’t yet shown the last scenario, advance
-        if st.session_state.demo_step < len(scenario) - 1:
-            st.session_state.demo_step += 1
-            apply_scenario(st.session_state.demo_step)
-            st.session_state.last_update_time = current_time
-            st.rerun()
-        else:
-            # we just displayed the final case → stop
-            st.session_state.running_demo = False
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
 if st.session_state.running_demo:
-    st.info(
-        f"Showing scenario {st.session_state.demo_step + 1}"
-        f"/{len(st.session_state.active_scenarios)}: "
-        f"{st.session_state.active_scenarios[st.session_state.demo_step]['name']}"
-    )
     if st.button("Stop Demo"):
-        st.session_state.running_demo = False
         st.rerun()
-else:
-    col1, col2, col3 = st.columns(3)
-    with col1:
-        if st.button("Run: Dirac"):
-            start_dirac_demo()
-            st.rerun()
-    with col2:
-        if st.button("Run: Gauss"):
-            start_gauss_demo()
-            st.rerun()
-    with col3:
-        if st.button("Run: Bimodal"):
-            start_bimodal_demo()
-            st.rerun()
-current_prob_values_from_state = [
-    st.session_state.get(f"slider_{j}", 0)
-    for j in range(len(options))  # 1.0 / len(options)) for j in range(len(options))
-]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
     torch.ones(len(options)) / len(options)
@@ -195,322 +394,112 @@ probs_for_charts = (
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
-# Use manual GT token when not in running demo
-gt_choice_for_charts = (
-    st.session_state["manual_ground_truth"]
-    if not st.session_state.running_demo
-    else st.session_state["ground_truth"]
-)
 if gt_choice_for_charts == "Text":
-    gt_index_for_charts = 10  # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
 else:
     gt_index_for_charts = int(gt_choice_for_charts)
     gt_numeric_for_charts = gt_index_for_charts
-gt = st.session_state["ground_truth"]
-demo_name = st.session_state["demo_name"]
-st.markdown(f"#### Predicted distribution — ground truth: {gt}")
-df_dist = pd.DataFrame(
-    {"token": options, "probability": probs_for_charts.numpy().round(2)}
-)
-df_dist["type"] = [
-    "Ground Truth" if token == gt_choice_for_charts else "Prediction"
-    for token in options
-]
-bars = (
-    alt.Chart(df_dist)
-    .mark_bar(color="dodgerblue", size=40)
-    .encode(
-        x=alt.X(
-            "token:N",
-            title="Token",
-            sort=options,
-            axis=alt.Axis(
-                labelAngle=0,
-                labelFontSize=14,
-                titleFontSize=16,
-                labelAlign="center",
-                labelFlush=False,
-            ),
-        ),
-        y=alt.Y(
-            "probability:Q",
-            title="Probability",
-            scale=alt.Scale(domain=[0, 1]),
-            axis=alt.Axis(format=".2f", labelFontSize=14, titleFontSize=16),
-        ),
-        tooltip=[
-            alt.Tooltip("token:N", title="Token"),
-            alt.Tooltip("probability:Q", title="Predicted Prob.", format=".2f"),
-        ],
-    )
-)
-bg_bar = pd.DataFrame({"token": [gt], "height": [1.0]})
-gt_bar = (
-    alt.Chart(bg_bar)
-    .mark_bar(
-        color="darkgreen",
-        size=20,
-        opacity=0.3,
-        stroke="gray",
-        strokeWidth=2,
-        strokeDash=[4, 4],
-    )
-    .encode(
-        x=alt.X("token:N", sort=options),
-        y=alt.Y("height:Q", scale=alt.Scale(domain=[0, 1])),
-        tooltip=[
-            alt.Tooltip("token:N", title="Ground Truth"),
-            alt.Tooltip("height:Q", title="Desired mass", format=".2f"),
-        ],
-    )
 )
-annot1 = (
-    alt.Chart(pd.DataFrame({"token": [gt]}))
-    .mark_text(
-        text="⬇ Ground",
-        dy=-25,  # 10px above the top of the bar
-        dx=25,
-        fontSize=14,
-        fontWeight="bold",
-        color="darkgreen",
-    )
-    .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
-)
-annot2 = (
-    alt.Chart(pd.DataFrame({"token": [gt]}))
-    .mark_text(
-        text=f"truth={gt}",
-        dy=-10,  # 25px above the top, so it sits above line 1
-        dx=35,
-        fontSize=14,
-        fontWeight="bold",
-        color="darkgreen",
-    )
-    .encode(x=alt.X("token:N", sort=options), y=alt.value(1))
-)
-# 4) Layer them in order: background, bars, annotation
-final_chart = (gt_bar + bars + annot1 + annot2).properties(height=200)
-st.altair_chart(final_chart, use_container_width=True)
-ce_val, mae_val, was_val = compute_losses(probs_for_charts, gt_choice_for_charts)
-if (
-    st.session_state.running_demo
-    and len(st.session_state.loss_history) < st.session_state.demo_step + 1
-):
-    step = st.session_state.demo_step
-    scenario = st.session_state.active_scenarios[step]
-    ce, mae, was = compute_losses(probs_for_charts, gt_choice_for_charts)
-    # pick x_val differently for bimodal vs others
-    if st.session_state.demo_name.startswith("Bimodal"):
-        x_val = scenario["name"]  # e.g. "(4,4)", "(3,5)", …
-    else:
-        # exactly like before:
-        best_idx = np.argmax(scenario["values"])
-        x_val = options[best_idx]  # "0", "1", …, or "Text"
-    st.session_state.loss_history.append(
-        {
-            "step": step,
-            "x_val": x_val,
-            "Cross Entropy": ce,
-            "NTL-MAE": mae,
-            "NTL-WAS": was,
-        }
-    )
-#  1) build a raw DF from histories
-df = pd.DataFrame(st.session_state.loss_history)
-if df.empty:
-    # define an empty "melted" DataFrame with the right columns
-    df_loss_plot = pd.DataFrame(columns=["step", "x_val", "Loss Type", "Loss Value"])
-else:
-    # now it's safe to melt
-    df_loss_plot = df.melt(
-        id_vars=["step", "x_val"],
-        value_vars=["Cross Entropy", "NTL-MAE", "NTL-WAS"],
-        var_name="Loss Type",
-        value_name="Loss Value",
-    )
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
     loss_data["Value"].append(was_val)
-if mae_val != "N/A":
-    loss_data["Loss"].append("NTL-MAE")
-    loss_data["Value"].append(mae_val)
 loss_df = pd.DataFrame(loss_data)
-if st.session_state.demo_name.startswith("Bimodal"):
-    domain = [sc["name"] for sc in st.session_state.active_scenarios]
-    x_title = f"Offset from GT {st.session_state['ground_truth']}"
-else:
-    domain = options
-    x_title = f"Maximum of predicted {st.session_state['demo_name']} distribution"
 # ============== Chart Display ==============
-st.markdown("#### Loss as a function of predicted distribution")
-grouped_chart = (
-    alt.Chart(df_loss_plot)
-    .mark_bar()
-    .encode(
-        x=alt.X(
-            "x_val:N",
-            title=x_title,
-            sort=domain,
-            scale=alt.Scale(domain=domain),
-            axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=16),
-        ),
-        y=alt.Y(
-            "Loss Value:Q",
-            title="Loss Value",
-            scale=alt.Scale(domain=[0, MAX_LOSS_PLOT], nice=False, clamp=True),
-            axis=alt.Axis(labelFontSize=14, titleFontSize=16),
-        ),
-        color=alt.Color(
-            "Loss Type:N",
-            scale=alt.Scale(
-                domain=["Cross Entropy", "NTL-WAS", "NTL-MAE"],
-                range=["red", "limegreen", "blueviolet"],
-            ),
-            legend=alt.Legend(
-                title="",
-                orient="top",
-                direction="horizontal",
-                columns=3,
-            ),
-        ),
-        xOffset="Loss Type:N",  # grouped bars
-        tooltip=[
-            alt.Tooltip("x_val:N", title="Scenario"),
-            alt.Tooltip("Loss Type:N", title="Loss Type"),
-            alt.Tooltip("Loss Value:Q", title="Value", format=".3f"),
-        ],
-    )
-    .properties(height=250)
-)
-st.altair_chart(grouped_chart, use_container_width=True)
 # Create a single chart for loss visualization
-if not st.session_state.running_demo:
-    for i in range(len(options)):
-        st.session_state[f"slider_{i}"] = 0.0
-    st.session_state.demo_step = 0
-    st.subheader("Demo 2 -- Manual loss comparison")
-    st.subheader("🧪 Demo 2 — Craft your own distribution")
-    st.markdown("""
-    This demo gives you more control but is harder to interpret. See it as a playground! 🎨
-    Manually adjust the sliders to change the predicted probabilities for each token.
-    The demo normalizes the values to form a valid probability distribution and calculates the losses.
-    👣 **Steps:**
-    - Use the **vertical sliders** to allocate probability to each token.
-    - Choose the correct **Ground Truth Token** (0–9 or "Text" 📜).
-    - Observe how each loss function reacts.
-    💡 **Tip:** Want to trick the loss? Try putting all mass on the wrong token or spread it wildly. See how NTL handles it! 😈
-    """)
-    manual_gt = st.selectbox(
-        "Ground Truth Token",
-        options=options,
-        key="manual_ground_truth",
-    )
-    loss_df = pd.DataFrame(
-        {
-            "Loss": ["Cross Entropy", "NTL-MAE", "NTL-WAS"],
-            "Value": [ce_val, mae_val, was_val],
-        }
-    )
-    # Sliders and Ground Truth Selector
-    # These widgets will read their initial values from st.session_state.
-    # User interactions will update st.session_state directly due to their keys.
-    st.markdown("#### Adjust the predicted token probability")
-    cols = st.columns(len(options))
-    for i, col in enumerate(cols):
-        label = options[i]  # Use token name directly for label
-        with col:
-            svs.vertical_slider(
-                label=label,
-                min_value=0.0,
-                max_value=1.0,
-                step=0.01,
-                height=50,
-                key=f"slider_{i}",
-                slider_color="green",
-                track_color="lightgray",
-                thumb_color="black",
-            )
-    chart = (
-        alt.Chart(loss_df)
-        .mark_bar()
-        .encode(
-            x=alt.X("Loss:N", sort=loss_df["Loss"].tolist()),
-            y=alt.Y(
-                "Value:Q",
-                scale=alt.Scale(
-                    domain=[
-                        0,
-                        max(
-                            loss_df["Value"].max() * 1.2,
-                            20 if st.session_state.running_demo else 0.5,
-                        ),
-                    ]
-                ),
-            ),
-            color=alt.Color(
-                "Loss:N",
-                scale=alt.Scale(
-                    domain=["Cross Entropy", "NTL-WAS", "NTL-MAE"],
-                    range=["orangered", "limegreen", "blueviolet"],
-                ),
-            ),
-            tooltip=["Loss", "Value"],
-        )
-        .properties(height=300)
-    )
-    text = chart.mark_text(
-        align="center", baseline="bottom", dy=-5, fontSize=14
-    ).encode(text=alt.Text("Value:Q", format=".3f"))
-    final_chart = chart + text
-    st.altair_chart(final_chart, use_container_width=True)
-# # Add value labels on top of bars
-# text = chart.mark_text(align="center", baseline="bottom", dy=-5, fontSize=14).encode(
-#     text=alt.Text("Value:Q", format=".3f")
-# )
-# # Combine chart and text
-# final_chart = chart + text
 # Display chart with the full container width
-# st.altair_chart(final_chart, use_container_width=True)
 # --- Polling Rerun for Demo Mode ---
 # If the demo is running and we haven't just advanced (which would have caused a rerun),
@@ -518,21 +507,20 @@ if not st.session_state.running_demo:
 if st.session_state.running_demo:
     # This check is implicitly: if we are here and demo is running, it means
     # the time-based advance condition was NOT met in the block at the top.
-    time.sleep(0.1)
     st.rerun()
 st.markdown("""
-### 🤔 TL;DR — Why NTL?
-Cross Entropy only cares if the prediction is exactly right or wrong ❌✅ — it doesn’t care *how close* a guess is!
-That’s bad for LLMs doing math and numeric reasoning 🧮.
-💥 NTL fixes that: it behaves like a regression loss on the token head, rewarding predictions that are numerically close.
 """)
-st.markdown("#### 📚 Further Resources")
 st.markdown("""
-- 📄 [ICML 2025 Paper](https://arxiv.org/abs/2411.02083)
-- 🌐 [NTL Landing Page](https://tum-ai.github.io/number-token-loss/)
-- 💻 [GitHub Code](https://github.com/tum-ai/number-token-loss)
 """)

 import altair as alt
 import pandas as pd
 import streamlit_vertical_slider as svs
 import torch
+# from streamlit_vertical_slider import vertical_slider # Not directly used, svs.vertical_slider is
+import streamlit as st
+import time
+import plotly.graph_objects as go  # Add Plotly import
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
+if 'running_demo' not in st.session_state:
     st.session_state.running_demo = False
+if 'demo_step' not in st.session_state:
     st.session_state.demo_step = 0
+if 'last_update_time' not in st.session_state:
     st.session_state.last_update_time = 0
+if 'loss_container' not in st.session_state:
     st.session_state.loss_container = None
+if 'previous_chart_html' not in st.session_state:
     st.session_state.previous_chart_html = ""
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
+        st.session_state[f"slider_{i}"] = 1.0 / len(options)
+if 'ground_truth' not in st.session_state:
+    st.session_state['ground_truth'] = options[0] # Default to "0"
+st.title("Number Token Loss - Demo")
 st.markdown("""
+Adjust the sliders to set a predicted probability for each token (0-9 and "Text").
+The sliders are vertical and compact. The app normalizes the slider values
+to form a valid probability distribution, visualizes it, and computes the corresponding
+Cross Entropy, NTL-MSE, and NTL-WAS losses.
 """)
+# --- Scenario Definitions ---
+scenarios = [
+    {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "0",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "1",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "2",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "3",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "4",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "5",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "6",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "7",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "8",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+     {
+        "name": "Probability mass at 0",
+        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "9",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "0",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "1",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "2",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "3",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "4",
+        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
+    },
+    {
+        "name": "Probability mass around ground truth (5)",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "5",
+        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "6",
+        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "7",
+        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "8",
+        "explanation": "Cross Entropy is high, NTL is higher but still penalizes less than CE because distribution knows it's a number."
+    },
+    {
+        "name": "Probability mass around 5",
+        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
+        "ground_truth": "9",
+        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "0",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "1",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "2",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "3",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "4",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "5",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "6",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "7",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "8",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 5",
+        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
+        "ground_truth": "9",
+        "explanation": "Both CE and NTL are high because the prediction is far from correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "0",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "1",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "2",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "3",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "4",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "5",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "6",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "7",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "8",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Probability mass concentrated on 1",
+        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
+        "ground_truth": "9",
+        "explanation": "Both losses are low because the prediction is correct."
+    },
+    {
+        "name": "Almost correct (1 vs 2)",
+        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
+        "ground_truth": "0",
+        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
+    },
+    {
+        "name": "Almost correct (1 vs 2)",
+        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
+        "ground_truth": "1",
+        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
+    },
+    {
+        "name": "Almost correct (1 vs 2)",
+        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
+        "ground_truth": "2",
+        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
+    },
+    {
+        "name": "Almost correct (1 vs 2)",
+        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
+        "ground_truth": "3",
+        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
+    }
+]
+# --- Helper Functions ---
 # Load the scenario at index `step_idx` into Streamlit session state: every
 # entry of the scenario's "values" list is written to the matching
 # `slider_{i}` key and its "ground_truth" is written to the 'ground_truth' key.
 # Raises IndexError if `step_idx` is outside `scenarios`.
 def apply_scenario(step_idx):
+    scenario = scenarios[step_idx]
+    # These assignments modify session state. They must be done *before* the widgets
+    # are rendered in the script run that should display these new values.
     for i, val in enumerate(scenario["values"]):
         st.session_state[f"slider_{i}"] = val
+    st.session_state['ground_truth'] = scenario["ground_truth"]
+def start_demo():
     # Enter demo mode: mark the demo as running, reset the step counter to 0,
     # restart the per-scenario timer, and load scenario 0 into session state.
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
+    apply_scenario(0) # Apply the first scenario's state
+    # The button click that calls start_demo() will itself cause a rerun.
 def stop_demo():
     # Leave demo mode. Only the running flag is cleared; demo_step and the
     # slider / ground-truth values written by the last scenario are left untouched.
     st.session_state.running_demo = False
 # --- Demo State Advancement Logic ---
 # This block handles advancing the demo. If it advances, it updates session state
 # and then reruns. This ensures widgets are drawn with the new state in the next run.
 # When the interval has not elapsed yet, nothing is changed here and the script
 # simply continues with the UI rendering below.
 if st.session_state.running_demo:
     current_time = time.time()
+    if current_time - st.session_state.last_update_time > 3.0:  # 3 seconds per scenario
         # The modulo wraps around, so the demo loops back to scenario 0 after the last one.
+        next_step = (st.session_state.demo_step + 1) % len(scenarios)
+        st.session_state.demo_step = next_step
+        apply_scenario(next_step)  # Update session state for the new scenario
+        st.session_state.last_update_time = time.time() # Reset timer
+        st.rerun()  # Crucial: Rerun to reflect changes in widgets and charts
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
 # In demo mode it shows the current scenario's position, name and explanation plus a
 # "Stop Demo" button; otherwise it shows only the "Start Automated Demo" button.
 if st.session_state.running_demo:
     # demo_step is 0-based in session state; it is displayed 1-based.
+    st.info(f"Showing scenario {st.session_state.demo_step + 1}/{len(scenarios)}: {scenarios[st.session_state.demo_step]['name']}")
+    st.markdown(f"**Explanation:** {scenarios[st.session_state.demo_step]['explanation']}")
     if st.button("Stop Demo"):
+        stop_demo()
         st.rerun()
+else: # Not st.session_state.running_demo
+    if st.button("Start Automated Demo"):
+        start_demo() # This calls apply_scenario(0)
+        st.rerun()   # Rerun to enter demo mode and draw scenario 0 correctly
+# Sliders and Ground Truth Selector
+# These widgets will read their initial values from st.session_state.
+# User interactions will update st.session_state directly due to their keys.
 # The sliders are only drawn while the demo is NOT running; during the demo the
 # slider_{i} values come from apply_scenario() and are shown only via the charts.
+if not st.session_state.running_demo:
+    st.markdown("#### Predicted Token Probabilities")
     # One column (and one vertical slider) per entry of `options`.
+    cols = st.columns(len(options))
+    for i, col in enumerate(cols):
+        label = options[i] # Use token name directly for label
+        with col:
+            svs.vertical_slider(
+                label=label, min_value=0.0, max_value=1.0, step=0.01, height=50,
+                key=f"slider_{i}", # This key links the widget to st.session_state[f"slider_{i}"]
+                slider_color="green", track_color="lightgray", thumb_color="black"
+            )
+# Ground truth selectbox
 # Rendered in both modes. st.session_state['ground_truth'] is indexed directly, so the
 # key must already exist -- presumably set in the session-state initialization earlier
 # in the file; confirm.
 # NOTE(review): both `index=` and `key=` are passed while the same key is also written
 # through session state; Streamlit may warn about this combination -- confirm.
+st.selectbox(
+    "Ground Truth Token", options=options,
+    index=options.index(st.session_state['ground_truth']), # Display value from session state
+    key='ground_truth' # Links widget to st.session_state['ground_truth']
+)
+# Placeholder for charts and loss calculations that will be updated
+# This section always reads the current st.session_state to generate its content.
 # Collect the raw slider values (falling back to a uniform 1/len(options) when a
 # slider key is missing) and normalize them so they sum to 1.
+current_prob_values_from_state = [st.session_state.get(f"slider_{j}", 1.0/len(options)) for j in range(len(options))]
 total_from_state = sum(current_prob_values_from_state)
 # NOTE(review): the `if <condition>` clause of this conditional expression is not
 # visible in this view -- presumably `if total_from_state == 0` (uniform fallback
 # that avoids dividing by zero); confirm against the full file.
 probs_for_charts = (
     torch.ones(len(options)) / len(options)
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
 # Resolve the ground-truth token into an index into `probs_for_charts` and, for
 # digit tokens, its numeric value (None for "Text").
+gt_choice_for_charts = st.session_state.get('ground_truth', options[0])
 if gt_choice_for_charts == "Text":
+    gt_index_for_charts = 10 # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
 else:
     # Digit tokens are the strings "0".."9", so the token value doubles as its index.
     gt_index_for_charts = int(gt_choice_for_charts)
     gt_numeric_for_charts = gt_index_for_charts
+st.markdown("#### Input Probability Distribution")
 # Bar chart of the normalized probabilities, one bar per token; the bar of the
 # ground-truth token gets the "Ground Truth" type (green), all others "Prediction" (steelblue).
+df_dist = pd.DataFrame({"token": options, "probability": probs_for_charts.numpy()})
+df_dist["type"] = ["Ground Truth" if token == gt_choice_for_charts else "Prediction" for token in options]
+chart = (
+    alt.Chart(df_dist).mark_bar().encode(
+        x=alt.X("token:N", title="Token", sort=options), # Ensure consistent sort order
         # Fixed [0, 1] y-domain so bar heights stay comparable between reruns.
+        y=alt.Y("probability:Q", title="Probability", scale=alt.Scale(domain=[0, 1])),
+        color=alt.Color("type:N", scale=alt.Scale(domain=["Ground Truth", "Prediction"], range=["green", "steelblue"]), legend=alt.Legend(title="Token Type"))
+    ).properties(height=300)
 )
+st.altair_chart(chart, use_container_width=True)
+ce_loss = -torch.log(torch.clamp(probs_for_charts[gt_index_for_charts], min=1e-9))
 # Cross entropy above: negative log of the probability assigned to the ground-truth
 # token, clamped at 1e-9 so a zero probability does not produce log(0).
 # Below: the two number-token losses, defined only for digit ground truths.
 #   NTL-MSE = (expected digit under the renormalized digit probabilities - gt) ** 2
 #   NTL-WAS = expected |digit - gt| under the same renormalized probabilities
+if gt_numeric_for_charts is None: # Text token
+    ntl_mse_loss = torch.tensor(float('nan')) # MSE not applicable for text
+    ntl_was_loss = torch.tensor(float('nan')) # WAS not applicable for text
+else: # Numeric token
+    numeric_probs_for_loss = probs_for_charts[:10] # Probabilities for 0-9
+    # Ensure numeric_probs_for_loss sums to 1 for NTL calculations if it's a subset
+    numeric_probs_sum = torch.sum(numeric_probs_for_loss)
     # NOTE(review): normalized_numeric_probs is assigned here but not read anywhere in
     # the code visible below; the loss formulas renormalize probs_for_charts[:10] inline.
+    if numeric_probs_sum > 1e-6 : # Avoid division by zero
+            normalized_numeric_probs = numeric_probs_for_loss / numeric_probs_sum
+    else:
+            normalized_numeric_probs = torch.zeros_like(numeric_probs_for_loss)
     # Digit values 0..9 as floats, aligned with probs_for_charts[:10].
+    loss_values_tensor = torch.arange(0, 10, dtype=torch.float32)
+    # Use normalized probabilities for NTL if only considering numeric tokens
     # This branch is only entered when gt_numeric_for_charts is not None; if that is
     # None exactly when the choice is "Text" (as the comments here indicate), the
     # `!= "Text"` tests are always true and the trailing `else` arms are unreachable.
+    if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6 :
+        pred_value = torch.sum( (probs_for_charts[:10]/torch.sum(probs_for_charts[:10])) * loss_values_tensor)
+    elif gt_choice_for_charts != "Text": # if sum is zero, pred_value is ill-defined or 0
+            pred_value = torch.tensor(0.0)
+    else: # Should not happen if gt_numeric_for_charts is not None
+        pred_value = torch.tensor(float('nan'))
+    if not torch.isnan(pred_value):
+        ntl_mse_loss = (pred_value - float(gt_numeric_for_charts)) ** 2
         # Absolute distance of every digit 0..9 from the ground-truth digit.
+        abs_diff = torch.abs(loss_values_tensor - float(gt_numeric_for_charts))
+        if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
+                ntl_was_loss = torch.sum((probs_for_charts[:10]/torch.sum(probs_for_charts[:10])) * abs_diff)
+        elif gt_choice_for_charts != "Text":
+                ntl_was_loss = torch.tensor(0.0) # Or some other default if all numeric probs are zero
+        else:
+                ntl_was_loss = torch.tensor(float('nan'))
+    else:
+        ntl_mse_loss = torch.tensor(float('nan'))
+        ntl_was_loss = torch.tensor(float('nan'))
 # Convert to plain Python values rounded to 3 decimals; NaN losses become the
 # string "N/A" and are left out of the table that feeds the loss chart.
+ce_val = round(ce_loss.item(), 3)
+mse_val = round(ntl_mse_loss.item(), 3) if not torch.isnan(ntl_mse_loss) else "N/A"
+was_val = round(ntl_was_loss.item(), 3) if not torch.isnan(ntl_was_loss) else "N/A"
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
     loss_data["Value"].append(was_val)
+if mse_val != "N/A":
+    loss_data["Loss"].append("NTL-MSE")
+    loss_data["Value"].append(mse_val)
 loss_df = pd.DataFrame(loss_data)
 # ============== Chart Display ==============
 # Create a single chart for loss visualization
 # One bar per row of loss_df (Cross Entropy, plus NTL-WAS / NTL-MSE when present),
 # with the rounded value drawn as a text label above each bar.
+st.subheader("Loss Comparison")
+# Create an Altair chart that will look good and redraw cleanly
 # Note: this rebinds the name `chart` that was used for the distribution chart above.
+chart = alt.Chart(loss_df).mark_bar().encode(
     # Keep the bars in the order in which the losses were appended to loss_df.
+    x=alt.X('Loss:N', sort=loss_df["Loss"].tolist()),
     # Upper y-limit: 1.2x the largest loss, but never below 20 while the demo is
     # running (0.5 otherwise), so the axis does not rescale between demo scenarios.
+    y=alt.Y('Value:Q', scale=alt.Scale(domain=[0, max(loss_df["Value"].max() * 1.2, 20 if st.session_state.running_demo else 0.5)])),
+    color=alt.Color('Loss:N', scale=alt.Scale(
+        domain=['Cross Entropy', 'NTL-WAS', 'NTL-MSE'],
+        range=['steelblue', 'red', 'forestgreen']
+    )),
+    tooltip=['Loss', 'Value']
+).properties(
+    height=300
+)
+# Add value labels on top of bars
 # The text layer is derived from `chart`, so it reuses the chart's data and encodings.
+text = chart.mark_text(
+    align='center',
+    baseline='bottom',
+    dy=-5,
+    fontSize=14
+).encode(
+    text=alt.Text('Value:Q', format='.3f')
+)
+# Combine chart and text
+final_chart = (chart + text)
 # Display chart with the full container width
+st.altair_chart(final_chart, use_container_width=True)
 # --- Polling Rerun for Demo Mode ---
 # If the demo is running and we haven't just advanced (which would have caused a rerun),
 if st.session_state.running_demo:
     # This check is implicitly: if we are here and demo is running, it means
     # the time-based advance condition was NOT met in the block at the top.
+    time.sleep(0.1) # Adjusted from 0.2 to 0.5 (or try 1.0)
     st.rerun()
+# Add explanation of the demonstration
 st.markdown("""
+### What Does This Demo Show?
+- **Cross Entropy Loss**: Only cares if the prediction is exactly right or wrong - it doesn't consider how "close" a numerical prediction is.
+- **Number Token Loss (NTL)**: Considers numerical proximity - predicting "7" when the true value is "8" is better than predicting "2".
 """)
+# References / resources section with links (common to both modes)
+st.markdown("### Resources")
 st.markdown("""
+- [Paper: Number Token Loss (ArXiv)](https://arxiv.org/abs/2411.02083)
+- [GitHub: Number Token Loss](https://github.com/tum-ai/number-token-loss)
 """)