Spaces:

jannisborn
/

NumberTokenLoss

Sleeping

App Files Files Community

jannisborn committed on Jun 9

Commit

2854521

unverified ·

1 Parent(s): be74e38

update

Browse files

Files changed (1) hide show

src/streamlit_app.py +386 -426

src/streamlit_app.py CHANGED Viewed

@@ -1,392 +1,198 @@
 import altair as alt
 import pandas as pd
 import streamlit_vertical_slider as svs
 import torch
-# from streamlit_vertical_slider import vertical_slider # Not directly used, svs.vertical_slider is
-import streamlit as st
-import time
-import plotly.graph_objects as go  # Add Plotly import
 # Define options globally as it's used in initialization and UI
 options = [str(i) for i in range(10)] + ["Text"]
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
-if 'running_demo' not in st.session_state:
     st.session_state.running_demo = False
-if 'demo_step' not in st.session_state:
     st.session_state.demo_step = 0
-if 'last_update_time' not in st.session_state:
     st.session_state.last_update_time = 0
-if 'loss_container' not in st.session_state:
     st.session_state.loss_container = None
-if 'previous_chart_html' not in st.session_state:
     st.session_state.previous_chart_html = ""
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
-        st.session_state[f"slider_{i}"] = 1.0 / len(options)
-if 'ground_truth' not in st.session_state:
-    st.session_state['ground_truth'] = options[0] # Default to "0"
-st.title("Number Token Loss - Demo")
 st.markdown("""
-Adjust the sliders to set a predicted probability for each token (0-9 and "Text").
-The sliders are vertical and compact. The app normalizes the slider values
-to form a valid probability distribution, visualizes it, and computes the corresponding
-Cross Entropy, NTL-MSE, and NTL-WAS losses.
 """)
-# --- Scenario Definitions ---
-scenarios = [
-    {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-     {
-        "name": "Probability mass at 0",
-        "values": [0.3, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Cross Entropy does not penalize if the prediction is far from the ground truth."
-    },
-    {
-        "name": "Probability mass around ground truth (5)",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Cross Entropy is high, NTL is higher but still penalizes less than CE because distribution knows it's a number."
-    },
-    {
-        "name": "Probability mass around 5",
-        "values": [0.05, 0.05, 0.05, 0.1, 0.2, 0.3, 0.15, 0.05, 0.03, 0.02, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Cross Entropy is moderate, NTL is low because predictions are close to ground truth."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 5",
-        "values": [0.05, 0.05, 0.05, 0.05, 0.05, 0.3, 0.2, 0.15, 0.05, 0.05, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Both CE and NTL are high because the prediction is far from correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "4",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "5",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "6",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "7",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "8",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Probability mass concentrated on 1",
-        "values": [0.05, 0.7, 0.05, 0.05, 0.05, 0.02, 0.02, 0.02, 0.02, 0.02, 0.0], # 11 values
-        "ground_truth": "9",
-        "explanation": "Both losses are low because the prediction is correct."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "0",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "1",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "2",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    },
-    {
-        "name": "Almost correct (1 vs 2)",
-        "values": [0.1, 0.1, 0.7, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], # 11 values
-        "ground_truth": "3",
-        "explanation": "CE penalizes harshly, but NTL-WAS remains low because prediction is numerically close."
-    }
-]
-# --- Helper Functions ---
 def apply_scenario(step_idx):
-    scenario = scenarios[step_idx]
-    # These assignments modify session state. They must be done *before* the widgets
-    # are rendered in the script run that should display these new values.
     for i, val in enumerate(scenario["values"]):
         st.session_state[f"slider_{i}"] = val
-    st.session_state['ground_truth'] = scenario["ground_truth"]
-def start_demo():
     st.session_state.running_demo = True
     st.session_state.demo_step = 0
     st.session_state.last_update_time = time.time()
-    apply_scenario(0) # Apply the first scenario's state
-    # The button click that calls start_demo() will itself cause a rerun.
 def stop_demo():
     st.session_state.running_demo = False
 # --- Demo State Advancement Logic ---
 # This block handles advancing the demo. If it advances, it updates session state
 # and then reruns. This ensures widgets are drawn with the new state in the next run.
 if st.session_state.running_demo:
     current_time = time.time()
-    if current_time - st.session_state.last_update_time > 3.0:  # 3 seconds per scenario
-        next_step = (st.session_state.demo_step + 1) % len(scenarios)
-        st.session_state.demo_step = next_step
-        apply_scenario(next_step)  # Update session state for the new scenario
-        st.session_state.last_update_time = time.time() # Reset timer
-        st.rerun()  # Crucial: Rerun to reflect changes in widgets and charts
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
 if st.session_state.running_demo:
-    st.info(f"Showing scenario {st.session_state.demo_step + 1}/{len(scenarios)}: {scenarios[st.session_state.demo_step]['name']}")
-    st.markdown(f"**Explanation:** {scenarios[st.session_state.demo_step]['explanation']}")
     if st.button("Stop Demo"):
-        stop_demo()
         st.rerun()
-else: # Not st.session_state.running_demo
-    if st.button("Start Automated Demo"):
-        start_demo() # This calls apply_scenario(0)
-        st.rerun()   # Rerun to enter demo mode and draw scenario 0 correctly
-# Sliders and Ground Truth Selector
-# These widgets will read their initial values from st.session_state.
-# User interactions will update st.session_state directly due to their keys.
-if not st.session_state.running_demo:
-    st.markdown("#### Predicted Token Probabilities")
-    cols = st.columns(len(options))
-    for i, col in enumerate(cols):
-        label = options[i] # Use token name directly for label
-        with col:
-            svs.vertical_slider(
-                label=label, min_value=0.0, max_value=1.0, step=0.01, height=50,
-                key=f"slider_{i}", # This key links the widget to st.session_state[f"slider_{i}"]
-                slider_color="green", track_color="lightgray", thumb_color="black"
-            )
-# Ground truth selectbox
-st.selectbox(
-    "Ground Truth Token", options=options,
-    index=options.index(st.session_state['ground_truth']), # Display value from session state
-    key='ground_truth' # Links widget to st.session_state['ground_truth']
-)
-# Placeholder for charts and loss calculations that will be updated
-# This section always reads the current st.session_state to generate its content.
-current_prob_values_from_state = [st.session_state.get(f"slider_{j}", 1.0/len(options)) for j in range(len(options))]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
     torch.ones(len(options)) / len(options)
@@ -394,112 +200,265 @@ probs_for_charts = (
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
-gt_choice_for_charts = st.session_state.get('ground_truth', options[0])
 if gt_choice_for_charts == "Text":
-    gt_index_for_charts = 10 # Assuming "Text" is the 11th item (index 10)
     gt_numeric_for_charts = None
 else:
     gt_index_for_charts = int(gt_choice_for_charts)
     gt_numeric_for_charts = gt_index_for_charts
-st.markdown("#### Input Probability Distribution")
-df_dist = pd.DataFrame({"token": options, "probability": probs_for_charts.numpy()})
-df_dist["type"] = ["Ground Truth" if token == gt_choice_for_charts else "Prediction" for token in options]
-chart = (
-    alt.Chart(df_dist).mark_bar().encode(
-        x=alt.X("token:N", title="Token", sort=options), # Ensure consistent sort order
-        y=alt.Y("probability:Q", title="Probability", scale=alt.Scale(domain=[0, 1])),
-        color=alt.Color("type:N", scale=alt.Scale(domain=["Ground Truth", "Prediction"], range=["green", "steelblue"]), legend=alt.Legend(title="Token Type"))
-    ).properties(height=300)
-)
-st.altair_chart(chart, use_container_width=True)
-ce_loss = -torch.log(torch.clamp(probs_for_charts[gt_index_for_charts], min=1e-9))
-if gt_numeric_for_charts is None: # Text token
-    ntl_mse_loss = torch.tensor(float('nan')) # MSE not applicable for text
-    ntl_was_loss = torch.tensor(float('nan')) # WAS not applicable for text
-else: # Numeric token
-    numeric_probs_for_loss = probs_for_charts[:10] # Probabilities for 0-9
-    # Ensure numeric_probs_for_loss sums to 1 for NTL calculations if it's a subset
-    numeric_probs_sum = torch.sum(numeric_probs_for_loss)
-    if numeric_probs_sum > 1e-6 : # Avoid division by zero
-            normalized_numeric_probs = numeric_probs_for_loss / numeric_probs_sum
-    else:
-            normalized_numeric_probs = torch.zeros_like(numeric_probs_for_loss)
-    loss_values_tensor = torch.arange(0, 10, dtype=torch.float32)
-    # Use normalized probabilities for NTL if only considering numeric tokens
-    if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6 :
-        pred_value = torch.sum( (probs_for_charts[:10]/torch.sum(probs_for_charts[:10])) * loss_values_tensor)
-    elif gt_choice_for_charts != "Text": # if sum is zero, pred_value is ill-defined or 0
-            pred_value = torch.tensor(0.0)
-    else: # Should not happen if gt_numeric_for_charts is not None
-        pred_value = torch.tensor(float('nan'))
-    if not torch.isnan(pred_value):
-        ntl_mse_loss = (pred_value - float(gt_numeric_for_charts)) ** 2
-        abs_diff = torch.abs(loss_values_tensor - float(gt_numeric_for_charts))
-        if gt_choice_for_charts != "Text" and torch.sum(probs_for_charts[:10]) > 1e-6:
-                ntl_was_loss = torch.sum((probs_for_charts[:10]/torch.sum(probs_for_charts[:10])) * abs_diff)
-        elif gt_choice_for_charts != "Text":
-                ntl_was_loss = torch.tensor(0.0) # Or some other default if all numeric probs are zero
-        else:
-                ntl_was_loss = torch.tensor(float('nan'))
-    else:
-        ntl_mse_loss = torch.tensor(float('nan'))
-        ntl_was_loss = torch.tensor(float('nan'))
-ce_val = round(ce_loss.item(), 3)
-mse_val = round(ntl_mse_loss.item(), 3) if not torch.isnan(ntl_mse_loss) else "N/A"
-was_val = round(ntl_was_loss.item(), 3) if not torch.isnan(ntl_was_loss) else "N/A"
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
     loss_data["Value"].append(was_val)
-if mse_val != "N/A":
-    loss_data["Loss"].append("NTL-MSE")
-    loss_data["Value"].append(mse_val)
 loss_df = pd.DataFrame(loss_data)
 # ============== Chart Display ==============
-# Create a single chart for loss visualization
-st.subheader("Loss Comparison")
-# Create an Altair chart that will look good and redraw cleanly
-chart = alt.Chart(loss_df).mark_bar().encode(
-    x=alt.X('Loss:N', sort=loss_df["Loss"].tolist()),
-    y=alt.Y('Value:Q', scale=alt.Scale(domain=[0, max(loss_df["Value"].max() * 1.2, 20 if st.session_state.running_demo else 0.5)])),
-    color=alt.Color('Loss:N', scale=alt.Scale(
-        domain=['Cross Entropy', 'NTL-WAS', 'NTL-MSE'],
-        range=['steelblue', 'red', 'forestgreen']
-    )),
-    tooltip=['Loss', 'Value']
-).properties(
-    height=300
-)
-# Add value labels on top of bars
-text = chart.mark_text(
-    align='center',
-    baseline='bottom',
-    dy=-5,
-    fontSize=14
-).encode(
-    text=alt.Text('Value:Q', format='.3f')
 )
-# Combine chart and text
-final_chart = (chart + text)
 # Display chart with the full container width
-st.altair_chart(final_chart, use_container_width=True)
 # --- Polling Rerun for Demo Mode ---
 # If the demo is running and we haven't just advanced (which would have caused a rerun),
@@ -507,20 +466,21 @@ st.altair_chart(final_chart, use_container_width=True)
 if st.session_state.running_demo:
     # This check is implicitly: if we are here and demo is running, it means
     # the time-based advance condition was NOT met in the block at the top.
-    time.sleep(0.1) # Adjusted from 0.2 to 0.5 (or try 1.0)
     st.rerun()
-# Add explanation of the demonstration
 st.markdown("""
-### What Does This Demo Show?
-- **Cross Entropy Loss**: Only cares if the prediction is exactly right or wrong - it doesn't consider how "close" a numerical prediction is.
-- **Number Token Loss (NTL)**: Considers numerical proximity - predicting "7" when the true value is "8" is better than predicting "2".
 """)
-# References / resources section with links (common to both modes)
-st.markdown("### Resources")
 st.markdown("""
-- [Paper: Number Token Loss (ArXiv)](https://arxiv.org/abs/2411.02083)
-- [GitHub: Number Token Loss](https://github.com/tum-ai/number-token-loss)
 """)

+import logging
+import time
 import altair as alt
+import numpy as np
 import pandas as pd
+import streamlit as st
 import streamlit_vertical_slider as svs
 import torch
+from scenarios import dirac, gauss, make_bimodal_scenarios
+logging.getLogger("streamlit.watcher.local_sources_watcher").setLevel(logging.ERROR)
DEMO_INTERVAL = 0.75  # seconds a scenario stays active before the demo advances
CE_SCALING = 0.25  # factor applied to the cross-entropy value in compute_losses
MAX_LOSS_PLOT = 6
LAST_STEP = -1
# Define options globally as it's used in initialization and UI
options = [str(i) for i in range(10)] + ["Text"]


def compute_losses(probs: torch.Tensor, gt_token: str) -> tuple[float, float, float]:
    """Compute the scaled CE, NTL-MAE and NTL-WAS losses for one prediction.

    Args:
        probs: Probability vector indexed like ``options``: the first ten
            entries belong to the digit tokens "0"-"9" and the entry at
            ``options.index("Text")`` to the text token.
        gt_token: Ground-truth token; must be an element of ``options``.

    Returns:
        ``(ce, mae, was)``, each rounded to three decimals. Both NTL values
        are ``0.0`` when the ground truth is "Text" or when (almost) no
        probability mass lies on the digit tokens.

    Raises:
        ValueError: If ``gt_token`` is not contained in ``options``.
    """
    # Clamp so a zero probability on the ground truth yields a large but
    # finite cross-entropy instead of infinity.
    ce_loss = CE_SCALING * -torch.log(
        torch.clamp(probs[options.index(gt_token)], min=1e-9)
    )
    # Round once up front so every return path reports CE with the same precision.
    ce_val = round(ce_loss.item(), 3)

    numeric_mass = probs[:10].sum()
    if gt_token == "Text" or numeric_mass < 1e-6:
        return ce_val, 0.0, 0.0

    gt_numeric = int(gt_token)
    token_vals = torch.arange(10, dtype=torch.float32)
    # NOTE(review): probs[:10] is used as-is (not renormalised by numeric_mass),
    # so mass on "Text" pulls the expected digit towards 0 -- confirm intended.
    mae = numeric_mass * torch.abs(torch.dot(token_vals, probs[:10]) - gt_numeric)
    was = numeric_mass * torch.dot(probs[:10], torch.abs(token_vals - gt_numeric))
    return ce_val, round(mae.item(), 3), round(was.item(), 3)
 # --- Session State Initialization ---
 # Ensure all session state variables are initialized before first use, especially by widgets.
+if "running_demo" not in st.session_state:
     st.session_state.running_demo = False
+if "demo_step" not in st.session_state:
     st.session_state.demo_step = 0
+if "last_update_time" not in st.session_state:
     st.session_state.last_update_time = 0
+if "loss_container" not in st.session_state:
     st.session_state.loss_container = None
+if "previous_chart_html" not in st.session_state:
     st.session_state.previous_chart_html = ""
+if "active_scenarios" not in st.session_state:
+    # default if you want one to load on first show
+    st.session_state.active_scenarios = dirac
+if "loss_history" not in st.session_state:
+    st.session_state.loss_history = []
+if "df_loss_plot" not in st.session_state:
+    # Initialize an empty DataFrame for loss history
+    st.session_state.df_loss_plot = pd.DataFrame(
+        columns=["step", "x_val", "Loss Type", "Loss Value"]
+    )
 # Initialize states for sliders and ground_truth selector
 # Using len(options) to correctly size for 0-9 + "Text"
 for i in range(len(options)):
     if f"slider_{i}" not in st.session_state:
+        st.session_state[f"slider_{i}"] = 0
+if "ground_truth" not in st.session_state:
+    st.session_state["ground_truth"] = options[5]
+if "manual_ground_truth" not in st.session_state:
+    st.session_state["manual_ground_truth"] = options[5]
+if "demo_name" not in st.session_state:
+    st.session_state["demo_name"] = "Dirac"
+st.title("NTL -- The Number Token Loss 🚀")
+st.markdown(
+    """This is the interactive demo for our [ICML 2025](https://arxiv.org/abs/2411.02083) paper!🎉
+    ➡️ NTL augments cross-entropy to help LMs reason better with numbers 🧠
+    """
+)
+st.subheader("Demo 1 — NTL vs. Cross Entropy in 3 Scenarios")
 st.markdown("""
+1️⃣ Pick a ground truth token: a digit (0–9) or "Text" 📝 (simulates generic text tokens).
+2️⃣ Choose a demo:
+- **Dirac** ⚡: All probability mass on one token.
+- **Gaussian** 🌊: Soft bell-curve around the true number.
+- **Bimodal** 🎯: Two peaks moving away from the target.
+Watch how losses evolve as predictions get worse — and see how NTL shines compared to CE! 🌟
 """)
# "ground_truth" is always seeded by the session-state initialisation earlier in
# the script, so the select box binds to the key directly; a second fallback
# default here could never run.
gt = st.selectbox("Ground Truth Token", options=options, key="ground_truth")
 def apply_scenario(step_idx):
+    scenario = st.session_state.active_scenarios[step_idx]
     for i, val in enumerate(scenario["values"]):
         st.session_state[f"slider_{i}"] = val
def _start_demo(scenarios, name):
    """Reset the demo bookkeeping and show the first entry of ``scenarios``.

    Args:
        scenarios: Scenario list stored as ``st.session_state.active_scenarios``.
        name: Label stored as ``st.session_state.demo_name``.
    """
    st.session_state.loss_history = []
    st.session_state.active_scenarios = scenarios
    st.session_state.demo_name = name
    st.session_state.running_demo = True
    st.session_state.demo_step = 0
    st.session_state.last_update_time = time.time()
    apply_scenario(0)


def start_dirac_demo():
    """Start the automated demo with the ``dirac`` scenarios."""
    _start_demo(dirac, "Dirac")


def start_gauss_demo():
    """Start the automated demo with the ``gauss`` scenarios."""
    _start_demo(gauss, "Gauss")


def start_bimodal_demo():
    """Start the automated demo with bimodal scenarios built for the current ground truth."""
    gt = st.session_state["ground_truth"]
    _start_demo(make_bimodal_scenarios(gt, options), f"Bimodal (GT={gt})")


def stop_demo():
    """Flag the automated demo as no longer running."""
    st.session_state.running_demo = False
 # --- Demo State Advancement Logic ---
 # This block handles advancing the demo. If it advances, it updates session state
 # and then reruns. This ensures widgets are drawn with the new state in the next run.
 if st.session_state.running_demo:
+    scenario = st.session_state.active_scenarios
     current_time = time.time()
+    if current_time - st.session_state.last_update_time > DEMO_INTERVAL:
+        # if we haven’t yet shown the last scenario, advance
+        if st.session_state.demo_step < len(scenario) - 1:
+            st.session_state.demo_step += 1
+            apply_scenario(st.session_state.demo_step)
+            st.session_state.last_update_time = current_time
+            # st.rerun() # not needed, leading to too many reruns
+        else:
+            # we just displayed the final case → stop
+            st.session_state.running_demo = False
 # --- UI Rendering ---
 # This section renders the main UI. It executes after any potential rerun from the block above.
 if st.session_state.running_demo:
+    st.info(
+        f"Showing scenario {st.session_state.demo_step + 1}"
+        f"/{len(st.session_state.active_scenarios)}: "
+        f"{st.session_state.active_scenarios[st.session_state.demo_step]['name']}"
+    )
     if st.button("Stop Demo"):
+        st.session_state.running_demo = False
         st.rerun()
+else:
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        if st.button("Run: Dirac"):
+            start_dirac_demo()
+            st.rerun()
+    with col2:
+        if st.button("Run: Gauss"):
+            start_gauss_demo()
+            st.rerun()
+    with col3:
+        if st.button("Run: Bimodal"):
+            start_bimodal_demo()
+            st.rerun()
+current_prob_values_from_state = [
+    st.session_state.get(f"slider_{j}", 0)
+    for j in range(len(options))  # 1.0 / len(options)) for j in range(len(options))
+]
 total_from_state = sum(current_prob_values_from_state)
 probs_for_charts = (
     torch.ones(len(options)) / len(options)
     else torch.tensor([v / total_from_state for v in current_prob_values_from_state])
 )
# The demo reads "ground_truth"; outside of a running demo the manually chosen
# token stored under "manual_ground_truth" is used instead.
_gt_state_key = (
    "ground_truth" if st.session_state.running_demo else "manual_ground_truth"
)
gt_choice_for_charts = st.session_state[_gt_state_key]

# "Text" has no numeric value; its index 10 assumes it is the 11th option.
_gt_is_text = gt_choice_for_charts == "Text"
gt_index_for_charts = 10 if _gt_is_text else int(gt_choice_for_charts)
gt_numeric_for_charts = None if _gt_is_text else gt_index_for_charts
gt = st.session_state["ground_truth"]
demo_name = st.session_state["demo_name"]
st.markdown(
    f'#### Predicted distribution (<span style="color:darkgreen;">ground truth: {gt}</span>)',
    unsafe_allow_html=True,
)

# One row per token; "is_gt" flags the row that matches the ground-truth token.
df_dist = pd.DataFrame(
    {"token": options, "probability": probs_for_charts.numpy().round(2)}
)
df_dist["is_gt"] = df_dist["token"] == gt

# Encoding channels are built separately and assembled into the chart below.
token_axis = alt.Axis(
    labelAngle=0,
    labelFontSize=14,
    titleFontSize=16,
    labelAlign="center",
    labelFlush=False,
)
token_channel = alt.X("token:N", title="Token", sort=options, axis=token_axis)
probability_channel = alt.Y(
    "probability:Q",
    title="Probability",
    scale=alt.Scale(domain=[0, 1]),
    axis=alt.Axis(format=".2f", labelFontSize=14, titleFontSize=16),
)
# Ground-truth bar in dark green, every other bar in blue.
bar_color = alt.condition(
    "datum.is_gt",
    alt.value("darkgreen"),
    alt.value("dodgerblue"),
)
bar_tooltip = [
    alt.Tooltip("token:N", title="Token"),
    alt.Tooltip("probability:Q", title="Predicted Prob.", format=".2f"),
    alt.Tooltip("is_gt:N", title="Ground Truth"),
]

bars = (
    alt.Chart(df_dist)
    .mark_bar(color="dodgerblue", size=40)
    .encode(
        x=token_channel,
        color=bar_color,
        y=probability_channel,
        tooltip=bar_tooltip,
    )
)
st.altair_chart(
    bars.properties(height=200), use_container_width=True, theme="streamlit"
)
+ce_val, mae_val, was_val = compute_losses(probs_for_charts, gt_choice_for_charts)
+if (
+    st.session_state.running_demo
+    and len(st.session_state.loss_history) < st.session_state.demo_step + 1
+):
+    step = st.session_state.demo_step
+    scenario = st.session_state.active_scenarios[step]
+    ce, mae, was = compute_losses(probs_for_charts, gt_choice_for_charts)
+    # pick x_val differently for bimodal vs others
+    if st.session_state.demo_name.startswith("Bimodal"):
+        x_val = scenario["name"]  # e.g. "(4,4)", "(3,5)", …
+    else:
+        # exactly like before:
+        best_idx = np.argmax(scenario["values"])
+        x_val = options[best_idx]  # "0", "1", …, or "Text"
+    st.session_state.loss_history.append(
+        {
+            "step": step,
+            "x_val": x_val,
+            "Cross Entropy": ce,
+            "NTL-MAE": mae,
+            "NTL-WAS": was,
+        }
+    )
+    st.session_state.df_loss_plot = pd.DataFrame(st.session_state.loss_history).melt(id_vars=["step", "x_val"],
+            value_vars=["Cross Entropy", "NTL-MAE", "NTL-WAS"],
+            var_name="Loss Type",
+            value_name="Loss Value")
 loss_data = {"Loss": ["Cross Entropy"], "Value": [ce_val]}
 if was_val != "N/A":
     loss_data["Loss"].append("NTL-WAS")
     loss_data["Value"].append(was_val)
+if mae_val != "N/A":
+    loss_data["Loss"].append("NTL-MAE")
+    loss_data["Value"].append(mae_val)
 loss_df = pd.DataFrame(loss_data)
+if st.session_state.demo_name.startswith("Bimodal"):
+    domain = [sc["name"] for sc in st.session_state.active_scenarios]
+    x_title = f"Offset from GT {st.session_state['ground_truth']}"
+else:
+    domain = options
+    x_title = f"Maximum of predicted {st.session_state['demo_name']} distribution"
 # ============== Chart Display ==============
+st.markdown("#### Loss as a function of predicted distribution")
+grouped_chart = (
+    alt.Chart(st.session_state.df_loss_plot)
+    .mark_bar()
+    .encode(
+        x=alt.X(
+            "x_val:N",
+            title=x_title,
+            sort=domain,
+            scale=alt.Scale(domain=domain),
+            axis=alt.Axis(labelAngle=0, labelFontSize=14, titleFontSize=16),
+        ),
+        y=alt.Y(
+            "Loss Value:Q",
+            title="Loss Value",
+            scale=alt.Scale(domain=[0, MAX_LOSS_PLOT], nice=False, clamp=True),
+            axis=alt.Axis(labelFontSize=14, titleFontSize=16),
+        ),
+        color=alt.Color(
+            "Loss Type:N",
+            scale=alt.Scale(
+                domain=["Cross Entropy", "NTL-WAS", "NTL-MAE"],
+                range=["red", "limegreen", "blueviolet"],
+            ),
+            legend=alt.Legend(
+                title="",
+                orient="top",
+                direction="horizontal",
+                columns=3,
+            ),
+        ),
+        xOffset="Loss Type:N",  # grouped bars
+        tooltip=[
+            alt.Tooltip("x_val:N", title="Scenario"),
+            alt.Tooltip("Loss Type:N", title="Loss Type"),
+            alt.Tooltip("Loss Value:Q", title="Value", format=".3f"),
+        ],
+    )
+    .properties(height=250)
 )
+st.altair_chart(grouped_chart, use_container_width=True, theme="streamlit")
+# Create a single chart for loss visualization
+if not st.session_state.running_demo:
+    for i in range(len(options)):
+        st.session_state[f"slider_{i}"] = 0.0
+    st.session_state.demo_step = 0
+    st.subheader("Demo 2 -- Manual loss comparison")
+    st.subheader("🧪 Demo 2 — Craft your own distribution")
+    st.markdown("""
+    This demo gives you more control but is harder to interpret. See it as a playground! 🎨
+    Manually adjust the sliders to change the predicted probabilities for each token.
+    The demo normalizes the values to form a valid probability distribution and calculates the losses.
+    👣 **Steps:**
+    - Use the **vertical sliders** to allocate probability to each token.
+    - Choose the correct **Ground Truth Token** (0–9 or "Text" 📜).
+    - Observe how each loss function reacts.
+    💡 **Tip:** Want to trick the loss? Try putting all mass on the wrong token or spread it wildly. See how NTL handles it! 😈
+    """)
+    manual_gt = st.selectbox(
+        "Ground Truth Token",
+        options=options,
+        key="manual_ground_truth",
+    )
+    loss_df = pd.DataFrame(
+        {
+            "Loss": ["Cross Entropy", "NTL-MAE", "NTL-WAS"],
+            "Value": [ce_val, mae_val, was_val],
+        }
+    )
+    # Sliders and Ground Truth Selector
+    # These widgets will read their initial values from st.session_state.
+    # User interactions will update st.session_state directly due to their keys.
+    st.markdown("#### Adjust the predicted token probability")
+    cols = st.columns(len(options))
+    for i, col in enumerate(cols):
+        label = options[i]  # Use token name directly for label
+        with col:
+            svs.vertical_slider(
+                label=label,
+                min_value=0.0,
+                max_value=1.0,
+                step=0.01,
+                height=50,
+                key=f"slider_{i}",
+                slider_color="green",
+                track_color="lightgray",
+                thumb_color="black",
+            )
+    chart = (
+        alt.Chart(loss_df)
+        .mark_bar()
+        .encode(
+            x=alt.X("Loss:N", sort=loss_df["Loss"].tolist()),
+            y=alt.Y(
+                "Value:Q",
+                scale=alt.Scale(
+                    domain=[
+                        0,
+                        max(
+                            loss_df["Value"].max() * 1.2,
+                            20 if st.session_state.running_demo else 0.5,
+                        ),
+                    ]
+                ),
+            ),
+            color=alt.Color(
+                "Loss:N",
+                scale=alt.Scale(
+                    domain=["Cross Entropy", "NTL-WAS", "NTL-MAE"],
+                    range=["orangered", "limegreen", "blueviolet"],
+                ),
+            ),
+            tooltip=["Loss", "Value"],
+        )
+        .properties(height=300)
+    )
+    text = chart.mark_text(
+        align="center", baseline="bottom", dy=-5, fontSize=14
+    ).encode(text=alt.Text("Value:Q", format=".3f"))
+    final_chart = chart + text
+    st.altair_chart(final_chart, use_container_width=True)
+# # Add value labels on top of bars
+# text = chart.mark_text(align="center", baseline="bottom", dy=-5, fontSize=14).encode(
+#     text=alt.Text("Value:Q", format=".3f")
+# )
+# # Combine chart and text
+# final_chart = chart + text
 # Display chart with the full container width
+# st.altair_chart(final_chart, use_container_width=True)
 # --- Polling Rerun for Demo Mode ---
 # While a demo is running, wait DEMO_INTERVAL and then rerun the script so the
 # demo keeps progressing without user interaction.
 # NOTE(review): presumably the demo-advance block earlier in the file moves to
 # the next scenario on that rerun — confirm.
 if st.session_state.running_demo:
     # time.sleep blocks this script run, so everything rendered above stays on
     # screen for DEMO_INTERVAL (seconds, per time.sleep) before the rerun.
+    time.sleep(DEMO_INTERVAL)
     st.rerun()
 # --- Footer: short rationale for NTL and links to further resources ---
 st.markdown("""
+### 🤔 TL;DR — Why NTL?
+Cross Entropy only cares if the prediction is exactly right or wrong ❌✅ — it doesn’t care *how close* a guess is!
+That’s bad for LLMs doing math and numeric reasoning 🧮.
+💥 NTL fixes that: it behaves like a regression loss on the token head, rewarding predictions that are numerically close.
 """)
+st.markdown("#### 📚 Further Resources")
 st.markdown("""
+- 📄 [ICML 2025 Paper](https://arxiv.org/abs/2411.02083)
+- 🌐 [NTL Landing Page](https://tum-ai.github.io/number-token-loss/)
+- 💻 [GitHub Code](https://github.com/tum-ai/number-token-loss)
 """)