Spaces:

multimodalart
/

Dream

Runtime error

App Files Files Community

multimodalart HF Staff committed on Apr 5

Commit

168a7c1

verified ·

1 Parent(s): 5713ed1

Update app.py

Browse files

Files changed (1) hide show

app.py +614 -420

app.py CHANGED Viewed

@@ -3,58 +3,31 @@ import torch
 import numpy as np
 import gradio as gr
 import spaces
 import time
 import re
-from transformers import AutoModel, AutoTokenizer
-from threading import Lock
-from queue import Queue
-# --- Configuration ---
-MODEL_PATH = "Dream-org/Dream-v0-Instruct-7B"
-DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-print(f"Using device: {DEVICE}")
-# --- Load Model and Tokenizer ---
-print("Loading model and tokenizer...")
-# Need configuration files for trust_remote_code
-# Make sure config.json, configuration_dream.py, modeling_dream.py,
-# generation_utils.py, generation_config.json are in the same directory
-# or accessible in the Hugging Face cache.
-model = AutoModel.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True
-).to(DEVICE).eval()
-tokenizer = AutoTokenizer.from_pretrained(
-    MODEL_PATH,
-    trust_remote_code=True
-)
-print("Model and tokenizer loaded.")
 # --- Constants ---
-# Get IDs from tokenizer/config if possible, otherwise hardcode from provided files
-MASK_TOKEN = tokenizer.mask_token # Should be "<|mask|>"
-try:
-    MASK_ID = tokenizer.mask_token_id # Should be 151666
-    if MASK_ID is None: raise AttributeError # Handle case where it might not be set directly
-except AttributeError:
-    print("Warning: Could not directly get mask_token_id, using hardcoded value 151666.")
-    MASK_ID = 151666
-try:
-    EOS_ID = tokenizer.eos_token_id # Should be 151643
-    PAD_ID = tokenizer.pad_token_id # Should be 151643
-    if EOS_ID is None or PAD_ID is None: raise AttributeError
-except AttributeError:
-    print("Warning: Could not directly get eos/pad_token_id, using hardcoded value 151643.")
-    EOS_ID = 151643
-    PAD_ID = 151643
-# Ensure MASK_TOKEN and MASK_ID are valid
-if MASK_TOKEN is None or MASK_ID is None:
-    raise ValueError("Mask token or ID is not defined correctly.")
-if EOS_ID is None or PAD_ID is None:
-     raise ValueError("EOS/PAD token or ID is not defined correctly.")
 # --- Helper Functions ---
@@ -71,13 +44,18 @@ def parse_constraints(constraints_text):
         try:
             pos_str, word = part.split(':', 1)
             pos = int(pos_str.strip())
             word = word.strip()
             if word and pos >= 0:
                 # Tokenize the word - handle potential multi-token words
-                # Add space prefix for consistency, similar to how model might see words mid-sentence
-                tokens = tokenizer.encode(" " + word, add_special_tokens=False)
                 for i, token_id in enumerate(tokens):
-                     constraints[pos + i] = token_id
         except ValueError:
             continue
         except Exception as e:
@@ -86,280 +64,459 @@ def parse_constraints(constraints_text):
     return constraints
 def format_chat_history(history):
     """
-    Format chat history for the Dream model using its chat template logic.
     Args:
         history: List of [user_message, assistant_message] pairs
     Returns:
-        Formatted list of message dictionaries for the model
     """
     messages = []
-     # Add system prompt if history is empty or doesn't start with system
-    if not history or history[0][0].lower() != 'system':
-         # Check if the tokenizer's template expects an explicit system message
-         # The template provided in tokenizer_config.json handles adding a default one
-         pass # Let apply_chat_template handle the default system message
-    for user_msg, assistant_msg in history:
-        if user_msg: # Handle potential initial system message possibility if needed
-             messages.append({"role": "user", "content": user_msg})
         if assistant_msg is not None: # Skip if None (for the latest user message)
             messages.append({"role": "assistant", "content": assistant_msg})
     return messages
-# --- Core Generation Logic with Visualization ---
-# Use a thread-safe queue to pass visualization states from the hook
-vis_queue = Queue()
-# Lock to prevent race conditions when accessing shared state like previous_x
-state_lock = Lock()
-# Store the previous state for comparison in the hook
-previous_x_shared = None
 @spaces.GPU
-def generate_response_with_visualization(
-    messages, # List of message dicts from format_chat_history
-    max_new_tokens=64,
-    steps=64, # Default steps based on README example
-    constraints=None,
-    temperature=0.6, # Default from demo_token_control
-    top_p=0.95,      # Default from demos
-    alg="entropy",   # Default from demos
-    alg_temp=0.1,    # Default from demo_multiturn_chat
-):
     """
-    Generate text with Dream model and capture visualization states using a hook.
     Args:
-        messages: List of message dictionaries with 'role' and 'content'.
-        max_new_tokens: Max tokens to generate.
-        steps: Diffusion steps.
-        constraints: Dictionary mapping positions (relative to response start) to token IDs.
-        temperature: Sampling temperature.
-        top_p: Nucleus sampling p.
-        alg: Remasking algorithm ('origin', 'entropy', 'maskgit_plus', 'topk_margin').
-        alg_temp: Temperature for confidence-based algorithms.
     Returns:
-        Tuple: (List of visualization states, final generated text string)
     """
-    global previous_x_shared, vis_queue
     if constraints is None:
-        constraints = {}
-    visualization_states = []
-    # Clear the queue for a new generation
-    while not vis_queue.empty():
-        try:
-            vis_queue.get_nowait()
-        except Queue.Empty:
-            break
-    # Prepare the prompt using chat template
-    # The template automatically adds the generation prompt like "<|im_start|>assistant\n"
-    try:
-        inputs = tokenizer.apply_chat_template(
-            messages,
-            return_tensors="pt",
-            add_generation_prompt=True,
-            return_dict=True
-        )
-        input_ids = inputs.input_ids.to(device=DEVICE)
-        # Dream doesn't seem to explicitly use attention_mask in simple demos,
-        # but it's good practice if padding were involved.
-        # For now, assume no padding in this interactive demo.
-        attention_mask = inputs.attention_mask.to(device=DEVICE) if 'attention_mask' in inputs else None
-    except Exception as e:
-        print(f"Error applying chat template: {e}")
-        # Provide a fallback or error state
-        error_state = [("Error in chat formatting.", "red")]
-        return [error_state], f"Error: Could not format chat history. {e}"
-    prompt_length = input_ids.shape[1]
-    total_length = prompt_length + max_new_tokens
-    # --- Define the Hook Function ---
-    def generation_tokens_hook_func(step, x, logits):
-        global previous_x_shared, vis_queue
-        with state_lock: # Ensure thread safety if needed, though hooks might run sequentially
-            current_x = x.clone() # Shape: (batch_size, total_length)
-            # --- Apply Constraints ---
-            # Constraints are relative to the start of the *response*
-            for rel_pos, token_id in constraints.items():
-                abs_pos = prompt_length + rel_pos
-                if 0 <= abs_pos < current_x.shape[1]:
-                    # Ensure constraint application doesn't go out of bounds
-                    # Apply constraint for the first batch element (batch size is 1 here)
-                    current_x[0, abs_pos] = token_id
-            # --- Create Visualization State ---
-            current_vis_state = []
-            x_response = current_x[0, prompt_length:] # Get the response part for batch 0
-            prev_x_response = previous_x_shared[0, prompt_length:] if previous_x_shared is not None else None
-            for i in range(max_new_tokens):
-                current_token_id = x_response[i].item()
-                token_str = tokenizer.decode([current_token_id], skip_special_tokens=False) # Keep special tokens for vis
-                # Clean up visual representation of special tokens
-                if token_str == tokenizer.eos_token or token_str == tokenizer.pad_token:
-                     token_str = "[EOS/PAD]" # Make it visually distinct
-                elif token_str == tokenizer.mask_token:
-                     token_str = "[MASK]"
-                elif token_str.strip() == "": # Handle empty strings from decoding potentially odd tokens
-                    token_str = "[UNK/SPACE]"
-                color = "#DDDDDD" # Default background
-                if current_token_id == MASK_ID:
-                    color = "#444444" # Dark gray for masks
-                elif prev_x_response is not None and prev_x_response[i].item() == MASK_ID:
-                    # Token was mask, now it's revealed in this step
-                     # Use green for newly revealed
-                    color = "#66CC66" # Light green
-                else:
-                     # Token was already revealed in a previous step or is a constraint
-                     # Check if it's a constraint applied *now*
-                     is_constraint = (prompt_length + i - prompt_length) in constraints and \
-                                      constraints[prompt_length + i - prompt_length] == current_token_id
-                     if is_constraint:
-                         color = "#FFD700" # Gold for constraints
-                     else:
-                         color = "#6699CC" # Light blue for previously revealed
-                current_vis_state.append((token_str, color))
-            # --- Update shared state and put vis state in queue ---
-            previous_x_shared = current_x.clone() # Update for the *next* step's comparison
-            vis_queue.put(current_vis_state)
-            # The hook must return the potentially modified tensor `x`
-            return current_x
-    # --- End of Hook Function ---
-    # Initialize previous_x_shared before generation starts
-    # Create initial masked state for visualization
-    initial_x = input_ids.clone()
-    if initial_x.shape[1] < total_length:
-         padding = torch.full((1, total_length - initial_x.shape[1]), MASK_ID, dtype=torch.long, device=DEVICE)
-         initial_x = torch.cat([initial_x, padding], dim=1)
-    else:
-         initial_x = initial_x[:, :total_length] # Truncate if prompt is too long
-    # Apply initial constraints to the starting state
     for rel_pos, token_id in constraints.items():
         abs_pos = prompt_length + rel_pos
-        if 0 <= abs_pos < initial_x.shape[1]:
-             initial_x[0, abs_pos] = token_id
-    with state_lock:
-        previous_x_shared = initial_x.clone()
-    # Add the initial all-masked state (or with constraints) to the visualization queue
-    initial_vis_state = []
-    initial_x_response = initial_x[0, prompt_length:]
-    for i in range(max_new_tokens):
-         token_id = initial_x_response[i].item()
-         if token_id == MASK_ID:
-              initial_vis_state.append((MASK_TOKEN, "#444444"))
-         else:
-              # Must be a pre-applied constraint
-              token_str = tokenizer.decode([token_id], skip_special_tokens=False)
-              if token_str == tokenizer.eos_token or token_str == tokenizer.pad_token:
-                   token_str = "[EOS/PAD]"
-              elif token_str.strip() == "":
-                   token_str = "[UNK/SPACE]"
-              initial_vis_state.append((token_str, "#FFD700")) # Gold for constraints
-    vis_queue.put(initial_vis_state)
-    # --- Run Generation ---
-    try:
-        # output_history=False because the hook handles state capture
-        # return_dict_in_generate=True to get the GenerationOutput object
-        output = model.diffusion_generate(
-            initial_x, # Start with the potentially constraint-applied tensor
-            attention_mask=None, # Assuming no padding needed for interactive use
-            max_new_tokens=max_new_tokens, # This might not be strictly needed if total_length is fixed
-            output_history=False,
-            return_dict_in_generate=True,
-            steps=steps,
-            temperature=temperature,
-            top_p=top_p,
-            alg=alg,
-            alg_temp=alg_temp if alg != 'origin' else None, # alg_temp only for confidence algs
-            generation_tokens_hook_func=generation_tokens_hook_func
-        )
-        final_sequence = output.sequences[0] # Batch size 1
-        # Decode the final response text, cleaning up special tokens
-        response_tokens = final_sequence[prompt_length:]
-        # Filter out EOS/PAD tokens for the final text display
-        response_tokens_filtered = [tok for tok in response_tokens.tolist() if tok != EOS_ID and tok != PAD_ID]
-        final_text = tokenizer.decode(response_tokens_filtered,
-                                      skip_special_tokens=True,
-                                      clean_up_tokenization_spaces=True) # Standard cleanup
-    except Exception as e:
-        print(f"Error during generation: {e}")
-        import traceback
-        traceback.print_exc()
-        # Provide error state
-        error_state = [("Generation Error.", "red")]
-        visualization_states.append(error_state)
-        final_text = f"Error: Generation failed. {e}"
-        # Add any states captured before the error
-        while not vis_queue.empty():
-             try:
-                 visualization_states.append(vis_queue.get_nowait())
-             except Queue.Empty:
-                 break
-        return visualization_states, final_text
-    # Retrieve all visualization states captured by the hook
-    while not vis_queue.empty():
-        try:
-            visualization_states.append(vis_queue.get_nowait())
-        except Queue.Empty:
             break
-    # If somehow no states were captured, add the initial one
-    if not visualization_states:
-         visualization_states.append(initial_vis_state)
-    return visualization_states, final_text.strip()
-# --- Gradio UI ---
 css = '''
 .category-legend{display:none}
 button{height: 60px}
 '''
 def create_chatbot_demo():
-    with gr.Blocks(css=css) as demo:
         gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
-        gr.Markdown("Chat with the Dream 7B Instruct model and visualize the diffusion generation process.")
-        gr.Markdown("Model: [Dream-org/Dream-v0-Instruct-7B](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)")
         # STATE MANAGEMENT
         chat_history = gr.State([])
         # UI COMPONENTS
         with gr.Row():
             with gr.Column(scale=3):
-                chatbot_ui = gr.Chatbot(label="Conversation", height=500, avatar_images=["user.png", "robot.png"])
                 # Message input
                 with gr.Group():
@@ -367,192 +524,229 @@ def create_chatbot_demo():
                         user_input = gr.Textbox(
                             label="Your Message",
                             placeholder="Type your message here...",
-                            show_label=False,
-                            scale=9
                         )
                         send_btn = gr.Button("Send", scale=1)
                 constraints_input = gr.Textbox(
-                    label="Word Constraints (Optional)",
-                    info="Place words at specific positions (0-indexed from response start). Format: 'pos:word, pos:word,...'. Example: '0:Once, 5:upon, 10:a'",
-                    placeholder="0:Once, 5:upon, 10:a",
                     value=""
                 )
             with gr.Column(scale=2):
                 output_vis = gr.HighlightedText(
-                    label="Diffusion Process Visualization",
-                    combine_adjacent=False,
-                    show_legend=True, # Keep legend hidden via CSS if desired
-                )
-                # Legend (colors defined in generate_response_with_visualization)
-                gr.Markdown(
-                    "<small>Color Legend: <span style='background-color:#444444; color:white;'>[MASK]</span>"
-                    " <span style='background-color:#66CC66;'>Newly Revealed</span>"
-                    " <span style='background-color:#6699CC;'>Previously Revealed</span>"
-                    " <span style='background-color:#FFD700;'>Constraint</span>"
-                     " <span style='background-color:#DDDDDD;'>[EOS/PAD/UNK]</span></small>"
                 )
         # Advanced generation settings
         with gr.Accordion("Generation Settings", open=False):
-            max_new_tokens_slider = gr.Slider(
-                minimum=16, maximum=512, value=128, step=16, # Increased default/max
-                label="Max New Tokens (Generation Length)"
-            )
-            steps_slider = gr.Slider(
-                minimum=8, maximum=512, value=128, step=8,   # Increased default/max
-                label="Diffusion Steps"
-            )
-            temp_slider = gr.Slider(
-                minimum=0.0, maximum=1.0, value=0.6, step=0.05, # Finer steps for temp
-                label="Temperature"
-            )
-            top_p_slider = gr.Slider(
-                minimum=0.0, maximum=1.0, value=0.95, step=0.05,
-                label="Top-P (Nucleus Sampling)"
-            )
-            alg_radio = gr.Radio(
-                # Choices from README
-                choices=['origin', 'entropy', 'maskgit_plus', 'topk_margin'],
-                value='entropy',
-                label="Remasking Algorithm"
-            )
-            alg_temp_slider = gr.Slider(
-                minimum=0.0, maximum=1.0, value=0.1, step=0.05,
-                label="Algorithm Temperature (for confidence-based algs)"
-            )
-            vis_delay_slider = gr.Slider(
-                minimum=0.0, maximum=0.5, value=0.03, step=0.01, # Faster default delay
-                label="Visualization Delay (seconds)"
-            )
         # Clear button
         clear_btn = gr.Button("Clear Conversation")
-        # HELPER FUNCTIONS (UI Logic)
         def add_message_to_history(history, message, response):
             """Add a message pair to the history state"""
-            new_history = history + [[message, response]]
-            return new_history
-        def user_message_submitted(message, history):
-            """ Handle user sending a message: update history, clear input """
             if not message or message.strip() == "":
-                return history, history, "", [] # No change if empty
-            # Add user message, response is initially None
-            new_history = add_message_to_history(history, message, None)
-            # Prepare display version (immediately shows user message)
-            display_history = new_history
-            # Clear input box
-            message_out = ""
-            # Clear visualization
-            vis_out = []
-            return new_history, display_history, message_out, vis_out
-        def bot_response_generator(history, constraints_str, max_tokens, steps, temp, top_p, alg, alg_temp, delay):
-            """ Generator function to stream bot response and visualization """
-            if not history or history[-1][1] is not None: # Ensure there's a user msg waiting for response
-                print("Warning: Bot response triggered without pending user message.")
-                yield history, [], "Error: No user message to respond to." # Send error state back?
                 return
-            # Get the full conversation history formatted for the model
             last_user_message = history[-1][0]
-            messages_for_model = format_chat_history(history[:-1]) # History *before* the last user msg
-            messages_for_model.append({"role": "user", "content": last_user_message})
-            # Parse constraints
             try:
                 parsed_constraints = parse_constraints(constraints_str)
-            except Exception as e:
-                 print(f"Error parsing constraints: {e}")
-                 yield history, [("Constraint Error", "red")], f"Error: Failed to parse constraints: {e}"
-                 return
-            # Generate response and visualization states
-            try:
-                 vis_states, final_response_text = generate_response_with_visualization(
-                    messages_for_model,
-                    max_new_tokens=max_tokens,
                     steps=steps,
                     constraints=parsed_constraints,
-                    temperature=temp,
-                    top_p=top_p,
                     alg=alg,
-                    alg_temp=alg_temp
-                 )
             except Exception as e:
-                print(f"Error in generate_response_with_visualization: {e}")
                 import traceback
                 traceback.print_exc()
-                yield history, [("Generation Error", "red")], f"Error: Generation failed: {e}"
-                return
-            # Update the history state with the final response *once*
-            history[-1][1] = final_response_text # Update the None placeholder
-            # Yield initial state immediately
-            if vis_states:
-                yield history, vis_states[0]
-            else:
-                 yield history, [] # Should not happen if generation worked
-            # Stream intermediate visualization states
-            for state in vis_states[1:]:
-                time.sleep(delay)
-                yield history, state
-            # Final yield ensures the chatbot UI has the complete history
-            # The last state in vis_states should already be yielded by the loop
-            # yield history, vis_states[-1] if vis_states else []
-        def clear_conversation():
-            """Clear the conversation history and visualization"""
-            return [], [], "", [] # history, chatbot_ui, user_input, output_vis
-        # EVENT HANDLERS
-        # User presses Enter or Send button
-        submit_args = {
-             "fn": user_message_submitted,
-             "inputs": [user_input, chat_history],
-             "outputs": [chat_history, chatbot_ui, user_input, output_vis]
-        }
-        user_input.submit(**submit_args)
-        send_btn.click(**submit_args)
-        # After user message is submitted, trigger bot response generation
-        generate_args = {
-            "fn": bot_response_generator,
-            "inputs": [
-                chat_history, constraints_input, max_new_tokens_slider, steps_slider,
-                temp_slider, top_p_slider, alg_radio, alg_temp_slider, vis_delay_slider
-            ],
-            "outputs": [chatbot_ui, output_vis] # Update chatbot history and visualization
-        }
-        # Trigger generation after submit OR click
-        user_input.submit(None, None, None, queue=True).then(**generate_args)
-        send_btn.click(None, None, None, queue=True).then(**generate_args)
-        # Clear button handler
         clear_btn.click(
-            fn=clear_conversation,
             inputs=[],
-            outputs=[chat_history, chatbot_ui, user_input, output_vis]
         )
     return demo
-# Launch the demo
 if __name__ == "__main__":
     demo = create_chatbot_demo()
-    # queue() allows streaming and handling multiple users
-    # share=True creates a public link (use with caution)
-    demo.queue().launch(share=True, debug=True)

 import numpy as np
 import gradio as gr
 import spaces
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel
+from transformers.generation.configuration_utils import GenerationConfig
 import time
 import re
+import torch.distributions as dists # Import dists for sampling logic
+# --- Model Loading ---
+# Use the GPU when torch reports one, otherwise fall back to CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f"Using device: {device}")
+# Load Dream model and tokenizer
+model_path = "Dream-org/Dream-v0-Instruct-7B"
+# Load configuration first to get token IDs
+# NOTE(review): no import of DreamConfig is visible in the shown hunks; unless
+# it is imported on a line outside this view, the next statement raises
+# NameError at import time -- TODO confirm (loading the config through
+# transformers with trust_remote_code=True may be what was intended).
+config = DreamConfig.from_pretrained(model_path) # Assuming configuration_dream.py is present
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
+# Move the weights to the selected device and switch to inference mode.
+model = model.to(device).eval()
+print("Model and Tokenizer loaded.")
 # --- Constants ---
+# The mask token string comes from the tokenizer; the numeric IDs below are
+# read from the model config rather than from the tokenizer.
+MASK_TOKEN = tokenizer.mask_token # "<|mask|>"
+MASK_ID = config.mask_token_id # Get from config (e.g., 151666)
+EOS_ID = config.eos_token_id   # Get from config (e.g., 151643)
+PAD_ID = config.pad_token_id   # Get from config (e.g., 151643)
 # --- Helper Functions ---
         try:
             pos_str, word = part.split(':', 1)
             pos = int(pos_str.strip())
+            # Use strip() and lower() for robustness if needed, but preserve case for now
             word = word.strip()
             if word and pos >= 0:
                 # Tokenize the word - handle potential multi-token words
+                # Add space prefix typical for non-leading words if pos > 0
+                prefix = " " if pos > 0 else ""
+                tokens = tokenizer.encode(prefix + word, add_special_tokens=False)
                 for i, token_id in enumerate(tokens):
+                     # Only add if the token is not a special token id already
+                     # (This prevents accidental replacement of things like MASK_ID)
+                    if token_id not in [MASK_ID, EOS_ID, PAD_ID]:
+                        constraints[pos + i] = token_id
         except ValueError:
             continue
         except Exception as e:
     return constraints
 def format_chat_history(history):
     """
+    Convert Gradio-style [user, assistant] pairs into a flat list of role/content message dicts.
     Args:
         history: List of [user_message, assistant_message] pairs
     Returns:
+        List of {"role": ..., "content": ...} dicts, alternating "user" and "assistant"
     """
     messages = []
+     # Placeholder: detects a first user message starting with "system:" but takes no action.
+    if history and history[0][0].lower().startswith("system:"):
+        # No special handling is implemented; the message is still emitted
+        # below with role "user" like every other user message.
+        pass # Or handle system prompt separately if template requires
+    for i, (user_msg, assistant_msg) in enumerate(history):
+         # Every pair contributes a user message; the index i is currently unused.
+        messages.append({"role": "user", "content": user_msg})
         if assistant_msg is not None: # Skip if None (for the latest user message)
             messages.append({"role": "assistant", "content": assistant_msg})
     return messages
+# --- Core Generation Logic (Adapted from Dream's _sample) ---
+def sample_tokens_for_vis(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
+    """
+    Pick one token per position from `logits` and report a confidence score for it.
+
+    Simplified version of Dream's sample_tokens to get both token and confidence.
+
+    Args:
+        logits: Tensor whose last dimension indexes the vocabulary (every
+            reduction below uses dim=-1).
+        temperature: If > 0, logits are divided by it and the token is sampled
+            from the resulting distribution; otherwise the argmax is taken.
+        top_p: If not None and < 1.0, nucleus filtering is applied before sampling.
+        top_k: If not None and > 0, only the top_k largest logits are kept.
+        margin_confidence: If True, confidence is the top-1 minus top-2 probability.
+        neg_entropy: If True (and margin_confidence is False), confidence is
+            sum(p * log(p + epsilon)) over the vocabulary.
+
+    Returns:
+        Tuple (confidence, x0): the confidence score and the chosen token ID
+        for each position. Without margin_confidence/neg_entropy, confidence is
+        the probability of the chosen token.
+    """
+    # Apply temperature
+    if temperature > 0:
+        logits = logits / temperature
+    # Apply Top-P
+    if top_p is not None and top_p < 1.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the removal mask right by one so the first token that crosses
+        # the top_p threshold is kept, and never remove the highest-ranked token.
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+        # Map the mask from sorted order back to vocabulary order.
+        indices_to_remove = torch.zeros_like(logits, dtype=torch.bool).scatter_(-1, sorted_indices, sorted_indices_to_remove)
+        logits = logits.masked_fill(indices_to_remove, float('-inf'))
+    # Apply Top-K
+    if top_k is not None and top_k > 0:
+        top_k = min(top_k, logits.size(-1))
+        # Remove everything strictly below the k-th largest logit.
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits = logits.masked_fill(indices_to_remove, float('-inf'))
+    # Calculate probabilities
+    probs = torch.softmax(logits, dim=-1)
+    # Sample or Argmax
+    if temperature > 0:
+        # Use torch distributions for robust sampling
+        dist = dists.Categorical(probs=probs)
+        x0 = dist.sample()
+        # Gather confidence for the sampled token
+        confidence = torch.gather(probs, -1, x0.unsqueeze(-1)).squeeze(-1)
+    else:
+        # Argmax for deterministic generation
+        confidence, x0 = torch.max(probs, dim=-1)
+    # --- Calculate specific confidence metrics if requested ---
+    # Note: These replace the 'confidence' value *after* x0 is chosen; x0 itself is unchanged.
+    # margin_confidence takes precedence over neg_entropy when both are set.
+    if margin_confidence:
+        if probs.shape[-1] >= 2:
+             # If any position has all logits at -inf, skip the margin for the whole batch
+            if not torch.isinf(logits).all(dim=-1).any():
+                # Sort probabilities to get top1 and top2
+                sorted_probs, _ = torch.sort(probs, dim=-1, descending=True)
+                top1_probs = sorted_probs[..., 0]
+                top2_probs = sorted_probs[..., 1]
+                confidence = top1_probs - top2_probs
+            else:
+                 # Fallback if all logits are -inf (shouldn't normally happen); zeroes confidence in place
+                 confidence.fill_(0.0) # Or some other indicator
+        else:
+             # Only one possible token, margin is undefined or 1? Set to top1 prob.
+             confidence, _ = torch.max(probs, dim=-1)
+    elif neg_entropy:
+        epsilon = 1e-9 # Keeps log() finite for zero-probability entries
+        log_probs = torch.log(probs + epsilon)
+        # Negative entropy is sum(p * log(p))
+        confidence = torch.sum(probs * log_probs, dim=-1) # Higher value (closer to zero) means a more peaked, more confident distribution
+    return confidence, x0
 @spaces.GPU
+@torch.no_grad()
+def generate_response_with_visualization_dream(
+    messages, gen_length=64, steps=64,
+    constraints=None, temperature=0.2, top_p=0.95, top_k=None, # Added top_k
+    alg="entropy", alg_temp=0.1, # Dream specific params
+    yield_intermediate=True # Control yielding behavior
+    ):
     """
+    Generate text with Dream model with real-time visualization.
+    Adapts logic from Dream's _sample method.
     Args:
+        messages: List of message dictionaries with 'role' and 'content'
+        gen_length: Max new tokens to generate
+        steps: Number of diffusion steps
+        constraints: Dictionary mapping positions to *token IDs*
+        temperature: Sampling temperature
+        top_p: Nucleus sampling probability
+        top_k: Top-k sampling
+        alg: Remasking strategy ('origin', 'maskgit_plus', 'topk_margin', 'entropy')
+        alg_temp: Temperature for confidence-based remasking randomness
+        yield_intermediate: Whether to yield intermediate states for visualization
     Returns:
+        Yields visualization states or returns final state list, and final text.
     """
     if constraints is None:
+        constraints = {} # keys are positions relative to start of response
+    # --- Prepare Input ---
+    chat_input_text = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
+    input_ids = tokenizer(chat_input_text, return_tensors="pt")['input_ids'].to(device)
+    prompt_length = input_ids.shape[1]
+    max_length = prompt_length + gen_length
+    # Clamp max_length if it exceeds model capacity (use config value if available)
+    model_max_len = getattr(config, 'max_position_embeddings', 2048) # Default fallback
+    if max_length > model_max_len:
+        print(f"Warning: Requested length ({max_length}) exceeds model max ({model_max_len}). Clamping.")
+        max_length = model_max_len
+        gen_length = max_length - prompt_length
+        if gen_length <= 0:
+             print("Warning: Prompt is already at or exceeding model max length. Cannot generate.")
+             if yield_intermediate:
+                 yield [], "Error: Prompt too long."
+                 return
+             else:
+                 return [], "Error: Prompt too long."
+    # Initialize sequence 'x' with input_ids and padding with MASK_ID
+    x = torch.full((1, max_length), MASK_ID, dtype=torch.long, device=device)
+    x[:, :prompt_length] = input_ids.clone()
+    # Apply initial constraints to x (relative position -> absolute position)
     for rel_pos, token_id in constraints.items():
         abs_pos = prompt_length + rel_pos
+        if abs_pos < max_length:
+            # Ensure we don't overwrite prompt or special tokens accidentally
+            if token_id not in [MASK_ID, EOS_ID, PAD_ID]:
+                 x[:, abs_pos] = token_id
+            else:
+                 print(f"Warning: Skipped constraint for special token ID {token_id} at pos {rel_pos}")
+    # --- Visualization Setup ---
+    visualization_states = []
+    revealed_eos_pad = set() # Track positions where EOS/PAD was shown once
+    def get_vis_state(current_x, old_x, step_confidences=None):
+        nonlocal revealed_eos_pad
+        state = []
+        newly_revealed_in_step = False # Flag if any token changed from MASK
+        current_revealed_eos_pad = set() # Track EOS/PAD revealed *in this step*
+        for i in range(gen_length):
+            abs_pos = prompt_length + i
+            current_token_id = current_x[0, abs_pos].item()
+            old_token_id = old_x[0, abs_pos].item()
+            is_eos_or_pad = (current_token_id == EOS_ID or current_token_id == PAD_ID)
+            # Handle EOS/PAD hiding: Show once, then hide
+            if is_eos_or_pad and abs_pos in revealed_eos_pad:
+                state.append(("", "#FFFFFF")) # Make it invisible (white on white/transparent)
+                continue # Skip rest of logic for this pos
+            token_str = tokenizer.decode([current_token_id], skip_special_tokens=False) # Decode even specials initially
+            if current_token_id == MASK_ID:
+                color = "#444444" # Dark Gray for Mask
+                token_str = MASK_TOKEN # Display mask token string
+            elif old_token_id == MASK_ID: # Newly revealed in this step
+                newly_revealed_in_step = True
+                confidence = step_confidences.get(abs_pos, 0.5) # Get confidence if available, default 0.5
+                # Color based on confidence (adjust thresholds as needed)
+                # Note: Entropy confidence is negative, more negative = higher confidence
+                if alg == 'entropy':
+                     # Example thresholds for negative entropy (adjust based on observation)
+                    if confidence > -1.0: # Low confidence (high entropy)
+                        color = "#FF6666"  # Light Red
+                    elif confidence > -3.0: # Medium confidence
+                        color = "#FFAA33"  # Orange
+                    else: # High confidence (low entropy)
+                        color = "#66CC66"  # Light Green
+                else: # Standard confidence (probability or margin)
+                    if confidence < 0.3:
+                        color = "#FF6666"  # Light Red
+                    elif confidence < 0.7:
+                        color = "#FFAA33"  # Orange
+                    else:
+                        color = "#66CC66"  # Light Green
+                # If it's EOS/PAD revealed now, mark for future hiding
+                if is_eos_or_pad:
+                    current_revealed_eos_pad.add(abs_pos)
+            else: # Previously revealed
+                color = "#6699CC" # Light Blue
+            # Clean up token string for display (optional)
+            # token_str = token_str.replace(" ", " ") # Keep spaces visible
+            state.append((token_str, color))
+        # Update the global set of revealed EOS/PAD positions
+        revealed_eos_pad.update(current_revealed_eos_pad)
+        return state, newly_revealed_in_step
+    # Add initial state (all masked, constraints applied)
+    initial_vis_state, _ = get_vis_state(x, x) # Pass x as old_x initially
+    visualization_states.append(initial_vis_state)
+    if yield_intermediate:
+        yield initial_vis_state # Yield the starting state
+    # --- Diffusion Loop ---
+    timesteps = torch.linspace(1.0, 1e-3, steps + 1, device=device) # Use epsilon from Dream's defaults if needed
+    # Store the state before the loop starts
+    old_x = x.clone()
+    for i in range(steps):
+        # --- Core Dream Step ---
+        mask_index = (x == MASK_ID)
+        if not mask_index.any(): # Stop if no masks left
+            print(f"No masks left at step {i}. Stopping generation.")
             break
+        # Prepare attention mask (full attention for Dream unless specified otherwise)
+        # Dream's modeling code handles standard causal masking internally based on position_ids
+        # For diffusion, we typically allow attending to everything (masked or not)
+        # The `model` forward pass expects a standard causal mask or None
+        # Let's use None, assuming the model handles positions correctly
+        attention_mask = None # Or potentially create a full mask: torch.ones_like(x)
+        # Create position_ids (simple range for now)
+        position_ids = torch.arange(0, x.shape[1], device=device).unsqueeze(0)
+        # Model forward pass
+        outputs = model(input_ids=x, attention_mask=attention_mask, position_ids=position_ids)
+        logits = outputs.logits
+        # logits = torch.cat([logits[:,:1], logits[:, :-1]], dim=1) # Dream applies shift in utils, replicate if needed
+        # Select logits for masked positions ONLY
+        # Need to handle batch dimension (which is 1 here)
+        current_mask_indices_flat = torch.where(mask_index.flatten())[0]
+        if len(current_mask_indices_flat) == 0:
+            print(f"No mask indices found flat at step {i}. Stopping generation.")
+            break
+        # Use advanced indexing to get logits for masked positions
+        # Logits shape: [batch_size, seq_len, vocab_size]
+        # Mask_index shape: [batch_size, seq_len]
+        # We need logits corresponding to True values in mask_index
+        # Example: batch_idx = torch.where(mask_index)[0], seq_idx = torch.where(mask_index)[1]
+        # mask_logits = logits[batch_idx, seq_idx]
+        batch_indices, seq_indices = torch.where(mask_index)
+        mask_logits = logits[batch_indices, seq_indices] # Shape: [num_masked_tokens, vocab_size]
+        if mask_logits.numel() == 0: # Double check after indexing
+            print(f"No mask logits selected at step {i}. Stopping generation.")
+            break
+        t = timesteps[i]
+        s = timesteps[i + 1]
+        # --- Remasking Logic (Simplified from Dream's _sample) ---
+        step_confidences = {} # Store confidences for revealed tokens in this step {abs_pos: confidence}
+        if alg == 'origin':
+            p_transfer = (1.0 - s / t) if i < steps - 1 else 1.0
+            # Sample for all masked positions
+            confidence, x0_masked = sample_tokens_for_vis(mask_logits, temperature=temperature, top_p=top_p, top_k=top_k)
+            # Decide which ones to transfer based on random probability
+            transfer_mask = torch.rand(x0_masked.shape, device=device) < p_transfer
+            # Create a tensor of MASK_IDs, and fill in the transferred tokens
+            updates_for_masked_pos = torch.full_like(x0_masked, MASK_ID)
+            updates_for_masked_pos[transfer_mask] = x0_masked[transfer_mask]
+            # Update x at the masked positions
+            x[mask_index] = updates_for_masked_pos
+            # Store confidences for the *transferred* tokens for visualization
+            transferred_indices_flat = current_mask_indices_flat[transfer_mask]
+            transferred_confidences = confidence[transfer_mask]
+            for flat_idx, conf in zip(transferred_indices_flat, transferred_confidences):
+                 abs_pos = flat_idx.item() # Convert flat index back to seq position (assuming batch=1)
+                 step_confidences[abs_pos] = conf.item()
+        else: # Confidence-based algorithms ('maskgit_plus', 'topk_margin', 'entropy')
+            use_margin = (alg == 'topk_margin')
+            use_entropy = (alg == 'entropy')
+            # Sample potential replacements for ALL masked positions first
+            confidence, x0_masked = sample_tokens_for_vis(
+                mask_logits,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                margin_confidence=use_margin,
+                neg_entropy=use_entropy
+            )
+            num_mask_tokens = mask_index.sum().item()
+            # Calculate how many tokens to unmask/transfer in this step
+            num_transfer_tokens = int(num_mask_tokens * (1.0 - s / t)) if i < steps - 1 else num_mask_tokens
+            if num_transfer_tokens > 0 and confidence.numel() > 0:
+                transfer_indices_relative = None # Indices relative to the masked tokens
+                if alg_temp is None or alg_temp <= 0:
+                    # Deterministic: Select top-k confidence scores among masked tokens
+                    # Ensure k is not larger than the number of masked tokens
+                    k = min(num_transfer_tokens, confidence.shape[0])
+                    if k > 0:
+                       _, transfer_indices_relative = torch.topk(confidence, k)
+                else:
+                    # Stochastic: Sample based on confidence scores
+                    # Ensure probabilities are valid
+                    conf_probs = F.softmax(confidence / alg_temp, dim=-1)
+                    if not torch.isnan(conf_probs).any() and not torch.isinf(conf_probs).any() and conf_probs.sum() > 1e-6:
+                         # Ensure k is not larger than the number of masked tokens
+                        k = min(num_transfer_tokens, confidence.shape[0])
+                        if k > 0:
+                            transfer_indices_relative = torch.multinomial(conf_probs, num_samples=k, replacement=False)
+                    else:
+                        print(f"Warning: Invalid confidence probabilities at step {i}. Falling back to top-k.")
+                         # Fallback to deterministic if sampling fails
+                        k = min(num_transfer_tokens, confidence.shape[0])
+                        if k > 0:
+                            _, transfer_indices_relative = torch.topk(confidence, k)
+                if transfer_indices_relative is not None and transfer_indices_relative.numel() > 0:
+                    # Create updates, initially all MASK_ID
+                    updates_for_masked_pos = torch.full_like(x0_masked, MASK_ID)
+                    # Place the selected sampled tokens into the updates tensor
+                    updates_for_masked_pos[transfer_indices_relative] = x0_masked[transfer_indices_relative]
+                    # Update x at the original masked positions
+                    x[mask_index] = updates_for_masked_pos
+                     # Store confidences for the *transferred* tokens for visualization
+                    selected_confidences = confidence[transfer_indices_relative]
+                    # Get the absolute positions corresponding to these relative indices
+                    original_indices_flat = current_mask_indices_flat[transfer_indices_relative]
+                    for flat_idx, conf in zip(original_indices_flat, selected_confidences):
+                        abs_pos = flat_idx.item()
+                        step_confidences[abs_pos] = conf.item()
+                else:
+                    # No tokens were selected to transfer, x remains unchanged for masked parts
+                     pass # x[mask_index] remains MASK_ID essentially
+            else:
+                 # If num_transfer_tokens is 0, x remains unchanged for masked parts
+                 pass
+        # --- Apply Constraints and Finalize Step ---
+        # Ensure constraints are always maintained AFTER updates
+        for rel_pos, token_id in constraints.items():
+            abs_pos = prompt_length + rel_pos
+            if abs_pos < max_length:
+                 # Check if the position was masked before applying constraint
+                 # if mask_index[0, abs_pos]: # Only apply if it *was* a mask, maybe? Optional.
+                 x[:, abs_pos] = token_id
+        # --- Visualization Update ---
+        current_vis_state, newly_revealed = get_vis_state(x, old_x, step_confidences)
+        # Only add/yield if something actually changed or if it's the last step
+        if newly_revealed or i == steps - 1:
+            visualization_states.append(current_vis_state)
+            if yield_intermediate:
+                yield current_vis_state
+        # Update old_x for the next iteration
+        old_x = x.clone()
+    # --- Final Output ---
+    response_tokens = x[0, prompt_length:]
+    # Decode, cleaning up potential special tokens unless they are intended
+    final_text = tokenizer.decode(response_tokens,
+                                  skip_special_tokens=True, # Skip things like <|mask|> in final output
+                                  clean_up_tokenization_spaces=True)
+    # If not yielding intermediates, return the full list now
+    if not yield_intermediate:
+        return visualization_states, final_text
+    else:
+        # If yielding intermediates, we still need a way to signal completion
+        # and return the final text. Gradio's yield typically handles this if
+        # the last yielded value is the final one. We'll return the final text
+        # separately after the loop finishes in the calling function.
+        # The loop yields states, the calling function returns the final text.
+        pass # Final text is handled outside the generator function
+# --- Gradio UI ---
 # Custom stylesheet handed to gr.Blocks(css=css, ...) in create_chatbot_demo().
 # - Hides elements with class `category-legend` and makes every <button> 60px tall.
 # - Declares `.token-*` helper classes whose background colors match the hex
 #   values used for the visualization tuples (#444444 masked, #66CC66 / #FFAA33 /
 #   #FF6666 newly revealed, #6699CC previously revealed).
 # NOTE(review): nothing in this part of the file attaches the `.token-*` classes
 # to a component (the visualization passes (text, color) tuples instead) --
 # confirm whether they are used anywhere before relying on them.
 css = '''
 .category-legend{display:none}
 button{height: 60px}
+.token-revealed { transition: background-color 0.5s ease; } /* Optional: Add transition effect */
+.token-masked { background-color: #444444; color: white; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
+.token-new-high { background-color: #66CC66; color: black; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
+.token-new-mid { background-color: #FFAA33; color: black; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
+.token-new-low { background-color: #FF6666; color: black; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
+.token-old { background-color: #6699CC; color: white; padding: 1px 2px; margin: 1px; border-radius: 3px; display: inline-block; }
+.token-hidden { display: none; } /* Hide EOS/PAD after first reveal */
 '''
 def create_chatbot_demo():
     """Build the Gradio Blocks UI for the Dream 7B diffusion chat demo.

     The layout has a chat column (conversation, message box, constraint box),
     a visualization column showing the denoising states, and an accordion of
     generation settings. Sending a message runs two chained handlers: the
     first records the user turn, the second streams visualization states
     and finally writes the model's answer into the conversation.

     Relies on module-level names defined elsewhere in this file: ``gr``,
     ``css``, ``tokenizer``, ``EOS_ID``, ``format_chat_history``,
     ``parse_constraints`` and ``generate_response_with_visualization_dream``.

     Returns:
         The configured ``gr.Blocks`` instance (not launched).
     """
     with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
         gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
         gr.Markdown(
             "[[Model Card](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)] "
             "[[Blog](https://hkunlp.github.io/blog/2025/dream/)] "
             "[[Original LLaDA Demo Inspiration](https://huggingface.co/spaces/GSAI-ML/LLaDA-demo)]"
         )
         gr.Markdown(
             "**Note:** This demo visualizes the diffusion process in real-time. "
             "Tokens start masked (<font color='#444444'>[MASK]</font>) and are revealed step-by-step. "
             "Colors indicate confidence: <font color='#66CC66'>High</font>, "
             "<font color='#FFAA33'>Medium</font>, <font color='#FF6666'>Low</font>. "
             "Previously revealed tokens are <font color='#6699CC'>blue</font>. "
             f"EOS/PAD tokens ({tokenizer.decode([EOS_ID])}) are hidden after appearing once."
         )

         # STATE MANAGEMENT
         chat_history = gr.State([])
         current_response_text = gr.State("")  # final text of the latest response

         # UI COMPONENTS
         with gr.Row():
             with gr.Column(scale=3):
                 chatbot_ui = gr.Chatbot(label="Conversation", height=500, bubble_full_width=False)
                 # Message input
                 # NOTE(review): `scale` on the textbox/button below suggests a
                 # gr.Row wrapper was intended here -- confirm the desired layout.
                 with gr.Group():
                     user_input = gr.Textbox(
                         label="Your Message",
                         placeholder="Type your message here...",
                         scale=7,
                         show_label=False,
                     )
                     send_btn = gr.Button("Send", scale=1)
                 constraints_input = gr.Textbox(
                     label="Word Constraints (Relative Position)",
                     info="Place words at specific 0-indexed positions in the *response*. Format: 'pos:word, pos:word'. Example: '0:Once, 5:upon, 10:time'",
                     placeholder="0:Hello, 10:world",
                     value="",
                 )
             with gr.Column(scale=2):
                 # The legend is disabled; the Markdown note above explains the colors.
                 # NOTE(review): confirm the installed Gradio version accepts
                 # `height` on HighlightedText.
                 output_vis = gr.HighlightedText(
                     label="Denoising Process Visualization",
                     show_legend=False,
                     combine_adjacent=False,  # keep every token as its own span
                     height=550,
                 )

         # Advanced generation settings
         with gr.Accordion("Generation Settings", open=False):
             with gr.Row():
                 gen_length = gr.Slider(
                     minimum=16, maximum=512, value=64, step=8,
                     label="Max New Tokens",
                 )
                 steps = gr.Slider(
                     minimum=8, maximum=512, value=64, step=4,
                     label="Diffusion Steps",
                 )
             with gr.Row():
                 temperature = gr.Slider(
                     minimum=0.0, maximum=1.5, value=0.2, step=0.05,
                     label="Temperature",
                 )
                 top_p = gr.Slider(
                     minimum=0.0, maximum=1.0, value=0.95, step=0.05,
                     label="Top-P (Nucleus Sampling)",
                 )
             with gr.Row():
                 # Dream specific algorithm choice
                 alg_strategy = gr.Radio(
                     choices=["entropy", "maskgit_plus", "topk_margin", "origin"],
                     value="entropy",
                     label="Algorithm (`alg`)",
                 )
                 alg_temp = gr.Slider(
                     minimum=0.0, maximum=1.0, value=0.1, step=0.01,
                     label="Algorithm Temp (`alg_temp`)",
                 )
             with gr.Row():
                 visualization_delay = gr.Slider(
                     minimum=0.0, maximum=0.5, value=0.03, step=0.01,
                     label="Visualization Delay (seconds)",
                 )

         # Clear button
         clear_btn = gr.Button("Clear Conversation")

         # --- Helper Functions for UI ---
         def add_message_to_history(history, message, response):
             """Append a ``[message, response]`` pair to ``history`` in place and return it."""
             history.append([message, response])
             return history

         def user_message_action(message, history):
             """Record the user's message and reset the input box, visualization and response text.

             Blank messages leave the history untouched. A real message is
             appended with ``None`` as its response, which marks the turn as
             pending for ``bot_response_generator``.
             """
             if message and message.strip():
                 history = add_message_to_history(history, message, None)
             # (history state, chatbot display, input box, visualization, response text)
             return history, history, "", [], ""

         def _drain_generator(gen):
             """Run ``gen`` to exhaustion and return the value of its ``return`` statement.

             ``generate_response_with_visualization_dream`` contains ``yield``,
             so calling it always produces a generator; its
             ``return states, text`` result is only reachable through
             ``StopIteration.value``.
             """
             while True:
                 try:
                     next(gen)
                 except StopIteration as stop:
                     return stop.value

         def bot_response_generator(
             history, gen_length, steps, constraints_str, delay,
             temperature, top_p,
             alg, alg_temp
         ):
             """Stream visualization states for the pending turn, then the final answer.

             Yields ``(chat history, visualization state, response text)``
             tuples. The last entry of ``history`` is updated in place: it shows
             ``"..."`` while generating and ends as the final text or an error
             message.
             """
             if not history or history[-1][1] is not None:
                 # Nothing pending: blank input, or the turn was already answered.
                 print("History empty or last message already processed.")
                 yield history, [], ""
                 return

             last_user_message = history[-1][0]
             print(f"Generating response for: {last_user_message}")
             try:
                 # Earlier turns first, then the message that is being answered.
                 messages = format_chat_history(history[:-1])
                 messages.append({"role": "user", "content": last_user_message})

                 parsed_constraints = parse_constraints(constraints_str)
                 print(f"Parsed constraints: {parsed_constraints}")

                 # One argument set for both passes so they cannot drift apart.
                 # A slider value of 0 means "disabled" and is passed as None.
                 sampling_kwargs = dict(
                     gen_length=int(gen_length),
                     steps=int(steps),
                     constraints=parsed_constraints,
                     temperature=temperature,
                     top_p=top_p if top_p > 0 else None,
                     top_k=None,  # no top-k control is exposed in the UI
                     alg=alg,
                     alg_temp=alg_temp if alg_temp > 0 else None,
                 )

                 # Pass 1: stream intermediate states for the live visualization.
                 streaming = generate_response_with_visualization_dream(
                     messages, yield_intermediate=True, **sampling_kwargs
                 )
                 for vis_state in streaming:
                     if isinstance(vis_state, tuple):
                         # The generator reports "prompt too long" as a
                         # (states, message) tuple instead of a token list.
                         _, error_text = vis_state
                         history[-1][1] = error_text
                         yield history, [], error_text
                         return
                     history[-1][1] = "..."  # placeholder while generating
                     yield history, vis_state, "..."
                     if delay > 0:
                         time.sleep(delay)

                 # Pass 2: the streaming generator does not return the decoded
                 # text, so run the non-streaming mode and read its return
                 # value. This generates a second time; with sampling enabled
                 # the final text may differ from what was streamed above.
                 print("Re-generating final text (non-streaming)...")
                 result = _drain_generator(
                     generate_response_with_visualization_dream(
                         messages, yield_intermediate=False, **sampling_kwargs
                     )
                 )
                 final_vis_states, final_text = result if result is not None else ([], "")
                 print(f"Final Text: {final_text}")

                 final_text = final_text.strip() if final_text else ""
                 history[-1][1] = final_text or "[No response]"
                 final_state = final_vis_states[-1] if final_vis_states else []
                 yield history, final_state, final_text
             except Exception as e:
                 # UI boundary: show the failure in the chat instead of crashing the app.
                 import traceback
                 print(f"Error during generation: {e}")
                 traceback.print_exc()
                 error_msg = f"Error: {str(e)}"
                 history[-1][1] = error_msg
                 error_vis = [(error_msg, "#FF0000")]
                 yield history, error_vis, error_msg

         def clear_conversation_action():
             """Reset history state, chatbot display, response text and visualization."""
             return [], [], "", []

         # --- Event Wiring ---
         # `user_input.submit` / `send_btn.click` are listener *methods*: each
         # must be called to register the first handler, and `.then()` is
         # chained on the object that call returns.
         for trigger in (user_input.submit, send_btn.click):
             trigger(
                 fn=user_message_action,
                 inputs=[user_input, chat_history],
                 outputs=[chat_history, chatbot_ui, user_input, output_vis, current_response_text],
                 queue=True,
             ).then(
                 fn=bot_response_generator,
                 inputs=[
                     chat_history, gen_length, steps, constraints_input, visualization_delay,
                     temperature, top_p,
                     alg_strategy, alg_temp,
                 ],
                 outputs=[chatbot_ui, output_vis, current_response_text],
             )

         # Clear Button Action
         clear_btn.click(
             fn=clear_conversation_action,
             inputs=[],
             outputs=[chat_history, chatbot_ui, current_response_text, output_vis],
             queue=False,  # clearing is instant, no need to queue
         )
     return demo
+# --- Launch ---
 if __name__ == "__main__":
     # Script entry point: warn when the Dream model source files are not next
     # to this script, then build the UI and serve it through Gradio's queue.
     import os

     # Checked in this order; the warning is advisory and startup continues.
     required_model_files = ("modeling_dream.py", "configuration_dream.py")
     if not all(os.path.exists(file_name) for file_name in required_model_files):
         print("\nERROR: Could not find 'modeling_dream.py' and/or 'configuration_dream.py'.")
         print("Please make sure these files (from the 'dream_model.txt' source) are in the same directory as this script.")
         print("You might need to extract them from the provided text file.")

     print("Creating Gradio Demo...")
     demo = create_chatbot_demo()

     print("Launching Gradio Demo...")
     # Queueing keeps long generations from blocking other requests;
     # debug=True gives more detailed logs.
     queued_app = demo.queue()
     queued_app.launch(share=True, debug=True)