multimodalart HF Staff committed on
Commit f4ff30a · verified · 1 Parent(s): cfc13ea

Update app.py

Files changed (1)
  1. app.py +329 -389
app.py CHANGED
@@ -2,57 +2,62 @@
2
  import torch
3
  import numpy as np
4
  import gradio as gr
5
- import spaces
6
  import torch.nn.functional as F
7
  from transformers import AutoTokenizer, AutoModel, AutoConfig
8
  import time
9
- import copy
 
 
 
 
 
 
 
10
 
11
  # Determine device
12
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
13
  print(f"Using device: {device}")
14
 
15
- # --- Model and Tokenizer Loading ---
16
- model_path = "Dream-org/Dream-v0-Instruct-7B"
17
-
18
- print(f"Loading tokenizer from {model_path}...")
19
- # Load configuration first to get special token IDs
20
- config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
21
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
22
-
23
- print(f"Loading model from {model_path}...")
24
  model = AutoModel.from_pretrained(
25
  model_path,
26
- torch_dtype=torch.bfloat16,
27
  trust_remote_code=True
28
- ).to(device).eval()
29
- print("Model loaded successfully.")
 
30
 
31
- # --- Constants from Dream Model ---
32
- # Get IDs directly from config or tokenizer if available
33
  MASK_TOKEN = tokenizer.mask_token
34
- MASK_ID = config.mask_token_id if hasattr(config, 'mask_token_id') else tokenizer.mask_token_id
35
- EOS_ID = config.eos_token_id if hasattr(config, 'eos_token_id') else tokenizer.eos_token_id
36
- PAD_ID = config.pad_token_id if hasattr(config, 'pad_token_id') else tokenizer.pad_token_id # Often same as EOS
37
-
38
- print(f"MASK_TOKEN: '{MASK_TOKEN}', MASK_ID: {MASK_ID}")
39
- print(f"EOS_ID: {EOS_ID}, PAD_ID: {PAD_ID}")
40
- if MASK_ID is None:
41
- raise ValueError("Could not determine MASK_ID from model config or tokenizer.")
42
- if EOS_ID is None:
43
- raise ValueError("Could not determine EOS_ID from model config or tokenizer.")
44
- if PAD_ID is None:
45
- raise ValueError("Could not determine PAD_ID from model config or tokenizer.")
46
-
47
 
48
  # --- Helper Functions ---
49
 
50
- def parse_constraints(constraints_text, tokenizer):
51
- """Parse constraints in format: 'position:word, position:word, ...'"""
 
 
 
 
52
  constraints = {}
53
- processed_constraints_tokens = {}
54
  if not constraints_text:
55
- return constraints, processed_constraints_tokens
56
 
57
  parts = constraints_text.split(',')
58
  for part in parts:
@@ -60,270 +65,292 @@ def parse_constraints(constraints_text, tokenizer):
60
  continue
61
  pos_str, word = part.split(':', 1)
62
  try:
 
63
  pos = int(pos_str.strip())
64
  word = word.strip()
65
- if word and pos >= 0:
66
- # Store original word constraint for display/debugging if needed
67
- constraints[pos] = word
68
- # Tokenize the word (add space for consistency if not BOS)
69
- # Note: Dream tokenizer might handle spaces differently, adjust if needed
70
- prefix = " " if pos > 0 else ""
71
- tokens = tokenizer.encode(prefix + word, add_special_tokens=False)
72
- for i, token_id in enumerate(tokens):
73
- # Prevent overwriting multi-token constraints partially
74
- if pos + i not in processed_constraints_tokens:
75
- processed_constraints_tokens[pos + i] = token_id
76
  except ValueError:
77
- continue
78
  except Exception as e:
79
- print(f"Error tokenizing constraint word '{word}': {e}")
80
- continue
 
 
81
 
82
- # Sort by position for consistent application
83
- processed_constraints_tokens = dict(sorted(processed_constraints_tokens.items()))
84
- print(f"Parsed Constraints (Word): {constraints}")
85
- print(f"Parsed Constraints (Tokens): {processed_constraints_tokens}")
86
- return constraints, processed_constraints_tokens
87
 
88
- def format_chat_history(history):
89
  """
90
- Format chat history for the Dream model using its chat template convention.
91
 
92
  Args:
93
- history: List of [user_message, assistant_message] pairs
 
94
 
95
  Returns:
96
- Formatted list of message dictionaries for the model
97
  """
98
  messages = []
99
- # Add system prompt if not present (standard practice)
100
- if not history or history[0][0] is None or history[0][0].lower() != "system":
101
- messages.append({"role": "system", "content": "You are a helpful assistant."})
 
102
 
103
  for user_msg, assistant_msg in history:
104
- if user_msg is not None: # Handle potential system message case
105
  messages.append({"role": "user", "content": user_msg})
106
- if assistant_msg: # Skip if None (for the latest user message)
 
107
  messages.append({"role": "assistant", "content": assistant_msg})
108
 
109
  return messages
110
 
111
- # --- Core Generation Logic with Visualization Hook ---
112
-
113
- @spaces.GPU
114
- def generate_response_with_visualization(
115
- messages, # List of message dictionaries
116
- gen_length=64,
117
- steps=64,
118
- constraints_text="", # Raw constraint text
119
- temperature=0.2,
120
- top_p=0.95,
121
- top_k=None, # Added for Dream
122
- alg="entropy", # Changed from remasking
123
- alg_temp=0.0, # Added for Dream
124
- visualization_delay=0.05,
125
- tokenizer=tokenizer,
126
- model=model,
127
- device=device,
128
- MASK_ID=MASK_ID,
129
- EOS_ID=EOS_ID,
130
- PAD_ID=PAD_ID
131
- ):
132
  """
133
- Generate text with Dream model with real-time visualization using a hook.
 
134
  """
135
- visualization_states = []
136
- final_text = ""
137
- # Use a list to hold previous_x, allowing nonlocal modification
138
- # Initialize with None, it will be set after the first hook call
139
- shared_state = {'previous_x': None}
140
 
 
 
 
 
 
 
 
 
 
 
 
141
 
 
142
  try:
143
- # --- 1. Prepare Inputs ---
144
- _, parsed_constraints_tokens = parse_constraints(constraints_text, tokenizer)
145
-
146
- # Apply chat template
147
- # Important: Keep tokenize=False initially to get prompt length correctly
148
- # The template adds roles and special tokens like <|im_start|> etc.
149
- chat_input_text = tokenizer.apply_chat_template(
150
- messages,
151
- add_generation_prompt=True, # Adds the prompt for the assistant's turn
152
- tokenize=False
153
  )
154
-
155
- # Tokenize the full templated chat string
156
- inputs = tokenizer(chat_input_text, return_tensors="pt", return_dict=True)
157
  input_ids = inputs.input_ids.to(device)
158
- attention_mask = inputs.attention_mask.to(device) # Use mask from tokenizer
159
-
160
  prompt_length = input_ids.shape[1]
161
- total_length = prompt_length + gen_length
162
-
163
- # --- 2. Initialize Generation Sequence ---
164
- # Start with the prompt, pad the rest with MASK_ID
165
- x = torch.full((1, total_length), MASK_ID, dtype=torch.long, device=device)
166
- x[:, :prompt_length] = input_ids.clone()
167
- attention_mask = F.pad(attention_mask, (0, gen_length), value=1) # Extend attention mask
168
-
169
- # Apply initial constraints to the masked sequence `x`
170
- for pos, token_id in parsed_constraints_tokens.items():
171
- absolute_pos = prompt_length + pos
172
- if absolute_pos < total_length:
173
- print(f"Applying initial constraint at pos {absolute_pos}: token {token_id}")
174
- x[:, absolute_pos] = token_id
175
-
176
- # Store initial state (prompt + all masked) for visualization
177
- initial_state_vis = []
178
- # Add prompt tokens (optional visualization, could be grayed out or skipped)
179
- # for i in range(prompt_length):
180
- # token_str = tokenizer.decode([x[0, i].item()], skip_special_tokens=True)
181
- # initial_state_vis.append((token_str if token_str else " ", "#AAAAAA")) # Gray for prompt
182
-
183
- # Add masked tokens for the generation part
184
- for _ in range(gen_length):
185
- initial_state_vis.append((MASK_TOKEN, "#444444")) # Dark gray for masks
186
- visualization_states.append(initial_state_vis)
187
- shared_state['previous_x'] = x.clone() # Initialize previous_x
188
-
189
-
190
- # --- 3. Define the Visualization Hook ---
191
- def generation_tokens_hook_func(step, current_x_hook, logits):
192
- # nonlocal previous_x # Allow modification of the outer scope variable
193
- current_x_hook = current_x_hook.clone() # Work on a copy
194
-
195
- # --- Apply constraints within the hook ---
196
- # This ensures constraints are respected even if the model tries to overwrite them
197
- for pos, token_id in parsed_constraints_tokens.items():
198
- absolute_pos = prompt_length + pos
199
- if absolute_pos < total_length:
200
- current_x_hook[:, absolute_pos] = token_id
201
- # --- End Constraint Application ---
202
-
203
- if shared_state['previous_x'] is None: # First call
204
- shared_state['previous_x'] = current_x_hook.clone()
205
- return current_x_hook # Must return the (potentially modified) sequence
206
-
207
- # Generate visualization state for this step
208
- current_state_vis = []
209
- prev_x_step = shared_state['previous_x']
210
-
211
- for i in range(gen_length):
212
- pos = prompt_length + i # Absolute position in the sequence
213
- current_token_id = current_x_hook[0, pos].item()
214
- prev_token_id = prev_x_step[0, pos].item()
215
-
216
- # Decode token, handling special tokens we want to hide
217
- token_str = ""
218
- color = "#444444" # Default: Dark Gray (Mask)
219
- token_str_raw = tokenizer.decode([current_token_id], skip_special_tokens=False) # Keep special tokens for ID check
220
-
221
- if current_token_id == MASK_ID:
222
- token_str = MASK_TOKEN
223
- color = "#444444" # Dark gray
224
- elif current_token_id == EOS_ID or current_token_id == PAD_ID:
225
- token_str = "" # Hide EOS/PAD visually
226
- color = "#DDDDDD" # Use a light gray or make transparent if possible
227
- else:
228
- # Decode without special tokens for display if it's not MASK/EOS/PAD
229
- token_str = tokenizer.decode([current_token_id], skip_special_tokens=True).strip()
230
- if not token_str: token_str = token_str_raw # Fallback if strip removes everything (e.g., space)
231
-
232
- if prev_token_id == MASK_ID:
233
- # Newly revealed in this step
234
- color = "#66CC66" # Light green (Simplified from confidence levels)
235
- else:
236
- # Previously revealed
237
- color = "#6699CC" # Light blue
238
-
239
- current_state_vis.append((token_str if token_str else " ", color)) # Ensure non-empty tuple element
240
-
241
- visualization_states.append(current_state_vis)
242
- shared_state['previous_x'] = current_x_hook.clone() # Update previous_x for the next step
243
 
244
- return current_x_hook # Return the sequence (constraints applied)
 
 
 
 
 
245
 
246
- # --- 4. Run Diffusion Generation ---
247
- print("Starting diffusion generation...")
248
- start_time = time.time()
249
  output = model.diffusion_generate(
250
- input_ids=x[:, :prompt_length], # Pass only the initial prompt to diffusion_generate
251
- # as it handles the masking internally based on MASK_ID
252
- attention_mask=attention_mask, # Provide the full attention mask
253
  max_new_tokens=gen_length,
254
  output_history=False, # We capture history via the hook
255
  return_dict_in_generate=True,
256
  steps=steps,
257
  temperature=temperature,
258
- top_p=top_p,
259
- top_k=top_k,
260
  alg=alg,
261
- alg_temp=alg_temp if alg != 'origin' else None, # alg_temp only for confidence-based
262
- # Pass the hook function
263
- generation_tokens_hook_func=generation_tokens_hook_func,
264
- # Ensure the initial masked sequence `x` is used correctly if needed by internal logic
265
- # Depending on the exact implementation of diffusion_generate, passing x directly might be needed
266
- # Check Dream's generation_utils if issues arise. For now, assume it uses input_ids + max_new_tokens
267
  )
268
  end_time = time.time()
269
- print(f"Diffusion generation finished in {end_time - start_time:.2f} seconds.")
270
 
271
- # --- 5. Process Final Output ---
272
- # The hook has already built visualization_states
273
  final_sequence = output.sequences[0]
274
-
275
- # Decode the generated part, skipping special tokens for the final text output
276
  response_tokens = final_sequence[prompt_length:]
277
- # Filter out PAD tokens before final decode, keep EOS if needed conceptually, but skip for clean text
278
- response_tokens_cleaned = [tok for tok in response_tokens if tok != PAD_ID] # Keep EOS initially if needed
279
 
280
- final_text = tokenizer.decode(
281
- response_tokens_cleaned,
282
- skip_special_tokens=True, # Skip EOS, BOS, etc.
283
- clean_up_tokenization_spaces=True # Recommended for cleaner output
 
284
  ).strip()
285
 
286
- # Ensure the last state in visualization matches the final text (debug check)
287
- # print(f"Last Vis State Tokens: {''.join([t[0] for t in visualization_states[-1]]).strip()}")
288
- # print(f"Final Decoded Text: {final_text}")
289
 
290
  except Exception as e:
291
- print(f"Error during generation: {e}")
292
  import traceback
293
  traceback.print_exc()
294
- # Add error message to visualization
295
- error_msg = f"Error: {str(e)}"
296
- visualization_states.append([(error_msg, "red")])
297
- final_text = error_msg # Display error in the chatbot too
 
 
298
 
299
- # Make sure at least the initial state is present
300
- if not visualization_states:
301
- visualization_states.append([("Error: No states generated.", "red")])
 
 
 
 
 
 
302
 
 
 
303
 
304
- return visualization_states, final_text
 
305
 
306
- # --- Gradio UI Definition ---
 
307
 
 
 
 
 
308
  css = '''
309
  .category-legend{display:none}
310
- button{height: 60px}
311
- .token-text { white-space: pre; } /* Preserve spaces in tokens */
312
- footer { display: none !important; visibility: hidden !important; }
313
  '''
314
  def create_chatbot_demo():
315
  with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
316
  gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
317
  gr.Markdown(
318
  "[[Model Card](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)] "
319
- "[[Blog Post](https://hkunlp.github.io/blog/2025/dream/)] "
320
- "(Note: Visualization shows token reveal steps, colors indicate status: Gray=Masked, Green=Newly Revealed, Blue=Previously Revealed)"
321
  )
322
 
323
  # STATE MANAGEMENT
324
  chat_history = gr.State([])
325
- # Store constraints parsed into token IDs
326
- parsed_constraints_state = gr.State({})
327
 
328
  # UI COMPONENTS
329
  with gr.Row():
@@ -331,8 +358,9 @@ def create_chatbot_demo():
331
  chatbot_ui = gr.Chatbot(
332
  label="Conversation",
333
  height=500,
334
- bubble_full_width=False # Makes bubbles wrap content
335
- )
 
336
 
337
  # Message input
338
  with gr.Group():
@@ -340,217 +368,129 @@ def create_chatbot_demo():
340
  user_input = gr.Textbox(
341
  label="Your Message",
342
  placeholder="Type your message here...",
 
 
343
  show_label=False,
344
- scale=7
345
  )
346
- send_btn = gr.Button("Send", scale=1)
 
347
 
348
  constraints_input = gr.Textbox(
349
- label="Word Constraints (Experimental)",
350
- info="Place specific words at positions (0-indexed). Format: 'pos:word, pos:word'. Example: '0:Once, 5:upon, 10:time'. Multi-token words supported.",
351
- placeholder="0:The, 10:story",
352
  value=""
353
  )
354
  with gr.Column(scale=2):
355
  output_vis = gr.HighlightedText(
356
  label="Denoising Process Visualization",
357
  combine_adjacent=False,
358
- show_legend=False, # Legend not very informative here
 
359
  )
360
 
361
  # Advanced generation settings
362
  with gr.Accordion("Generation Settings", open=False):
363
  with gr.Row():
364
  gen_length = gr.Slider(
365
- minimum=16, maximum=512, value=128, step=8,
366
  label="Max New Tokens"
367
  )
368
  steps = gr.Slider(
369
- minimum=8, maximum=512, value=128, step=4,
370
- label="Denoising Steps"
371
  )
372
  with gr.Row():
373
  temperature = gr.Slider(
374
- minimum=0.0, maximum=1.0, value=0.2, step=0.05,
375
  label="Temperature"
376
  )
 
 
 
 
 
 
377
  top_p = gr.Slider(
378
- minimum=0.1, maximum=1.0, value=0.95, step=0.05,
379
- label="Top-P"
380
  )
381
  top_k = gr.Slider(
382
  minimum=0, maximum=200, value=0, step=5,
383
  label="Top-K (0=disabled)"
384
  )
 
385
  with gr.Row():
386
- alg = gr.Radio(
387
  choices=['origin', 'maskgit_plus', 'topk_margin', 'entropy'],
388
- value='entropy',
389
- label="Sampling Algorithm (`alg`)"
390
- )
391
- alg_temp = gr.Slider(
392
- minimum=0.0, maximum=1.0, value=0.0, step=0.05,
393
- label="Algorithm Temp (`alg_temp`, adds randomness to confidence-based `alg`)"
394
- )
395
 
396
  with gr.Row():
397
  visualization_delay = gr.Slider(
398
- minimum=0.0, maximum=0.5, value=0.02, step=0.01,
399
  label="Visualization Delay (seconds)"
400
  )
401
 
402
  # Clear button
403
  clear_btn = gr.Button("Clear Conversation")
404
 
405
- # --- Event Handlers ---
406
- def add_message(history, message, response):
407
- """Add a message pair to the history and return the updated history"""
408
- # Ensure history is a list
409
- if not isinstance(history, list):
410
- history = []
411
- history.append([message, response])
412
- return history
413
-
414
- def user_message_submitted(message, history):
415
- """Process a submitted user message"""
416
- if not message.strip():
417
- return history, history, "", [] # No change if empty
418
-
419
- # Add user message (response is None for now)
420
- history = add_message(history, message, None)
421
-
422
- # Return updated history for display, clear input box
423
- return history, history, "", [] # history, chatbot_ui, user_input, output_vis
424
-
425
-
426
- def bot_response_stream(
427
- history, # Current chat history (list of lists)
428
- gen_length, steps, constraints, # Generation settings
429
- temperature, top_p, top_k, alg, alg_temp, # Sampling settings
430
- delay # Visualization delay
431
- ):
432
- """Generate bot response and stream visualization states"""
433
- if not history or history[-1][1] is not None: # Check if history is present and last response isn't already set
434
- print("Skipping bot response generation: No new user message.")
435
- # Yield empty state if needed to prevent errors downstream
436
- # Ensure history is returned correctly if nothing happens
437
- yield history, [], "Internal Error: No user message found."
438
- return
439
-
440
- # Format messages for the model
441
- # Exclude the last entry as it only contains the user message
442
- messages_for_model = format_chat_history(history) # Already includes system prompt
443
-
444
- print("\n--- Generating Bot Response ---")
445
- print(f"History: {history}")
446
- print(f"Messages for model: {messages_for_model}")
447
- print(f"Constraints text: '{constraints}'")
448
- print(f"Gen length: {gen_length}, Steps: {steps}, Temp: {temperature}, Top-P: {top_p}, Top-K: {top_k}, Alg: {alg}, Alg Temp: {alg_temp}")
449
-
450
- # Call the generation function
451
- vis_states, response_text = generate_response_with_visualization(
452
- messages_for_model,
453
- gen_length=gen_length,
454
- steps=steps,
455
- constraints_text=constraints,
456
- temperature=temperature,
457
- top_p=top_p if top_p < 1.0 else None, # None disables top-p
458
- top_k=top_k if top_k > 0 else None, # None disables top-k
459
- alg=alg,
460
- alg_temp=alg_temp,
461
- visualization_delay=delay,
462
- # Pass other necessary args like tokenizer, model if not global
463
- )
464
-
465
- print(f"Generated response text: '{response_text}'")
466
- print(f"Number of visualization states: {len(vis_states)}")
467
-
468
-
469
- # Update the history with the final response
470
- # Make sure history is mutable if needed or reassign
471
- if history:
472
- history[-1][1] = response_text
473
- else:
474
- print("Warning: History was empty when trying to update response.")
475
-
476
-
477
- # Stream the visualization states
478
- if not vis_states:
479
- print("Warning: No visualization states were generated.")
480
- # Yield something to prevent downstream errors
481
- yield history, [("Error: No visualization.", "red")], response_text
482
- return
483
 
484
- # Yield initial state immediately if desired, or just start loop
485
- # yield history, vis_states[0], response_text
486
 
487
- for state in vis_states:
488
- yield history, state, response_text # Yield updated history, current vis state, final text
489
- time.sleep(delay) # Pause between steps
 
 
490
 
491
- # Final yield to ensure the last state is displayed
492
- yield history, vis_states[-1], response_text
 
 
493
 
494
 
495
  def clear_conversation():
496
- """Clear the conversation history and visualization"""
497
- return [], [], "", [] # history, chatbot, user_input, output_vis
498
-
499
- # --- Event Wiring ---
 
 
 
500
 
501
- # Clear button
502
  clear_btn.click(
503
- fn=clear_conversation,
504
  inputs=[],
505
  outputs=[chat_history, chatbot_ui, user_input, output_vis]
506
  )
507
 
508
- # User message submission flow (2-step using .then)
509
- # 1. User submits message -> Update history and chatbot UI immediately
510
- submit_action = user_input.submit(
511
- fn=user_message_submitted,
512
- inputs=[user_input, chat_history],
513
- outputs=[chat_history, chatbot_ui, user_input, output_vis] # Update chatbot, clear input
514
- )
515
-
516
- # Connect send button to the same function
517
- send_action = send_btn.click(
518
- fn=user_message_submitted,
519
- inputs=[user_input, chat_history],
520
- outputs=[chat_history, chatbot_ui, user_input, output_vis]
521
- )
522
-
523
- # 2. After UI update -> Trigger bot response generation and streaming
524
- # Use the updated chat_history from the first step
525
- submit_action.then(
526
- fn=bot_response_stream,
527
- inputs=[
528
- chat_history, gen_length, steps, constraints_input,
529
- temperature, top_p, top_k, alg, alg_temp,
530
- visualization_delay
531
- ],
532
- outputs=[chatbot_ui, output_vis, user_input] # Update chatbot, visualization. Keep user_input as output to potentially display final text/error? (Check Gradio docs for Textbox output binding on yield) Let's remove user_input from outputs here.
533
- )
534
-
535
- send_action.then(
536
- fn=bot_response_stream,
537
- inputs=[
538
- chat_history, gen_length, steps, constraints_input,
539
- temperature, top_p, top_k, alg, alg_temp,
540
- visualization_delay
541
- ],
542
- outputs=[chatbot_ui, output_vis] # Update chatbot and visualization
543
- )
544
-
545
- # Clear input after send/submit (already handled in user_message_submitted)
546
- # submit_action.then(lambda: "", outputs=user_input)
547
- # send_action.then(lambda: "", outputs=user_input)
548
-
549
-
550
  return demo
551
 
552
- # --- Launch the Gradio App ---
553
  if __name__ == "__main__":
554
  demo = create_chatbot_demo()
555
- # Using queue for streaming and handling multiple users
556
- demo.queue().launch(debug=True, share=True)
 
2
  import torch
3
  import numpy as np
4
  import gradio as gr
5
+ import spaces # Ensure spaces is installed if needed for GPU decorator
6
  import torch.nn.functional as F
7
  from transformers import AutoTokenizer, AutoModel, AutoConfig
8
  import time
9
+ import re
10
+ from typing import List, Dict, Tuple, Optional
11
+
12
+ # Load model configuration to get special token IDs
13
+ config = AutoConfig.from_pretrained("Dream-org/Dream-v0-Instruct-7B", trust_remote_code=True)
14
+ # Use AutoModel for the base model loading, relying on trust_remote_code=True
15
+ # for the custom DreamModel class and generation mixin.
16
+ model_path = "Dream-org/Dream-v0-Instruct-7B"
17
 
18
  # Determine device
19
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
20
  print(f"Using device: {device}")
21
 
22
+ # Load model and tokenizer
23
+ print("Loading tokenizer...")
 
 
 
 
24
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
25
+ print("Loading model...")
26
+ # Ensure torch_dtype is set appropriately for your hardware if needed
27
  model = AutoModel.from_pretrained(
28
  model_path,
29
+ torch_dtype=torch.bfloat16 if device == 'cuda' else torch.float32, # Use bfloat16 only on CUDA
30
  trust_remote_code=True
31
+ )
32
+ model = model.to(device).eval()
33
+ print("Model loaded.")
34
 
35
+ # Constants from Dream's config/tokenizer
36
+ # Use attributes from loaded config/tokenizer objects
37
  MASK_TOKEN = tokenizer.mask_token
38
+ MASK_ID = config.mask_token_id
39
+ PAD_ID = config.pad_token_id
40
+ EOS_ID = config.eos_token_id
41
+ # Make sure EOS_ID and PAD_ID are handled correctly; Dream uses the same ID for both
42
+ SPECIAL_TOKEN_IDS = {PAD_ID, EOS_ID, MASK_ID}
43
+ # Add other special tokens defined in tokenizer_config.json if needed for hiding
44
+ # Get IDs for im_start, im_end etc. if they should also be hidden/handled specially
45
+ IM_START_ID = tokenizer.convert_tokens_to_ids("<|im_start|>")
46
+ IM_END_ID = tokenizer.convert_tokens_to_ids("<|im_end|>")
47
+ SPECIAL_TOKEN_IDS.add(IM_START_ID)
48
+ SPECIAL_TOKEN_IDS.add(IM_END_ID)
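# Optional sanity check (a sketch, reproducing the explicit validation the previous
# revision performed): the constraint and visualization logic below assumes these IDs
# resolved to real integers rather than None.
for _name, _tok_id in (("MASK_ID", MASK_ID), ("EOS_ID", EOS_ID), ("PAD_ID", PAD_ID)):
    if _tok_id is None:
        raise ValueError(f"Could not determine {_name} from model config or tokenizer.")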
 
 
49
 
50
  # --- Helper Functions ---
51
 
52
+ def parse_constraints(constraints_text: str) -> Dict[int, List[int]]:
53
+ """
54
+ Parse constraints in format: 'position:word, position:word, ...'
55
+ Returns a dictionary mapping the starting position (0-indexed from the start
56
+ of the *generated* sequence) to a list of token IDs for the constraint word.
57
+ """
58
  constraints = {}
 
59
  if not constraints_text:
60
+ return constraints
61
 
62
  parts = constraints_text.split(',')
63
  for part in parts:
 
65
  continue
66
  pos_str, word = part.split(':', 1)
67
  try:
68
+ # Position relative to the start of the *generation*
69
  pos = int(pos_str.strip())
70
  word = word.strip()
71
+ # Prefix a leading space when the constraint does not start the generation, so the word
72
+ # tokenizes as it would mid-sentence (this assumes standard leading-space handling by the Dream tokenizer):
73
+ token_ids = tokenizer.encode(" " + word if pos > 0 else word, add_special_tokens=False)
74
+
75
+ if token_ids and pos >= 0:
76
+ constraints[pos] = token_ids
 
 
 
 
 
77
  except ValueError:
78
+ continue # Ignore malformed constraint parts
79
  except Exception as e:
80
+ print(f"Warning: Error processing constraint '{part}': {e}")
81
+ continue
82
+
83
+ return constraints
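# Illustrative usage (a sketch): positions are relative to the start of the generated
# text, and each maps to the token IDs the generation hook will force onto those slots.
# Exact IDs depend on the Dream tokenizer, hence the print rather than hard-coded values.
_example_constraints = parse_constraints("0:Once, 5:upon")
print(f"Example constraint parse: {_example_constraints}")  # e.g. {0: [...], 5: [...]}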
84
 
 
 
 
 
 
85
 
86
+ def format_chat_history(history: List[List[Optional[str]]]) -> List[Dict[str, str]]:
87
  """
88
+ Format chat history for the Dream model's chat template.
89
 
90
  Args:
91
+ history: List of [user_message, assistant_message] pairs.
92
+ The last assistant_message might be None.
93
 
94
  Returns:
95
+ Formatted list of message dictionaries for tokenizer.apply_chat_template.
96
  """
97
  messages = []
98
+ # Check if the first message is a system prompt, handle accordingly if needed
99
+ # Based on Dream's examples, the template adds a default system prompt if none exists.
100
+ # If history starts with System, it should be handled by the template.
101
+ # Let's assume the template handles the system prompt correctly.
102
 
103
  for user_msg, assistant_msg in history:
104
+ if user_msg: # Defensive check
105
  messages.append({"role": "user", "content": user_msg})
106
+ # Add assistant message only if it exists (it won't for the last turn before generation)
107
+ if assistant_msg:
108
  messages.append({"role": "assistant", "content": assistant_msg})
109
 
110
  return messages
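# Illustrative sketch: for a two-turn history in which the latest assistant reply is
# still pending (None), the list handed to tokenizer.apply_chat_template looks like this.
_example_history = [["Hi", "Hello!"], ["Tell me a story", None]]
assert format_chat_history(_example_history) == [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "Tell me a story"},
]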
111
 
112
+ # --- Core Generation Logic with Live Visualization ---
113
+
114
+ @spaces.GPU # Decorator for Hugging Face Spaces GPU usage
115
+ def generate_dream_response(
116
+ history: List[List[Optional[str]]],
117
+ gen_length: int,
118
+ steps: int,
119
+ constraints_text: str,
120
+ temperature: float,
121
+ top_p: Optional[float],
122
+ top_k: Optional[int],
123
+ alg: str,
124
+ alg_temp: Optional[float],
125
+ visualization_delay: float
126
+ ):  # yields (updated history, visualization data, final response text) tuples
 
 
 
 
 
 
127
  """
128
+ Generates text using the Dream model and yields visualization states live.
129
+
130
+ Args:
131
+ history: Chat history.
132
+ gen_length: Max new tokens to generate.
133
+ steps: Number of diffusion steps.
134
+ constraints_text: User-provided constraints string.
135
+ temperature: Sampling temperature.
136
+ top_p: Top-p sampling nucleus.
137
+ top_k: Top-k sampling.
138
+ alg: Remasking algorithm ('origin', 'maskgit_plus', 'topk_margin', 'entropy').
139
+ alg_temp: Temperature for confidence-based algorithms.
140
+ visualization_delay: Delay between visualization steps.
141
+
142
+ Yields:
143
+ Tuple[List[List[Optional[str]]], List[Tuple[str, Optional[str]]], str]:
144
+ - Updated history
145
+ - Visualization data for HighlightedText
146
+ - Final response text (repeated in each yield)
147
  """
 
 
 
 
 
148
 
149
+ if not history or not history[-1][0]:
150
+ # No user message to respond to
151
+ yield history, [("No input message found.", "red")], ""
152
+ return
153
+
154
+ # --- 1. Preparation ---
155
+ last_user_message = history[-1][0]
156
+ messages_for_template = format_chat_history(history) # Includes the latest user message
157
+
158
+ # Parse constraints relative to the *generated* sequence
159
+ parsed_constraints = parse_constraints(constraints_text) # Dict[rel_pos, List[token_id]]
160
 
161
+ # Prepare inputs using the chat template
162
  try:
163
+ inputs = tokenizer.apply_chat_template(
164
+ messages_for_template,
165
+ return_tensors="pt",
166
+ return_dict=True,
167
+ add_generation_prompt=True # Important for instruct models
 
 
 
 
 
168
  )
 
 
 
169
  input_ids = inputs.input_ids.to(device)
170
+ attention_mask = inputs.attention_mask.to(device)
 
171
  prompt_length = input_ids.shape[1]
172
+ except Exception as e:
173
+ print(f"Error applying chat template: {e}")
174
+ yield history, [("Error preparing input.", "red")], ""
175
+ return
176
+
177
+ # Calculate total sequence length for the model
178
+ # Max length constraint from model config (e.g., 2048 for original Dream?)
179
+ # Let's use a reasonable default or allow configuration if needed.
180
+ # The provided code uses max_position_embeddings=131072, let's stick to user input + gen_length.
181
+ total_length = prompt_length + gen_length
182
+
183
+ # --- 2. Visualization Setup ---
184
+ # This list will store the token sequence (just the generated part) at each step
185
+ step_sequence_history: List[torch.Tensor] = []
186
+ previous_step_tokens = None # Keep track of the previous step's state
187
+
188
+ # Define the hook function *inside* this function to capture state
189
+ def live_visualization_hook(step: Optional[int], x: torch.Tensor, logits: Optional[torch.Tensor]) -> torch.Tensor:
190
+ nonlocal step_sequence_history, parsed_constraints, prompt_length
191
+
192
+ # --- Apply Constraints ---
193
+ # Constraints are applied *after* the model proposes tokens but *before* they are finalized for the step
194
+ # Note: The hook receives the state *before* the next model call in the next step,
195
+ # or the final state after the last step. Let's apply constraints consistently.
196
+ # The `diffusion_generate` calls the hook *after* updating x based on sampling.
197
+ current_x = x.clone() # Work on a copy
198
+
199
+ for rel_pos, word_token_ids in parsed_constraints.items():
200
+ abs_start_pos = prompt_length + rel_pos
201
+ abs_end_pos = abs_start_pos + len(word_token_ids)
202
+
203
+ # Ensure the constraint fits within the generation length
204
+ if abs_start_pos < total_length and abs_end_pos <= total_length:
205
+ try:
206
+ constraint_tensor = torch.tensor(word_token_ids, dtype=torch.long, device=current_x.device)
207
+ # Force the constraint tokens onto the sequence
208
+ current_x[0, abs_start_pos:abs_end_pos] = constraint_tensor
209
+ except IndexError:
210
+ print(f"Warning: Constraint at {rel_pos} ('{tokenizer.decode(word_token_ids)}') goes out of bounds.")
211
+ except Exception as e:
212
+ print(f"Warning: Failed to apply constraint at {rel_pos}: {e}")
213
+
214
+ # Store the state *after* constraints for visualization
215
+ # We only need the generated part
216
+ generated_part = current_x[0, prompt_length:].clone().cpu() # Move to CPU to save GPU memory
217
+ step_sequence_history.append(generated_part)
218
+
219
+ # Return the (potentially modified by constraints) tensor x
220
+ return current_x # Pass the constrained version to the next step
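# Contract sketch for generation_tokens_hook_func, inferred from how it is used in this
# file rather than from official documentation: diffusion_generate calls it with the
# step index (None for the pre-generation state), the full (1, total_length) token
# tensor, and the step logits; whatever tensor the hook returns becomes the sequence
# used for the next step. A minimal no-op hook would therefore be:
#   def identity_hook(step, x, logits):
#       return x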
221
+
222
+ # --- 3. Run Generation ---
223
+ final_response_text = ""
224
+ try:
225
+ print(f"Starting Dream generation: prompt_len={prompt_length}, gen_len={gen_length}, steps={steps}")
226
+ start_time = time.time()
 
 
227
 
228
+ # Initial masked state for visualization
229
+ initial_generated_state = torch.full((gen_length,), MASK_ID, dtype=torch.long)
230
+ # Apply constraints to the *initial* visual state if they start at pos 0
231
+ temp_initial_x = torch.cat((input_ids[0], initial_generated_state.to(device)), dim=0).unsqueeze(0)
232
+ initial_vis_x = live_visualization_hook(None, temp_initial_x, None) # Apply constraints via hook logic
233
+ step_sequence_history.insert(0, initial_vis_x[0, prompt_length:].cpu()) # Prepend initial state
234
 
 
 
 
235
  output = model.diffusion_generate(
236
+ input_ids,
237
+ attention_mask=attention_mask,
 
238
  max_new_tokens=gen_length,
239
  output_history=False, # We capture history via the hook
240
  return_dict_in_generate=True,
241
  steps=steps,
242
  temperature=temperature,
243
+ top_p=top_p if top_p is not None and top_p < 1.0 else None, # Ensure top_p < 1 or None
244
+ top_k=top_k if top_k is not None and top_k > 0 else None, # Ensure top_k > 0 or None
245
  alg=alg,
246
+ alg_temp=alg_temp if alg in ['maskgit_plus', 'topk_margin', 'entropy'] else None, # Only relevant for some algs
247
+ generation_tokens_hook_func=live_visualization_hook
 
 
 
 
248
  )
249
  end_time = time.time()
250
+ print(f"Dream generation finished in {end_time - start_time:.2f} seconds.")
251
 
252
+ # --- 4. Process Final Output ---
 
253
  final_sequence = output.sequences[0]
 
 
254
  response_tokens = final_sequence[prompt_length:]
 
 
255
 
256
+ # Decode the final response text
257
+ final_response_text = tokenizer.decode(
258
+ response_tokens,
259
+ skip_special_tokens=True, # Skip EOS, PAD, MASK etc. in the final text
260
+ clean_up_tokenization_spaces=True
261
  ).strip()
262
 
263
+ # Update history with the final response
264
+ history[-1][1] = final_response_text
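# Optional refinement (a sketch, not part of this commit): if skip_special_tokens ever
# lets padding or EOS runs through, response_tokens could be cut at the first EOS
# before decoding, e.g.:
#   eos_hits = (response_tokens == EOS_ID).nonzero()
#   if len(eos_hits) > 0:
#       response_tokens = response_tokens[: eos_hits[0, 0]]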
 
265
 
266
  except Exception as e:
267
+ print(f"Error during generation or processing: {e}")
268
  import traceback
269
  traceback.print_exc()
270
+ yield history, [("Error during generation.", "red")], ""
271
+ return
272
+
273
+ # --- 5. Stream Visualization ---
274
+ print(f"Streaming {len(step_sequence_history)} visualization steps...")
275
+ previous_tokens_vis = None
276
+ for i, current_tokens_vis in enumerate(step_sequence_history):
277
+ # print(f" Step {i}: {current_tokens_vis.tolist()}") # Debug
278
+ vis_data = []
279
+ current_decoded_tokens = []
280
+
281
+ # Compare current step tokens with previous step tokens
282
+ for j in range(gen_length):
283
+ current_tok_id = current_tokens_vis[j].item()
284
+ previous_tok_id = previous_tokens_vis[j].item() if previous_tokens_vis is not None else MASK_ID
285
+
286
+ # Decode token - handle potential errors for single IDs if needed
287
+ try:
288
+ # Use skip_special_tokens=False here to see the actual tokens
289
+ decoded_token = tokenizer.decode([current_tok_id], skip_special_tokens=False)
290
+ # Explicitly handle mask token display
291
+ if current_tok_id == MASK_ID:
292
+ display_token = MASK_TOKEN
293
+ else:
294
+ display_token = decoded_token
295
+
296
+ except Exception:
297
+ display_token = f"[ID:{current_tok_id}]" # Fallback
298
+
299
+ # Determine color and handle hiding of special tokens (like LLaDA demo)
300
+ color = None
301
+ token_to_display = display_token
302
+
303
+ if current_tok_id == MASK_ID:
304
+ color = "#444444" # Dark Gray for masks
305
+ elif previous_tok_id == MASK_ID: # Token was just revealed
306
+ # Simple green for newly revealed, no confidence score available from hook
307
+ color = "#66CC66" # Light Green
308
+ else: # Token was already revealed
309
+ color = "#6699CC" # Light Blue
310
+
311
+ # LLaDA hiding effect: If it's a special token (EOS/PAD) *and* it was revealed before this step, hide it.
312
+ if current_tok_id in {PAD_ID, EOS_ID} and previous_tok_id == current_tok_id:
313
+ # Hide by making it empty or using a background color - empty string is simpler
314
+ token_to_display = ""
315
+ color = "#FFFFFF" # Or just make it blend in
316
 
317
+ # Add token and color to visualization data
318
+ if token_to_display: # Avoid adding empty strings if hiding
319
+ vis_data.append((token_to_display, color))
320
+ elif len(vis_data) > 0 and isinstance(vis_data[-1], tuple):
321
+ # If hidden, and previous was text, add a space for visual separation?
322
+ # This might complicate things, let's omit for now.
323
+ pass
324
+ # elif len(vis_data) == 0: # If first token is hidden
325
+ # vis_data.append(("", None)) # Placeholder?
326
 
327
+ # Update previous state for next iteration
328
+ previous_tokens_vis = current_tokens_vis
329
 
330
+ # Yield the current visualization state
331
+ yield history, vis_data, final_response_text
332
 
333
+ # Pause for the specified delay
334
+ time.sleep(visualization_delay)
335
 
336
+ print("Visualization streaming complete.")
337
+
338
+
339
+ # --- Gradio UI ---
340
  css = '''
341
  .category-legend{display:none}
342
+ button{min-height: 60px}
 
 
343
  '''
344
  def create_chatbot_demo():
345
  with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
346
  gr.Markdown("# Dream 7B - Diffusion Language Model Demo")
347
  gr.Markdown(
348
  "[[Model Card](https://huggingface.co/Dream-org/Dream-v0-Instruct-7B)] "
349
+ "[[Blog](https://hkunlp.github.io/blog/2025/dream/)]"
 
350
  )
351
 
352
  # STATE MANAGEMENT
353
  chat_history = gr.State([])
 
 
354
 
355
  # UI COMPONENTS
356
  with gr.Row():
 
358
  chatbot_ui = gr.Chatbot(
359
  label="Conversation",
360
  height=500,
361
+ show_copy_button=True,
362
+ bubble_full_width=False
363
+ )
364
 
365
  # Message input
366
  with gr.Group():
 
368
  user_input = gr.Textbox(
369
  label="Your Message",
370
  placeholder="Type your message here...",
371
+ scale=7,
372
+ autofocus=True,
373
  show_label=False,
374
+ container=False # Remove container for tighter packing
375
  )
376
+ send_btn = gr.Button("Send", scale=1, variant="primary")
377
+
378
 
379
  constraints_input = gr.Textbox(
380
+ label="Word Constraints (Optional)",
381
+ info="Place words at specific positions (0-indexed from start of generation). Format: 'pos:word, pos:word,...'. Example: '0:Once, 5:upon, 10:time'",
382
+ placeholder="0:Hello, 10:world",
383
  value=""
384
  )
385
  with gr.Column(scale=2):
386
  output_vis = gr.HighlightedText(
387
  label="Denoising Process Visualization",
388
  combine_adjacent=False,
389
+ show_legend=False, # Legend isn't very informative here
390
+ interactive=False # Not interactive
391
  )
392
 
393
  # Advanced generation settings
394
  with gr.Accordion("Generation Settings", open=False):
395
  with gr.Row():
396
  gen_length = gr.Slider(
397
+ minimum=16, maximum=512, value=128, step=8, # Increased max length
398
  label="Max New Tokens"
399
  )
400
  steps = gr.Slider(
401
+ minimum=8, maximum=512, value=128, step=8, # Increased max steps
402
+ label="Diffusion Steps"
403
  )
404
  with gr.Row():
405
  temperature = gr.Slider(
406
+ minimum=0.0, maximum=1.0, value=0.4, step=0.05,
407
  label="Temperature"
408
  )
409
+ alg_temp = gr.Slider(
410
+ minimum=0.0, maximum=1.0, value=0.1, step=0.05,
411
+ label="Remasking Temp (for confidence algs)"
412
+ )
413
+
414
+ with gr.Row():
415
  top_p = gr.Slider(
416
+ minimum=0.0, maximum=1.0, value=0.95, step=0.05,
417
+ label="Top-P (0=disabled)"
418
  )
419
  top_k = gr.Slider(
420
  minimum=0, maximum=200, value=0, step=5,
421
  label="Top-K (0=disabled)"
422
  )
423
+
424
  with gr.Row():
425
+ remasking_strategy = gr.Radio(
426
  choices=['origin', 'maskgit_plus', 'topk_margin', 'entropy'],
427
+ value='entropy', # Default to entropy as in example
428
+ label="Remasking Strategy (Algorithm)"
429
+ )
 
 
 
 
430
 
431
  with gr.Row():
432
  visualization_delay = gr.Slider(
433
+ minimum=0.0, maximum=0.5, value=0.02, step=0.01, # Faster default
434
  label="Visualization Delay (seconds)"
435
  )
436
 
437
  # Clear button
438
  clear_btn = gr.Button("Clear Conversation")
439
 
440
+ # Current response text box (hidden, maybe useful for debugging)
441
+ # current_response = gr.Textbox(visible=False)
 
 
442
 
443
+ # --- Event Handlers ---
 
444
 
445
+ def add_user_message_to_history(message: str, history: List[List[Optional[str]]]):
446
+ """Adds user message, clears input, prepares for bot response."""
447
+ if not message.strip():
448
+ gr.Warning("Please enter a message.")
449
+ return history, history, "", [("Enter a message", "grey")] # Keep vis empty or show prompt
450
 
451
+ # Add user message with placeholder for bot response
452
+ history.append([message, None])
453
+ # Return updated history for chatbot, empty input box, empty visualization
454
+ return history, history, "", []
455
 
456
 
457
  def clear_conversation():
458
+ """Clears the chat history and visualization."""
459
+ return [], [], "", []
460
+
461
+ # --- Connect UI elements ---
462
+
463
+ # User Input Submission (Textbox Enter or Send Button Click)
464
+ submit_triggers = [user_input.submit, send_btn.click]
465
+
466
+ # 1. Add user message to UI immediately
467
+ for trigger in submit_triggers:
468
+ trigger.then(
469
+ add_user_message_to_history,
470
+ inputs=[user_input, chat_history],
471
+ outputs=[chat_history, chatbot_ui, user_input, output_vis] # Update chat, clear input, clear vis
472
+ ).then( # 2. Trigger bot response generation (as a generator)
473
+ generate_dream_response,
474
+ inputs=[
475
+ chat_history, gen_length, steps, constraints_input,
476
+ temperature, top_p, top_k, remasking_strategy, alg_temp,
477
+ visualization_delay
478
+ ],
479
+ outputs=[chatbot_ui, output_vis] # Stream updates to chatbot and visualization
480
+ # Note: The final text response is implicitly handled by updating chatbot_ui
481
+ )
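# Alternative wiring sketch (assumes a Gradio version that provides gr.on; not verified
# against this Space's pinned version): the per-trigger loop above could be collapsed
# into a single multi-trigger listener:
#   gr.on(
#       triggers=[user_input.submit, send_btn.click],
#       fn=add_user_message_to_history,
#       inputs=[user_input, chat_history],
#       outputs=[chat_history, chatbot_ui, user_input, output_vis],
#   ).then(generate_dream_response, inputs=[...], outputs=[chatbot_ui, output_vis])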
482
 
483
+ # Clear Button Action
484
  clear_btn.click(
485
+ clear_conversation,
486
  inputs=[],
487
  outputs=[chat_history, chatbot_ui, user_input, output_vis]
488
  )
489
 
 
490
  return demo
491
 
492
+ # --- Launch ---
493
  if __name__ == "__main__":
494
  demo = create_chatbot_demo()
495
+ # Use queue for handling multiple users and streaming
496
+ demo.queue().launch(debug=True, share=True) # Add share=True for public link if needed