sagar007 committed
Commit ffc273f · verified · 1 Parent(s): 8652f53

Update app.py

Files changed (1)
  1. app.py +404 -472
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
- import spaces # Keep for potential future use or other decorators
4
  from duckduckgo_search import DDGS
5
  import time
6
  import torch
@@ -8,64 +8,76 @@ from datetime import datetime
8
  import os
9
  import subprocess
10
  import numpy as np
11
- from typing import List, Dict, Tuple, Any
12
  from functools import lru_cache
13
  import asyncio
14
  import threading
15
  from concurrent.futures import ThreadPoolExecutor
16
  import warnings
17
  import traceback # For detailed error logging
18
-
19
- # Suppress specific warnings if needed (optional)
20
- warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated")
21
- # Suppress another common warning with torch.compile backend
22
- # warnings.filterwarnings("ignore", message="Backend 'inductor' is not available.")
23
 
24
  # --- Configuration ---
25
  MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
26
  MAX_SEARCH_RESULTS = 5
27
  TTS_SAMPLE_RATE = 24000
28
  MAX_TTS_CHARS = 1000 # Max characters for a single TTS chunk
29
- # GPU_DURATION = 60 # Informational only now, decorator is removed
30
- MAX_NEW_TOKENS = 300 # Increased slightly
31
  TEMPERATURE = 0.7
32
  TOP_P = 0.95
33
  KOKORO_PATH = 'Kokoro-82M' # Path to TTS model directory
34
 
35
  # --- Initialization ---
36
- # Use a ThreadPoolExecutor for potentially blocking I/O or CPU-bound tasks
37
- executor = ThreadPoolExecutor(max_workers=4)
38
 
39
- # Initialize model and tokenizer with better error handling
40
- try:
41
- print("Loading tokenizer...")
42
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
43
- tokenizer.pad_token = tokenizer.eos_token
44
 
45
- print("Loading model...")
46
- # Determine device map based on CUDA availability
47
- device_map = "auto" if torch.cuda.is_available() else {"": "cpu"}
48
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 # Use float32 on CPU
49
- print(f"Attempting to load model with device_map='{device_map}' and dtype={torch_dtype}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- model = AutoModelForCausalLM.from_pretrained(
52
  MODEL_NAME,
53
  device_map=device_map,
54
- # offload_folder="offload", # Enable if needed for large models and disk space is available
55
- low_cpu_mem_usage=True, # Important for faster loading
56
  torch_dtype=torch_dtype,
57
- # attn_implementation="flash_attention_2" # Optional: requires flash-attn installed, use if available for speedup on compatible GPUs
58
  )
59
- print(f"Model loaded successfully. Device map: {model.hf_device_map}")
60
- # Ensure model is in evaluation mode
61
- model.eval()
62
 
63
  except Exception as e:
64
  print(f"FATAL: Error initializing LLM model: {str(e)}")
65
  print(traceback.format_exc())
66
- raise # Stop execution if model loading fails
 
 
 
67
 
68
- # --- TTS Setup ---
 
69
  VOICE_CHOICES = {
70
  '🇺🇸 Female (Default)': 'af',
71
  '🇺🇸 Bella': 'af_bella',
@@ -73,204 +85,181 @@ VOICE_CHOICES = {
73
  '🇺🇸 Nicole': 'af_nicole'
74
  }
75
  TTS_ENABLED = False
76
- TTS_MODEL = None
77
- VOICEPACKS = {} # Cache voice packs
 
 
 
 
78
 
79
- # Initialize Kokoro TTS in a separate thread to avoid blocking startup
80
- def setup_tts():
81
- global TTS_ENABLED, TTS_MODEL, VOICEPACKS
82
 
83
- # Check privileges for apt-get
84
  can_sudo = shutil.which('sudo') is not None
 
85
 
86
  try:
87
- # Check if Kokoro already exists
88
  if not os.path.exists(KOKORO_PATH):
89
- print("Cloning Kokoro-82M repository...")
90
- # Install git-lfs if not present (might need sudo/apt)
91
  try:
92
- lfs_install_cmd = ['git', 'lfs', 'install']
93
- subprocess.run(lfs_install_cmd, check=True, capture_output=True, text=True)
94
- except (FileNotFoundError, subprocess.CalledProcessError) as lfs_err:
95
- print(f"Warning: git-lfs command failed: {lfs_err}. Cloning might be slow or incomplete.")
96
-
97
- clone_cmd = ['git', 'clone', 'https://huggingface.co/hexgrad/Kokoro-82M', KOKORO_PATH]
98
- result = subprocess.run(clone_cmd, check=True, capture_output=True, text=True)
99
- print("Kokoro cloned successfully.")
100
- # print(result.stdout) # Can be verbose
101
- # Optionally pull LFS files again (sometimes clone doesn't get them all)
102
  try:
103
- print("Running git lfs pull...")
104
- lfs_pull_cmd = ['git', 'lfs', 'pull']
105
- subprocess.run(lfs_pull_cmd, cwd=KOKORO_PATH, check=True, capture_output=True, text=True)
106
- print("git lfs pull completed.")
107
- except (FileNotFoundError, subprocess.CalledProcessError) as lfs_pull_err:
108
- print(f"Warning: git lfs pull failed: {lfs_pull_err}")
109
-
110
  else:
111
- print(f"{KOKORO_PATH} directory already exists.")
112
-
113
- # Install espeak (essential for phonemization)
114
- print("Attempting to install espeak-ng or espeak...")
115
- apt_update_cmd = ['apt-get', 'update', '-qq']
116
- install_cmd_ng = ['apt-get', 'install', '-y', '-qq', 'espeak-ng']
117
- install_cmd_legacy = ['apt-get', 'install', '-y', '-qq', 'espeak']
118
-
119
- if can_sudo:
120
- apt_update_cmd.insert(0, 'sudo')
121
- install_cmd_ng.insert(0, 'sudo')
122
- install_cmd_legacy.insert(0, 'sudo')
123
 
 
 
124
  try:
125
- print(f"Running: {' '.join(apt_update_cmd)}")
126
- subprocess.run(apt_update_cmd, check=True, capture_output=True)
127
- print(f"Running: {' '.join(install_cmd_ng)}")
128
- subprocess.run(install_cmd_ng, check=True, capture_output=True)
129
- print("espeak-ng installed successfully.")
130
- except (FileNotFoundError, subprocess.CalledProcessError) as ng_err:
131
- print(f"espeak-ng installation failed ({ng_err}), trying espeak...")
132
  try:
133
- print(f"Running: {' '.join(install_cmd_legacy)}")
134
- subprocess.run(install_cmd_legacy, check=True, capture_output=True)
135
- print("espeak installed successfully.")
136
- except (FileNotFoundError, subprocess.CalledProcessError) as legacy_err:
137
- print(f"ERROR: Could not install espeak-ng or espeak: {legacy_err}. TTS functionality will be disabled.")
138
- return # Cannot proceed without espeak
139
-
140
- # Set up Kokoro TTS
141
  if os.path.exists(KOKORO_PATH):
142
- import sys
143
  if KOKORO_PATH not in sys.path:
144
  sys.path.append(KOKORO_PATH)
 
145
  try:
146
  from models import build_model
147
- from kokoro import generate as generate_tts_internal # Avoid name clash
148
 
149
- # Make these functions accessible globally if needed
150
- globals()['build_model'] = build_model
151
  globals()['generate_tts_internal'] = generate_tts_internal
152
 
153
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
154
- print(f"Loading TTS model onto device: {device}")
155
  model_file = os.path.join(KOKORO_PATH, 'kokoro-v0_19.pth')
156
-
157
  if not os.path.exists(model_file):
158
- print(f"Error: TTS model file not found at {model_file}. Attempting git lfs pull again...")
159
- try:
160
- lfs_pull_cmd = ['git', 'lfs', 'pull']
161
- subprocess.run(lfs_pull_cmd, cwd=KOKORO_PATH, check=True, capture_output=True, text=True)
162
- if not os.path.exists(model_file):
163
- print(f"ERROR: TTS model file STILL not found at {model_file} after lfs pull. TTS disabled.")
164
- return
165
- except Exception as lfs_pull_err:
166
- print(f"Error during git lfs pull: {lfs_pull_err}. TTS disabled.")
167
- return
168
-
169
- TTS_MODEL = build_model(model_file, device)
170
- print("TTS model loaded.")
171
-
172
- # Preload voices
173
  for voice_name, voice_id in VOICE_CHOICES.items():
174
  voice_file_path = os.path.join(KOKORO_PATH, 'voices', f'{voice_id}.pt')
175
  if os.path.exists(voice_file_path):
176
  try:
177
- print(f"Loading voice: {voice_id} ({voice_name})")
178
- # Load using torch.load, map_location handles device placement
179
- VOICEPACKS[voice_id] = torch.load(voice_file_path, map_location=device)
 
180
  except Exception as e:
181
- print(f"Warning: Could not load voice {voice_id}: {str(e)}")
182
  else:
183
- print(f"Info: Voice file {voice_file_path} for '{voice_name}' not found, skipping.")
184
 
185
- if not VOICEPACKS:
186
- print("ERROR: No voicepacks could be loaded. TTS disabled.")
 
187
  return
188
 
189
- # Ensure default 'af' is loaded if possible, even if not explicitly in choices sometimes
190
- if 'af' not in VOICEPACKS:
191
- voice_file_path = os.path.join(KOKORO_PATH, 'voices', 'af.pt')
192
- if os.path.exists(voice_file_path):
193
- try:
194
- print(f"Loading fallback default voice: af")
195
- VOICEPACKS['af'] = torch.load(voice_file_path, map_location=device)
196
- except Exception as e:
197
- print(f"Warning: Could not load fallback default voice 'af': {str(e)}")
198
-
199
  TTS_ENABLED = True
200
- print("TTS setup completed successfully.")
201
 
202
  except ImportError as ie:
203
- print(f"ERROR: Importing Kokoro modules failed: {ie}. Check if {KOKORO_PATH} exists and dependencies are met.")
204
- except Exception as model_load_err:
205
- print(f"ERROR: Loading TTS model or voices failed: {model_load_err}")
206
  print(traceback.format_exc())
207
-
 
 
 
208
  else:
209
- print(f"ERROR: {KOKORO_PATH} directory not found. TTS disabled.")
210
- except subprocess.CalledProcessError as spe:
211
- print(f"ERROR: A subprocess command failed during TTS setup: {spe}")
212
- print(f"Command: {' '.join(spe.cmd)}")
213
- if spe.stderr: print(f"Stderr: {spe.stderr.strip()}")
214
- print("TTS setup failed.")
215
  except Exception as e:
216
- print(f"ERROR: An unexpected error occurred during TTS setup: {str(e)}")
217
  print(traceback.format_exc())
 
218
  TTS_ENABLED = False
 
 
219
 
220
- # Start TTS setup in a separate thread
221
- import shutil
222
- print("Starting TTS setup in background thread...")
223
- tts_thread = threading.Thread(target=setup_tts, daemon=True)
224
- tts_thread.start()
225
 
226
- # --- Search and Generation Functions ---
 
227
 
228
  @lru_cache(maxsize=128)
229
- def get_web_results(query: str, max_results: int = MAX_SEARCH_RESULTS) -> List[Dict[str, str]]:
230
- """Get web search results using DuckDuckGo with caching."""
231
- print(f"[Web Search] Searching for: '{query}' (max_results={max_results})")
232
  try:
233
- # Use DDGS context manager for cleanup
234
  with DDGS() as ddgs:
235
- # Fetch results using ddgs.text()
236
- results = list(ddgs.text(query, max_results=max_results, safesearch='moderate', timelimit='y')) # Limit to past year
237
  print(f"[Web Search] Found {len(results)} results.")
238
- formatted_results = []
239
- for i, result in enumerate(results):
240
- formatted_results.append({
241
- "id": i + 1, # Add simple ID for citation
242
- "title": result.get("title", "No Title Available"),
243
- "snippet": result.get("body", "No Snippet Available"),
244
- "url": result.get("href", "#"),
245
- })
246
- return formatted_results
247
  except Exception as e:
248
  print(f"[Web Search] Error: {e}")
249
  print(traceback.format_exc())
250
  return []
251
 
252
- def format_prompt(query: str, context: List[Dict[str, str]]) -> str:
253
- """Format the prompt with web context for the LLM."""
254
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
 
 
255
 
256
- # Format context with IDs for citation
257
- context_lines = []
258
- if context:
259
- for res in context:
260
- context_lines.append(f"[{res['id']}] {res['title']}\n{res['snippet']}")
261
- context_str = "\n\n".join(context_lines)
262
- else:
263
- context_str = "No web context available."
264
-
265
- # Clear instructions for the model
266
- prompt = f"""You are a helpful AI assistant. Your task is to answer the user's query based *only* on the provided web search context.
267
- Follow these instructions carefully:
268
- 1. Synthesize the information from the context to provide a comprehensive answer.
269
- 2. Cite the sources used in your answer using bracket notation with the source ID, like [1], [2], etc.
270
- 3. If multiple sources support a point, you can cite them together, e.g., [1][3].
271
- 4. Do *not* add information that is not present in the context.
272
- 5. If the context does not contain relevant information to answer the query, clearly state that you cannot answer based on the provided context.
273
- 6. Format the answer clearly using markdown.
274
 
275
  Current Time: {current_time}
276
 
@@ -282,24 +271,17 @@ Web Context:
282
  User Query: {query}
283
 
284
  Answer:"""
285
- # print(f"--- Formatted Prompt ---\n{prompt[:1000]}...\n--- End Prompt ---") # Debugging: Print start of prompt
286
- return prompt
287
 
288
- def format_sources(web_results: List[Dict[str, str]]) -> str:
289
- """Format sources into HTML for display."""
290
  if not web_results:
291
  return "<div class='no-sources'>No sources found for this query.</div>"
292
-
293
- sources_html = "<div class='sources-container'>"
294
  for res in web_results:
295
- title = res.get("title", "Source")
 
296
  url = res.get("url", "#")
297
- snippet = res.get("snippet", "")
298
- # Basic HTML escaping for snippet and title
299
- title_safe = gr. gradio.utils.escape_html(title)
300
- snippet_safe = gr. gradio.utils.escape_html(snippet[:150] + ("..." if len(snippet) > 150 else ""))
301
-
302
- sources_html += f"""
303
  <div class='source-item'>
304
  <div class='source-number'>[{res['id']}]</div>
305
  <div class='source-content'>
@@ -308,154 +290,130 @@ def format_sources(web_results: List[Dict[str, str]]) -> str:
308
  </div>
309
  </div>
310
  """
311
- sources_html += "</div>"
312
- return sources_html
313
 
314
- # --- Core Async Logic ---
 
 
 
315
 
316
- # NOTE: @spaces.GPU decorator is REMOVED because it's incompatible with async def
317
- async def generate_answer(prompt: str) -> str:
318
- """Generate answer using the DeepSeek model (Async Wrapper)."""
319
- print(f"[LLM Generate] Generating answer for prompt (length {len(prompt)})...")
320
  start_time = time.time()
321
  try:
322
- # Tokenize input - ensure it runs on the correct device implicitly via model.device
323
- inputs = tokenizer(
324
  prompt,
325
  return_tensors="pt",
326
  padding=True,
327
  truncation=True,
328
- max_length=1024, # Model's context window might be larger, adjust if known
329
  return_attention_mask=True
330
- ).to(model.device)
331
-
332
- # Use torch.inference_mode() for efficiency
333
- with torch.inference_mode(), torch.cuda.amp.autocast(enabled=(model.dtype == torch.float16)):
334
- # Run model.generate in a separate thread to avoid blocking asyncio event loop
335
- outputs = await asyncio.to_thread(
336
- model.generate,
337
- input_ids=inputs.input_ids,
338
  attention_mask=inputs.attention_mask,
339
  max_new_tokens=MAX_NEW_TOKENS,
340
  temperature=TEMPERATURE,
341
  top_p=TOP_P,
342
- pad_token_id=tokenizer.eos_token_id,
343
- eos_token_id=tokenizer.eos_token_id, # Explicitly set EOS token
344
  do_sample=True,
345
  num_return_sequences=1
346
  )
347
 
348
- # Decode only the newly generated tokens
349
- # output_ids = outputs[0][inputs.input_ids.shape[1]:] # Slice generated part
350
- # answer_part = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
351
-
352
- # Alternative: Decode full output and split (can be less reliable if prompt has "Answer:")
353
- full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
354
- answer_marker = "Answer:"
355
- marker_index = full_output.rfind(answer_marker) # Use rfind to find the last occurrence
356
- if marker_index != -1:
357
- answer_part = full_output[marker_index + len(answer_marker):].strip()
358
- else:
359
- # Fallback: try to remove the prompt text (less reliable)
360
- prompt_decoded = tokenizer.decode(inputs.input_ids[0], skip_special_tokens=True)
361
- if full_output.startswith(prompt_decoded):
362
- answer_part = full_output[len(prompt_decoded):].strip()
363
- # Check if the marker is now at the beginning
364
- if answer_part.startswith(answer_marker):
365
- answer_part = answer_part[len(answer_marker):].strip()
366
- else:
367
- print("[LLM Generate] Warning: 'Answer:' marker not found and prompt prefix mismatch. Using full output.")
368
- answer_part = full_output # Use full output as last resort
369
 
370
  end_time = time.time()
371
- print(f"[LLM Generate] Answer generated successfully in {end_time - start_time:.2f}s. Length: {len(answer_part)}")
372
- return answer_part if answer_part else "*Model did not generate a response.*"
373
 
374
  except Exception as e:
375
  print(f"[LLM Generate] Error: {e}")
376
  print(traceback.format_exc())
377
- return f"Error generating answer: {str(e)}"
378
-
379
- # NOTE: @spaces.GPU decorator is REMOVED because it's incompatible with async def
380
- async def generate_speech(text: str, voice_id: str = 'af') -> Tuple[int, np.ndarray] | None:
381
- """Generate speech from text using Kokoro TTS model (Async Wrapper)."""
382
- global TTS_MODEL, TTS_ENABLED, VOICEPACKS
383
 
384
- if not TTS_ENABLED or TTS_MODEL is None:
385
- print("[TTS Generate] Skipping: TTS not enabled or model not loaded.")
386
- return None
387
- if 'generate_tts_internal' not in globals():
388
- print("[TTS Generate] Skipping: TTS generation function not found.")
389
  return None
390
  if not text or not text.strip():
391
- print("[TTS Generate] Skipping: Empty text provided.")
392
  return None
393
 
394
- print(f"[TTS Generate] Requesting speech for text (length {len(text)}) with voice '{voice_id}'")
395
  start_time = time.time()
396
 
397
  try:
398
- device = TTS_MODEL.device
399
-
400
- # Ensure voicepack is loaded
401
- if voice_id not in VOICEPACKS:
402
- print(f"[TTS Generate] Warning: Voice '{voice_id}' not preloaded. Attempting fallback.")
403
- # Attempt fallback to default 'af' if available
404
- voice_id = 'af'
405
- if 'af' not in VOICEPACKS:
406
- print("[TTS Generate] Error: Default voice 'af' also not available. Cannot generate audio.")
407
  return None
408
- print("[TTS Generate] Using default voice 'af'.")
409
-
410
- # Clean the text (simple cleaning)
411
- # Remove markdown citations like [1], [2][3] etc.
412
- clean_text = re.sub(r'\[\d+\](\[\d+\])*', '', text)
413
- # Remove other common markdown artifacts
414
- clean_text = clean_text.replace('*', '').replace('#', '').replace('`', '')
415
- # Remove excessive whitespace
416
- clean_text = ' '.join(clean_text.split())
417
-
418
- if not clean_text.strip():
419
- print("[TTS Generate] Skipping: Text is empty after cleaning.")
420
- return None
421
 
422
- # Truncate if too long
 
 
 
 
 
 
 
423
  if len(clean_text) > MAX_TTS_CHARS:
424
- print(f"[TTS Generate] Warning: Text too long ({len(clean_text)} chars), truncating to {MAX_TTS_CHARS}.")
425
  clean_text = clean_text[:MAX_TTS_CHARS]
426
- # Find last punctuation or space for cleaner cut
427
- cut_off = max(clean_text.rfind('.'), clean_text.rfind('?'), clean_text.rfind('!'), clean_text.rfind(' '))
428
- if cut_off != -1:
429
- clean_text = clean_text[:cut_off+1]
430
- clean_text += "..." # Indicate truncation
431
 
432
  print(f"[TTS Generate] Generating audio for: '{clean_text[:100]}...'")
433
  gen_func = globals()['generate_tts_internal']
 
434
 
435
- # Run the blocking TTS generation in the thread pool executor
 
436
  audio_data, _ = await asyncio.get_event_loop().run_in_executor(
437
  executor,
438
  gen_func,
439
- TTS_MODEL,
440
- clean_text,
441
- VOICEPACKS[voice_id],
442
- 'afr' # Language code for Kokoro (check if 'afr' or 'eng' or other is correct for your voices)
443
  )
444
 
445
  if isinstance(audio_data, torch.Tensor):
446
- # Move tensor to CPU before converting to numpy if it's not already
447
  audio_np = audio_data.detach().cpu().numpy()
448
  elif isinstance(audio_data, np.ndarray):
449
  audio_np = audio_data
450
  else:
451
- print("[TTS Generate] Warning: Unexpected audio data type received.")
452
  return None
453
 
 
 
 
454
  end_time = time.time()
455
- print(f"[TTS Generate] Audio generated successfully in {end_time - start_time:.2f}s. Shape: {audio_np.shape}")
456
- # Ensure it's 1D array
457
- if audio_np.ndim > 1:
458
- audio_np = audio_np.flatten()
459
  return (TTS_SAMPLE_RATE, audio_np)
460
 
461
  except Exception as e:
@@ -463,108 +421,111 @@ async def generate_speech(text: str, voice_id: str = 'af') -> Tuple[int, np.ndar
463
  print(traceback.format_exc())
464
  return None
465
 
466
- # Helper to get voice ID from display name
467
- def get_voice_id(voice_display_name: str) -> str:
468
  """Maps the user-friendly voice name to the internal voice ID."""
469
- return VOICE_CHOICES.get(voice_display_name, 'af') # Default to 'af' if not found
 
 
 
470
 
471
- # --- Main Processing Logic (Async Generator) ---
472
- import re # Import regex for cleaning
473
 
474
- async def process_query_async(query: str, history: List[List[str]], selected_voice_display_name: str):
475
- """Asynchronously process user query: search -> generate answer -> generate speech"""
476
- print(f"\n--- New Query Processing ---")
 
 
 
 
477
  print(f"Query: '{query}', Voice: '{selected_voice_display_name}'")
478
 
479
  if not query or not query.strip():
480
  print("Empty query received.")
481
- yield (
482
- "Please enter a query.", "", gr.Button(value="Search", interactive=True), history, None
483
- )
484
  return
485
 
486
- if history is None: history = []
487
- # Append user query to history immediately for display
488
- current_history = history + [[query, None]] # Placeholder for assistant response
 
489
 
490
- # 1. Initial state: Searching
491
  yield (
492
- "*Searching the web...*",
493
- "<div class='searching'><span>Searching the web...</span></div>", # Added span for CSS animation
494
- gr.Button(value="Searching...", interactive=False), # Disable button
495
  current_history,
496
- None
 
 
 
497
  )
498
 
499
- # 2. Perform Web Search (non-blocking)
500
- loop = asyncio.get_event_loop()
501
- web_results = await loop.run_in_executor(executor, get_web_results, query)
502
- sources_html = format_sources(web_results)
 
503
 
504
- # Update state: Analyzing results
 
505
  yield (
506
- "*Analyzing search results and generating answer...*",
507
- sources_html,
508
- gr.Button(value="Generating...", interactive=False),
509
- current_history, # History still shows user query, assistant response is pending
510
- None
511
  )
512
 
513
- # 3. Generate Answer (non-blocking, potentially on GPU)
514
- prompt = format_prompt(query, web_results)
515
- final_answer = await generate_answer(prompt) # This is already async
516
 
517
- # Update history with the final answer BEFORE generating audio
518
- current_history[-1][1] = final_answer
519
 
520
- # Update state: Answer generated, preparing audio
521
  yield (
522
- final_answer,
 
523
  sources_html,
524
- gr.Button(value="Audio...", interactive=False),
525
- current_history, # Now history includes the answer
526
- None
527
  )
528
 
529
- # 4. Generate Speech (non-blocking, potentially on GPU)
530
- audio = None
531
- tts_message = ""
532
- if not tts_thread.is_alive() and not TTS_ENABLED:
533
- print("[TTS Status] TTS setup failed or is disabled.")
534
- tts_message = "\n\n*(TTS is disabled or failed to initialize)*"
535
- elif tts_thread.is_alive():
536
- print("[TTS Status] TTS is still initializing in the background.")
537
- tts_message = "\n\n*(TTS is still initializing, audio may be delayed or unavailable)*"
538
- elif TTS_ENABLED:
539
- voice_id = get_voice_id(selected_voice_display_name)
540
- # Only generate audio if the answer generation was successful
541
- if not final_answer.startswith("Error"):
542
- audio = await generate_speech(final_answer, voice_id) # This is already async
543
- if audio is None:
544
- print(f"[TTS Status] Audio generation failed for voice '{voice_id}'.")
545
- tts_message = f"\n\n*(Audio generation failed)*"
546
- else:
547
- print("[TTS Status] Audio generated successfully.")
548
  else:
549
- print("[TTS Status] Skipping audio generation due to answer error.")
550
- tts_message = "\n\n*(Audio skipped due to answer generation error)*"
551
-
552
-
553
- # 5. Final state: Show everything
554
- print("--- Query Processing Complete ---")
 
 
 
 
 
 
555
  yield (
556
- final_answer + tts_message,
 
557
  sources_html,
558
- gr.Button(value="Search", interactive=True), # Re-enable button
559
- current_history, # Final history state
560
- audio
561
  )
562
 
563
 
564
- # --- Gradio Interface ---
565
- # (CSS remains the same as your previous version)
566
  css = """
567
- /* ... [Your existing refined CSS] ... */
 
 
568
  .gradio-container { max-width: 1200px !important; background-color: #f7f7f8 !important; }
569
  #header { text-align: center; margin-bottom: 2rem; padding: 2rem 0; background: linear-gradient(135deg, #1a1b1e, #2d2e32); border-radius: 12px; color: white; box-shadow: 0 8px 32px rgba(0,0,0,0.2); }
570
  #header h1 { color: white; font-size: 2.5rem; margin-bottom: 0.5rem; text-shadow: 0 2px 4px rgba(0,0,0,0.3); }
@@ -589,20 +550,19 @@ css = """
589
  .sources-container { margin-top: 0; }
590
  .source-item { display: flex; padding: 10px 0; margin: 0; border-bottom: 1px solid #f3f4f6; transition: background-color 0.2s; }
591
  .source-item:last-child { border-bottom: none; }
592
- /* .source-item:hover { background-color: #f9fafb; } */
593
  .source-number { font-weight: bold; margin-right: 12px; color: #6b7280; width: 20px; text-align: right; flex-shrink: 0;}
594
  .source-content { flex: 1; min-width: 0;} /* Allow content to shrink */
595
  .source-title { color: #2563eb; font-weight: 500; text-decoration: none; display: block; margin-bottom: 4px; transition: all 0.2s; font-size: 0.95em; white-space: nowrap; overflow: hidden; text-overflow: ellipsis;}
596
  .source-title:hover { color: #1d4ed8; text-decoration: underline; }
597
- .source-date { color: #6b7280; font-size: 0.8em; margin-left: 8px; }
598
  .source-snippet { color: #4b5563; font-size: 0.9em; line-height: 1.5; }
599
- .chat-history { max-height: 400px; overflow-y: auto; padding: 1rem; background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 8px; margin-top: 1rem; scrollbar-width: thin; scrollbar-color: #d1d5db #f9fafb; }
 
600
  .chat-history::-webkit-scrollbar { width: 6px; }
601
  .chat-history::-webkit-scrollbar-track { background: #f9fafb; }
602
  .chat-history::-webkit-scrollbar-thumb { background-color: #d1d5db; border-radius: 20px; }
603
  .examples-container { background: #f9fafb; border-radius: 8px; padding: 1rem; margin-top: 1rem; border: 1px solid #e5e7eb; }
604
- .examples-container .gradio-examples { gap: 8px !important; } /* Target examples component */
605
- .examples-container button { background: white !important; border: 1px solid #d1d5db !important; color: #374151 !important; transition: all 0.2s; margin: 0 !important; font-size: 0.9em !important; padding: 6px 12px !important; }
606
  .examples-container button:hover { background: #f3f4f6 !important; border-color: #adb5bd !important; }
607
  .markdown-content { color: #374151 !important; font-size: 1rem; line-height: 1.7; }
608
  .markdown-content h1, .markdown-content h2, .markdown-content h3 { color: #111827 !important; margin-top: 1.2em !important; margin-bottom: 0.6em !important; font-weight: 600; }
@@ -619,7 +579,7 @@ css = """
619
  .markdown-content th, .markdown-content td { padding: 8px 12px !important; border: 1px solid #d1d5db !important; text-align: left;}
620
  .markdown-content th { background: #f9fafb !important; font-weight: 600; }
621
  .accordion { background: #f9fafb !important; border: 1px solid #e5e7eb !important; border-radius: 8px !important; margin-top: 1rem !important; box-shadow: none !important; }
622
- .accordion > .label-wrap { padding: 10px 15px !important; } /* Style accordion header */
623
  .voice-selector { margin: 0; padding: 0; height: 100%; }
624
  .voice-selector div[data-testid="dropdown"] { height: 100% !important; border-radius: 0 !important;}
625
  .voice-selector select { background: white !important; color: #374151 !important; border: 1px solid #d1d5db !important; border-left: none !important; border-right: none !important; border-radius: 0 !important; height: 100% !important; padding: 0 10px !important; transition: all 0.2s; appearance: none !important; -webkit-appearance: none !important; background-image: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3e%3cpath stroke='%236b7280' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='M6 8l4 4 4-4'/%3e%3c/svg%3e") !important; background-position: right 0.5rem center !important; background-repeat: no-repeat !important; background-size: 1.5em 1.5em !important; padding-right: 2.5rem !important; }
@@ -632,7 +592,7 @@ css = """
632
  .no-sources { padding: 1rem; text-align: center; color: #6b7280; background: #f9fafb; border-radius: 8px; border: 1px solid #e5e7eb;}
633
  @keyframes pulse { 0% { opacity: 0.7; } 50% { opacity: 1; } 100% { opacity: 0.7; } }
634
  .searching span { animation: pulse 1.5s infinite ease-in-out; display: inline-block; }
635
- /* Dark Mode Styles */
636
  .dark .gradio-container { background-color: #111827 !important; }
637
  .dark #header { background: linear-gradient(135deg, #1f2937, #374151); }
638
  .dark #header h3 { color: #9ca3af; }
@@ -654,7 +614,7 @@ css = """
654
  .dark .source-title { color: #60a5fa; }
655
  .dark .source-title:hover { color: #93c5fd; }
656
  .dark .source-snippet { color: #d1d5db; }
657
- .dark .chat-history { background: #374151; border-color: #4b5563; scrollbar-color: #4b5563 #374151; color: #d1d5db;} /* Ensure chat text is visible */
658
  .dark .chat-history::-webkit-scrollbar-track { background: #374151; }
659
  .dark .chat-history::-webkit-scrollbar-thumb { background-color: #4b5563; }
660
  .dark .examples-container { background: #374151; border-color: #4b5563; }
@@ -671,11 +631,11 @@ css = """
671
  .dark .markdown-content th, .dark .markdown-content td { border-color: #4b5563 !important; }
672
  .dark .markdown-content th { background: #374151 !important; }
673
  .dark .accordion { background: #374151 !important; border-color: #4b5563 !important; }
674
- .dark .accordion > .label-wrap { color: #d1d5db !important; } /* Accordion label color */
675
  .dark .voice-selector select { background: #1f2937 !important; color: #d1d5db !important; border-color: #4b5563 !important; background-image: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3e%3cpath stroke='%239ca3af' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='M6 8l4 4 4-4'/%3e%3c/svg%3e") !important;}
676
  .dark .voice-selector select:focus { border-color: #3b82f6 !important; }
677
  .dark .audio-player { background: #374151 !important; border-color: #4b5563;}
678
- .dark .audio-player audio::-webkit-media-controls-panel { background-color: #374151; } /* Style audio player controls */
679
  .dark .audio-player audio::-webkit-media-controls-play-button { color: #d1d5db; }
680
  .dark .audio-player audio::-webkit-media-controls-current-time-display { color: #9ca3af; }
681
  .dark .audio-player audio::-webkit-media-controls-time-remaining-display { color: #9ca3af; }
@@ -684,146 +644,118 @@ css = """
684
  .dark .no-sources { background: #374151; color: #9ca3af; border-color: #4b5563;}
685
  """
686
 
 
 
687
  with gr.Blocks(title="AI Search Assistant", css=css, theme=gr.themes.Default(primary_hue="blue")) as demo:
688
- # chat_history state persists across interactions for a single user session
689
- chat_history = gr.State([])
690
 
691
- with gr.Column(): # Main container for vertical layout
692
- # Header Section
693
  with gr.Column(elem_id="header"):
694
  gr.Markdown("# 🔍 AI Search Assistant")
695
  gr.Markdown("### Powered by DeepSeek & Real-time Web Results with Voice")
696
 
697
- # Search Input and Controls Section
698
  with gr.Column(elem_classes="search-container"):
699
- with gr.Row(elem_classes="search-box", equal_height=False): # Use Row for horizontal elements
700
- search_input = gr.Textbox(
701
- label="",
702
- placeholder="Ask anything...",
703
- scale=5, # Takes more horizontal space
704
- container=False, # Important for direct styling within Row
705
- elem_classes="gradio-textbox"
706
- )
707
- voice_select = gr.Dropdown(
708
- choices=list(VOICE_CHOICES.keys()),
709
- value=list(VOICE_CHOICES.keys())[0], # Default voice display name
710
- label="", # Visually hidden label
711
- scale=1, # Takes less space
712
- min_width=180, # Fixed width for dropdown
713
- container=False, # Important
714
- elem_classes="voice-selector gradio-dropdown"
715
- )
716
- search_btn = gr.Button(
717
- "Search",
718
- variant="primary",
719
- scale=0, # Minimal width needed for text
720
- min_width=100,
721
- elem_classes="gradio-button"
722
- )
723
 
724
- # Results Display Section (using Columns for side-by-side layout)
725
  with gr.Row(elem_classes="results-container", equal_height=False):
726
- # Left Column: Answer and Chat History
727
- with gr.Column(scale=3): # Takes 3 parts of the width
728
- with gr.Column(elem_classes="answer-box"):
729
- answer_output = gr.Markdown(value="*Your answer will appear here...*", elem_classes="markdown-content")
730
- # Audio player below the answer text
731
- audio_output = gr.Audio(
732
- label="Voice Response",
733
- type="numpy", # Expects (rate, numpy_array) tuple
734
- autoplay=False, # Don't autoplay by default
735
- show_label=False, # Hide the "Voice Response" label visually
736
- elem_classes="audio-player"
737
- )
738
-
739
- with gr.Accordion("Chat History", open=False, elem_classes="accordion"):
740
- chat_history_display = gr.Chatbot(
741
- label="Conversation",
742
- bubble_full_width=True, # Bubbles take full width
743
- height=400,
744
- elem_classes="chat-history"
745
- )
746
 
747
  # Right Column: Sources
748
- with gr.Column(scale=2): # Takes 2 parts of the width
749
- with gr.Column(elem_classes="sources-box"):
750
  gr.Markdown("### Sources")
751
- sources_output = gr.HTML(value="<div class='no-sources'>Sources will appear here after searching.</div>")
752
 
753
- # Example Prompts Section
754
  with gr.Row(elem_classes="examples-container"):
 
755
  gr.Examples(
756
  examples=[
757
  "Latest news about renewable energy",
758
- "Explain the concept of Large Language Models (LLMs)",
759
- "What are the symptoms and prevention tips for the flu?",
760
  "Compare Python and JavaScript for web development",
761
- "Summarize the main points of the Paris Agreement on climate change",
762
  ],
763
- inputs=search_input, # Clicking example populates this input
764
  label="Try these examples:",
765
- elem_classes="gradio-examples" # Add class for potential styling
766
  )
767
 
768
- # --- Event Handling ---
769
- async def handle_interaction(query, history, voice_display_name):
770
- """Wrapper to handle the async generator and update outputs."""
771
- print(f"[Interaction] Handling query: '{query}'")
772
- outputs = { # Dictionary to hold the latest state of outputs
773
- "answer": "...",
774
- "sources": "...",
775
- "button": gr.Button(value="Search", interactive=True),
776
- "history": history,
777
- "audio": None
778
- }
779
- try:
780
- # Iterate through the updates yielded by the async generator
781
- async for update_tuple in process_query_async(query, history, voice_display_name):
782
- # Unpack the tuple
783
- ans_out, src_out, btn_state, hist_display, aud_out = update_tuple
784
- # Update the outputs dictionary
785
- outputs["answer"] = ans_out
786
- outputs["sources"] = src_out
787
- outputs["button"] = btn_state # Can be a gr.Button update dict or object
788
- outputs["history"] = hist_display
789
- outputs["audio"] = aud_out
790
- # Yield the current state of all outputs
791
- yield outputs["answer"], outputs["sources"], outputs["button"], outputs["history"], outputs["audio"]
792
- except Exception as e:
793
- print(f"[Interaction] Error: {e}")
794
  print(traceback.format_exc())
795
- error_message = f"An unexpected error occurred: {e}"
796
- # Provide a final error state update
797
- final_error_history = history + [[query, f"*Error: {error_message}*"]] if query else history
798
  yield (
799
- error_message,
800
- "<div class='error'>Error processing request. Please check logs or try again.</div>",
801
- gr.Button(value="Search", interactive=True), # Re-enable button on error
802
- final_error_history,
803
- None
804
  )
805
 
806
- # Connect the handle_interaction function to the button click and input submit events
807
- outputs_list = [answer_output, sources_output, search_btn, chat_history_display, audio_output]
808
- inputs_list = [search_input, chat_history, voice_select] # Pass the dropdown component itself
809
 
 
810
  search_btn.click(
811
- fn=handle_interaction,
812
- inputs=inputs_list,
813
- outputs=outputs_list
814
  )
815
-
816
  search_input.submit(
817
- fn=handle_interaction,
818
- inputs=inputs_list,
819
- outputs=outputs_list
820
  )
821
 
822
  if __name__ == "__main__":
823
  print("Starting Gradio application...")
824
- # Launch the app with queuing enabled for handling multiple users
825
  demo.queue(max_size=20).launch(
826
- debug=True, # Enable Gradio debug mode for more logs
827
- share=True, # Create a public link (useful for Spaces)
828
- # server_name="0.0.0.0" # Bind to all interfaces if running locally and need external access
829
  )
 
1
  import gradio as gr
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ # import spaces # Removed as @spaces.GPU is not used with async
4
  from duckduckgo_search import DDGS
5
  import time
6
  import torch
 
8
  import os
9
  import subprocess
10
  import numpy as np
11
+ from typing import List, Dict, Tuple, Any, Optional, Union
12
  from functools import lru_cache
13
  import asyncio
14
  import threading
15
  from concurrent.futures import ThreadPoolExecutor
16
  import warnings
17
  import traceback # For detailed error logging
18
+ import re # For text cleaning
19
+ import shutil # For checking sudo
20
+ import html # For escaping HTML
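One import looks to be missing from this block: setup_tts_task below still manipulates sys.path, but the old in-function import sys is removed by this commit and no top-level replacement is added. A minimal fix sketch, assuming sys is not imported anywhere else in the file:

import sys  # used by setup_tts_task for sys.path handling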
 
 
21
 
22
  # --- Configuration ---
23
  MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
24
  MAX_SEARCH_RESULTS = 5
25
  TTS_SAMPLE_RATE = 24000
26
  MAX_TTS_CHARS = 1000 # Max characters for a single TTS chunk
27
+ MAX_NEW_TOKENS = 300
 
28
  TEMPERATURE = 0.7
29
  TOP_P = 0.95
30
  KOKORO_PATH = 'Kokoro-82M' # Path to TTS model directory
31
 
32
  # --- Initialization ---
33
+ # Use a ThreadPoolExecutor for blocking I/O or CPU-bound tasks
34
+ executor = ThreadPoolExecutor(max_workers=os.cpu_count() or 4) # Use available cores
35
 
36
+ # Suppress specific warnings
37
+ warnings.filterwarnings("ignore", category=UserWarning, message="TypedStorage is deprecated")
38
+ warnings.filterwarnings("ignore", message="Backend 'inductor' is not available.")
 
 
39
 
40
+ # --- LLM Initialization ---
41
+ llm_model: Optional[AutoModelForCausalLM] = None
42
+ llm_tokenizer: Optional[AutoTokenizer] = None
43
+ llm_device = "cpu" # Default device
44
+
45
+ try:
46
+ print("Initializing LLM...")
47
+ llm_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
48
+ llm_tokenizer.pad_token = llm_tokenizer.eos_token
49
+
50
+ if torch.cuda.is_available():
51
+ llm_device = "cuda"
52
+ torch_dtype = torch.float16
53
+ device_map = "auto" # Let accelerate handle distribution
54
+ print(f"CUDA detected. Loading model with device_map='{device_map}', dtype={torch_dtype}")
55
+ else:
56
+ llm_device = "cpu"
57
+ torch_dtype = torch.float32 # float32 for CPU
58
+ device_map = {"": "cpu"}
59
+ print(f"CUDA not found. Loading model on CPU with dtype={torch_dtype}")
60
 
61
+ llm_model = AutoModelForCausalLM.from_pretrained(
62
  MODEL_NAME,
63
  device_map=device_map,
64
+ low_cpu_mem_usage=True,
 
65
  torch_dtype=torch_dtype,
66
+ # attn_implementation="flash_attention_2" # Optional: Uncomment if flash-attn is installed and compatible GPU
67
  )
68
+ print(f"LLM loaded successfully. Device map: {llm_model.hf_device_map if hasattr(llm_model, 'hf_device_map') else 'N/A'}")
69
+ llm_model.eval() # Set to evaluation mode
 
70
 
71
  except Exception as e:
72
  print(f"FATAL: Error initializing LLM model: {str(e)}")
73
  print(traceback.format_exc())
74
+ # Depending on environment, you might exit or just disable LLM features
75
+ llm_model = None
76
+ llm_tokenizer = None
77
+ print("LLM features will be unavailable.")
78
 
79
+
80
+ # --- TTS Initialization ---
81
  VOICE_CHOICES = {
82
  '🇺🇸 Female (Default)': 'af',
83
  '🇺🇸 Bella': 'af_bella',
 
85
  '🇺🇸 Nicole': 'af_nicole'
86
  }
87
  TTS_ENABLED = False
88
+ tts_model: Optional[Any] = None # Define type more specifically if Kokoro provides it
89
+ voicepacks: Dict[str, Any] = {} # Cache voice packs
90
+ tts_device = "cpu" # Default device for TTS model
91
+
92
+ # Use a lock for thread-safe access during initialization if needed, though Thread ensures sequential execution
93
+ # tts_init_lock = threading.Lock()
94
 
95
+ def _run_subprocess(cmd: List[str], check: bool = True, cwd: Optional[str] = None) -> subprocess.CompletedProcess:
96
+ """Helper to run subprocess and capture output."""
97
+ print(f"Running command: {' '.join(cmd)}")
98
+ try:
99
+ result = subprocess.run(cmd, check=check, capture_output=True, text=True, cwd=cwd)
100
+ if result.stdout: print(f"Stdout: {result.stdout.strip()}")
101
+ if result.stderr: print(f"Stderr: {result.stderr.strip()}")
102
+ return result
103
+ except FileNotFoundError:
104
+ print(f"Error: Command not found - {cmd[0]}")
105
+ raise
106
+ except subprocess.CalledProcessError as e:
107
+ print(f"Error running command: {' '.join(e.cmd)}")
108
+ if e.stdout: print(f"Stdout: {e.stdout.strip()}")
109
+ if e.stderr: print(f"Stderr: {e.stderr.strip()}")
110
+ raise
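For reference, a minimal usage sketch of the helper above; the probe command is hypothetical and only illustrates the check / exception behaviour:

# Probe for git-lfs without aborting setup when it is absent.
try:
    _run_subprocess(['git', 'lfs', 'version'], check=False)
except FileNotFoundError:
    print('[TTS Setup] git not found on PATH')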
111
+
112
+ def setup_tts_task():
113
+ """Initializes Kokoro TTS model and dependencies."""
114
+ global TTS_ENABLED, tts_model, voicepacks, tts_device
115
+ print("[TTS Setup] Starting background initialization...")
116
+
117
+ # Determine TTS device
118
+ tts_device = "cuda" if torch.cuda.is_available() else "cpu"
119
+ print(f"[TTS Setup] Target device: {tts_device}")
120
 
 
121
  can_sudo = shutil.which('sudo') is not None
122
+ apt_cmd_prefix = ['sudo'] if can_sudo else []
123
 
124
  try:
125
+ # 1. Clone Kokoro Repo if needed
126
  if not os.path.exists(KOKORO_PATH):
127
+ print(f"[TTS Setup] Cloning repository to {KOKORO_PATH}...")
 
128
  try:
129
+ _run_subprocess(['git', 'lfs', 'install', '--system', '--skip-repo'])
130
+ except Exception as lfs_err:
131
+ print(f"[TTS Setup] Warning: git lfs install command failed: {lfs_err}. Continuing clone...")
132
+ _run_subprocess(['git', 'clone', 'https://huggingface.co/hexgrad/Kokoro-82M', KOKORO_PATH])
 
 
 
 
 
 
133
  try:
134
+ print("[TTS Setup] Running git lfs pull...")
135
+ _run_subprocess(['git', 'lfs', 'pull'], cwd=KOKORO_PATH)
136
+ except Exception as lfs_pull_err:
137
+ print(f"[TTS Setup] Warning: git lfs pull failed: {lfs_pull_err}")
 
 
 
138
  else:
139
+ print(f"[TTS Setup] Directory {KOKORO_PATH} already exists.")
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ # 2. Install espeak dependency
142
+ print("[TTS Setup] Checking/Installing espeak...")
143
  try:
144
+ _run_subprocess(apt_cmd_prefix + ['apt-get', 'update', '-qq'])
145
+ _run_subprocess(apt_cmd_prefix + ['apt-get', 'install', '-y', '-qq', 'espeak-ng'])
146
+ print("[TTS Setup] espeak-ng installed or already present.")
147
+ except Exception:
148
+ print("[TTS Setup] espeak-ng failed, trying espeak...")
 
 
149
  try:
150
+ _run_subprocess(apt_cmd_prefix + ['apt-get', 'install', '-y', '-qq', 'espeak'])
151
+ print("[TTS Setup] espeak installed or already present.")
152
+ except Exception as espeak_err:
153
+ print(f"[TTS Setup] ERROR: Failed to install both espeak-ng and espeak: {espeak_err}. TTS disabled.")
154
+ return # Critical dependency missing
155
+
156
+ # 3. Load Kokoro Model and Voices
 
157
  if os.path.exists(KOKORO_PATH):
158
+ sys_path_updated = False
159
  if KOKORO_PATH not in sys.path:
160
  sys.path.append(KOKORO_PATH)
161
+ sys_path_updated = True
162
  try:
163
  from models import build_model
164
+ from kokoro import generate as generate_tts_internal
165
 
166
+ globals()['build_model'] = build_model # Make available globally
 
167
  globals()['generate_tts_internal'] = generate_tts_internal
168
 
 
 
169
  model_file = os.path.join(KOKORO_PATH, 'kokoro-v0_19.pth')
 
170
  if not os.path.exists(model_file):
171
+ print(f"[TTS Setup] ERROR: Model file {model_file} not found. TTS disabled.")
172
+ return
173
+
174
+ print(f"[TTS Setup] Loading TTS model from {model_file} onto {tts_device}...")
175
+ tts_model = build_model(model_file, tts_device)
176
+ tts_model.eval() # Set to eval mode
177
+ print("[TTS Setup] TTS model loaded.")
178
+
179
+ # Load voices
180
+ loaded_voices = 0
 
 
 
 
 
181
  for voice_name, voice_id in VOICE_CHOICES.items():
182
  voice_file_path = os.path.join(KOKORO_PATH, 'voices', f'{voice_id}.pt')
183
  if os.path.exists(voice_file_path):
184
  try:
185
+ print(f"[TTS Setup] Loading voice: {voice_id} ({voice_name})")
186
+ # map_location ensures it loads to the correct device
187
+ voicepacks[voice_id] = torch.load(voice_file_path, map_location=tts_device)
188
+ loaded_voices += 1
189
  except Exception as e:
190
+ print(f"[TTS Setup] Warning: Failed to load voice {voice_id}: {str(e)}")
191
  else:
192
+ print(f"[TTS Setup] Info: Voice file {voice_file_path} not found, skipping.")
193
 
194
+ if loaded_voices == 0:
195
+ print("[TTS Setup] ERROR: No voicepacks could be loaded. TTS disabled.")
196
+ tts_model = None # Unload model if no voices
197
  return
198
 
 
 
 
 
 
 
 
 
 
 
199
  TTS_ENABLED = True
200
+ print(f"[TTS Setup] Initialization successful. {loaded_voices} voices loaded. TTS Enabled: {TTS_ENABLED}")
201
 
202
  except ImportError as ie:
203
+ print(f"[TTS Setup] ERROR: Failed to import Kokoro modules: {ie}. Check clone and path. TTS disabled.")
204
+ except Exception as load_err:
205
+ print(f"[TTS Setup] ERROR: Failed loading TTS model/voices: {load_err}. TTS disabled.")
206
  print(traceback.format_exc())
207
+ finally:
208
+ # Clean up sys.path if modified
209
+ if sys_path_updated and KOKORO_PATH in sys.path:
210
+ sys.path.remove(KOKORO_PATH)
211
  else:
212
+ print(f"[TTS Setup] ERROR: {KOKORO_PATH} directory not found. TTS disabled.")
213
+
 
 
 
 
214
  except Exception as e:
215
+ print(f"[TTS Setup] ERROR: Unexpected error during setup: {str(e)}")
216
  print(traceback.format_exc())
217
+ # Ensure TTS is marked as disabled
218
  TTS_ENABLED = False
219
+ tts_model = None
220
+ voicepacks.clear()
221
 
222
+ # Start TTS setup in a background thread
223
+ print("Starting TTS setup thread...")
224
+ tts_setup_thread = threading.Thread(target=setup_tts_task, daemon=True)
225
+ tts_setup_thread.start()
 
226
 
227
+
228
+ # --- Core Functions ---
229
 
230
  @lru_cache(maxsize=128)
231
+ def get_web_results_sync(query: str, max_results: int = MAX_SEARCH_RESULTS) -> List[Dict[str, Any]]:
232
+ """Synchronous web search function with caching."""
233
+ print(f"[Web Search] Searching (sync): '{query}' (max_results={max_results})")
234
  try:
 
235
  with DDGS() as ddgs:
236
+ results = list(ddgs.text(query, max_results=max_results, safesearch='moderate', timelimit='y'))
 
237
  print(f"[Web Search] Found {len(results)} results.")
238
+ formatted = [{
239
+ "id": i + 1,
240
+ "title": res.get("title", "No Title"),
241
+ "snippet": res.get("body", "No Snippet"),
242
+ "url": res.get("href", "#"),
243
+ } for i, res in enumerate(results)]
244
+ return formatted
 
 
245
  except Exception as e:
246
  print(f"[Web Search] Error: {e}")
247
  print(traceback.format_exc())
248
  return []
249
 
250
+ def format_llm_prompt(query: str, context: List[Dict[str, Any]]) -> str:
251
+ """Formats the prompt for the LLM, including context and instructions."""
252
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
253
+ context_str = "\n\n".join(
254
+ [f"[{res['id']}] {res['title']}\n{res['snippet']}" for res in context]
255
+ ) if context else "No relevant web context found."
256
 
257
+ return f"""You are a helpful AI assistant. Answer the user's query based *only* on the provided web search context.
258
+ Instructions:
259
+ - Synthesize information from the context to answer concisely.
260
+ - Cite sources using bracket notation like [1], [2], etc., corresponding to the context IDs.
261
+ - If the context is insufficient, state that clearly. Do not add external information.
262
+ - Use markdown for formatting.
263
 
264
  Current Time: {current_time}
265
 
 
271
  User Query: {query}
272
 
273
  Answer:"""
 
 
274
 
275
+ def format_sources_html(web_results: List[Dict[str, Any]]) -> str:
276
+ """Formats search results into HTML for display."""
277
  if not web_results:
278
  return "<div class='no-sources'>No sources found for this query.</div>"
279
+ items_html = ""
 
280
  for res in web_results:
281
+ title_safe = html.escape(res.get("title", "Source"))
282
+ snippet_safe = html.escape(res.get("snippet", "")[:150] + ("..." if len(res.get("snippet", "")) > 150 else ""))
283
  url = res.get("url", "#")
284
+ items_html += f"""
 
 
 
 
 
285
  <div class='source-item'>
286
  <div class='source-number'>[{res['id']}]</div>
287
  <div class='source-content'>
 
290
  </div>
291
  </div>
292
  """
293
+ return f"<div class='sources-container'>{items_html}</div>"
 
294
 
295
+ async def generate_llm_answer(prompt: str) -> str:
296
+ """Generates answer using the loaded LLM (Async Wrapper)."""
297
+ if not llm_model or not llm_tokenizer:
298
+ return "Error: LLM model is not available."
299
 
300
+ print(f"[LLM Generate] Requesting generation (prompt length {len(prompt)})...")
 
 
 
301
  start_time = time.time()
302
  try:
303
+ inputs = llm_tokenizer(
 
304
  prompt,
305
  return_tensors="pt",
306
  padding=True,
307
  truncation=True,
308
+ max_length=1024, # Consider model's actual max length
309
  return_attention_mask=True
310
+ ).to(llm_model.device) # Ensure inputs are on the same device as model parts
311
+
312
+ with torch.inference_mode(), torch.cuda.amp.autocast(enabled=(llm_model.dtype == torch.float16)):
313
+ # Run blocking model.generate in the executor thread pool
314
+ outputs = await asyncio.get_event_loop().run_in_executor(
315
+ executor,
316
+ llm_model.generate,
317
+ inputs.input_ids,
318
  attention_mask=inputs.attention_mask,
319
  max_new_tokens=MAX_NEW_TOKENS,
320
  temperature=TEMPERATURE,
321
  top_p=TOP_P,
322
+ pad_token_id=llm_tokenizer.eos_token_id,
323
+ eos_token_id=llm_tokenizer.eos_token_id,
324
  do_sample=True,
325
  num_return_sequences=1
326
  )
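A caveat on the call above: loop.run_in_executor only forwards positional arguments, so the keyword arguments passed here would raise a TypeError at runtime. A possible workaround, sketched with functools.partial and the names already defined in this file:

from functools import partial

generate_fn = partial(
    llm_model.generate,
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    pad_token_id=llm_tokenizer.eos_token_id,
    eos_token_id=llm_tokenizer.eos_token_id,
    do_sample=True,
    num_return_sequences=1,
)
outputs = await asyncio.get_event_loop().run_in_executor(executor, generate_fn)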
327
 
328
+ # Decode only newly generated tokens relative to input
329
+ output_ids = outputs[0][inputs.input_ids.shape[1]:]
330
+ answer_part = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
331
+
332
+ # Handle potential empty generation
333
+ if not answer_part:
334
+ # Sometimes the split method above is needed if the model includes the prompt
335
+ full_output = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
336
+ answer_marker = "Answer:"
337
+ marker_index = full_output.rfind(answer_marker)
338
+ if marker_index != -1:
339
+ answer_part = full_output[marker_index + len(answer_marker):].strip()
340
+ else:
341
+ answer_part = "*Model generated an empty response.*" # Fallback message
 
 
 
 
 
 
 
342
 
343
  end_time = time.time()
344
+ print(f"[LLM Generate] Generation complete in {end_time - start_time:.2f}s. Length: {len(answer_part)}")
345
+ return answer_part
346
 
347
  except Exception as e:
348
  print(f"[LLM Generate] Error: {e}")
349
  print(traceback.format_exc())
350
+ return f"Error during answer generation: {str(e)}"
 
 
 
 
 
351
 
352
+ async def generate_tts_speech(text: str, voice_id: str = 'af') -> Optional[Tuple[int, np.ndarray]]:
353
+ """Generates speech using the loaded TTS model (Async Wrapper)."""
354
+ if not TTS_ENABLED or not tts_model or 'generate_tts_internal' not in globals():
355
+ print("[TTS Generate] Skipping: TTS not ready.")
 
356
  return None
357
  if not text or not text.strip():
358
+ print("[TTS Generate] Skipping: Empty text.")
359
  return None
360
 
361
+ print(f"[TTS Generate] Requesting speech (length {len(text)}, voice '{voice_id}')...")
362
  start_time = time.time()
363
 
364
  try:
365
+ # Verify voicepack availability
366
+ actual_voice_id = voice_id
367
+ if voice_id not in voicepacks:
368
+ print(f"[TTS Generate] Warning: Voice '{voice_id}' not loaded. Trying default 'af'.")
369
+ actual_voice_id = 'af'
370
+ if 'af' not in voicepacks:
371
+ print("[TTS Generate] Error: Default voice 'af' also not available.")
 
 
372
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
+ # Clean text for TTS
375
+ clean_text = re.sub(r'\[\d+\](\[\d+\])*', '', text) # Remove citations like [1], [2][3]
376
+ clean_text = re.sub(r'[\*\#\`]', '', clean_text) # Remove markdown symbols
377
+ clean_text = ' '.join(clean_text.split()) # Normalize whitespace
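A quick illustration of the three cleaning steps above on a made-up answer string:

# '**Solar** adoption is rising [1][2].  See `report`.'
#  -> citations removed:    '**Solar** adoption is rising .  See `report`.'
#  -> markdown stripped:    'Solar adoption is rising .  See report.'
#  -> whitespace normalized: 'Solar adoption is rising . See report.'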
378
+
379
+ if not clean_text: return None # Skip if empty after cleaning
380
+
381
+ # Truncate if necessary
382
  if len(clean_text) > MAX_TTS_CHARS:
383
+ print(f"[TTS Generate] Truncating text from {len(clean_text)} to {MAX_TTS_CHARS} chars.")
384
  clean_text = clean_text[:MAX_TTS_CHARS]
385
+ last_punct = max(clean_text.rfind(p) for p in '.?! ')
386
+ if last_punct != -1: clean_text = clean_text[:last_punct+1]
387
+ clean_text += "..."
 
 
388
 
389
  print(f"[TTS Generate] Generating audio for: '{clean_text[:100]}...'")
390
  gen_func = globals()['generate_tts_internal']
391
+ voice_pack_data = voicepacks[actual_voice_id]
392
 
393
+ # Run blocking TTS generation in the executor thread pool
394
+ # Assuming 'afr' is the correct language code for Kokoro's default voices
395
  audio_data, _ = await asyncio.get_event_loop().run_in_executor(
396
  executor,
397
  gen_func,
398
+ tts_model, # The loaded model object
399
+ clean_text, # The cleaned text string
400
+ voice_pack_data,# The loaded voice pack tensor/dict
401
+ 'afr' # Language code (verify this is correct)
402
  )
403
 
404
  if isinstance(audio_data, torch.Tensor):
 
405
  audio_np = audio_data.detach().cpu().numpy()
406
  elif isinstance(audio_data, np.ndarray):
407
  audio_np = audio_data
408
  else:
409
+ print("[TTS Generate] Warning: Unexpected audio data type.")
410
  return None
411
 
412
+ # Ensure audio is 1D float32
413
+ audio_np = audio_np.flatten().astype(np.float32)
414
+
415
  end_time = time.time()
416
+ print(f"[TTS Generate] Audio generated in {end_time - start_time:.2f}s. Shape: {audio_np.shape}")
 
 
 
417
  return (TTS_SAMPLE_RATE, audio_np)
418
 
419
  except Exception as e:
 
421
  print(traceback.format_exc())
422
  return None
423
 
424
+ def get_voice_id_from_display(voice_display_name: str) -> str:
 
425
  """Maps the user-friendly voice name to the internal voice ID."""
426
+ return VOICE_CHOICES.get(voice_display_name, 'af') # Default to 'af'
427
+
428
+
429
+ # --- Gradio Interaction Logic ---
430
 
431
+ # Define type for chat history using the 'messages' format
432
+ ChatHistoryType = List[Dict[str, str]]
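Note that history entries in this role/content shape only render if the Chatbot component is created in messages mode (supported in recent Gradio releases); the previous UI used pair-style history. A minimal sketch, reusing the old variable name as an assumption about the new layout:

chat_history_display = gr.Chatbot(label="Conversation", type="messages", height=400)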
433
 
434
+ async def handle_interaction(
435
+ query: str,
436
+ history: ChatHistoryType,
437
+ selected_voice_display_name: str
438
+ ):
439
+ """Main async generator function to handle user queries and update Gradio UI."""
440
+ print(f"\n--- Handling Query ---")
441
  print(f"Query: '{query}', Voice: '{selected_voice_display_name}'")
442
 
443
  if not query or not query.strip():
444
  print("Empty query received.")
445
+ # Need to yield the current state for all outputs
446
+ yield history, "*Please enter a query.*", "<div class='no-sources'>Enter a query to search.</div>", None, gr.Button(value="Search", interactive=True)
 
447
  return
448
 
449
+ # Append user message to history
450
+ current_history = history + [{"role": "user", "content": query}]
451
+ # Add placeholder for assistant response
452
+ current_history.append({"role": "assistant", "content": "*Searching...*"})
453
 
454
+ # 1. Initial State: Searching
455
  yield (
 
 
 
456
  current_history,
457
+ "*Searching the web...*", # Update answer area
458
+ "<div class='searching'><span>Searching the web...</span></div>", # Update sources area
459
+ None, # No audio yet
460
+ gr.Button(value="Searching...", interactive=False) # Update button state
461
  )
462
 
463
+ # 2. Perform Web Search (in executor)
464
+ web_results = await asyncio.get_event_loop().run_in_executor(
465
+ executor, get_web_results_sync, query
466
+ )
467
+ sources_html = format_sources_html(web_results)
468
 
469
+ # Update state: Generating Answer
470
+ current_history[-1]["content"] = "*Generating answer...*" # Update assistant placeholder
471
  yield (
472
+ current_history,
473
+ "*Generating answer...*", # Update answer area
474
+ sources_html, # Show sources
475
+ None,
476
+ gr.Button(value="Generating...", interactive=False)
477
  )
478
 
479
+ # 3. Generate LLM Answer (async)
480
+ llm_prompt = format_llm_prompt(query, web_results)
481
+ final_answer = await generate_llm_answer(llm_prompt)
482
 
483
+ # Update assistant message in history with the final answer
484
+ current_history[-1]["content"] = final_answer
485
 
486
+ # Update state: Generating Audio (if applicable)
487
  yield (
488
+ current_history,
489
+ final_answer, # Show final answer
490
  sources_html,
491
+ None,
492
+ gr.Button(value="Audio...", interactive=False) if TTS_ENABLED else gr.Button(value="Search", interactive=True) # Enable search if TTS disabled
 
493
  )
494
 
495
+ # 4. Generate TTS Speech (async)
496
+ audio_output_data = None
497
+ tts_status_message = ""
498
+ if not TTS_ENABLED:
499
+ if tts_setup_thread.is_alive():
500
+ tts_status_message = "\n\n*(TTS initializing...)*"
501
  else:
502
+ tts_status_message = "\n\n*(TTS disabled or failed)*"
503
+ elif final_answer and not final_answer.startswith("Error"):
504
+ voice_id = get_voice_id_from_display(selected_voice_display_name)
505
+ audio_output_data = await generate_tts_speech(final_answer, voice_id)
506
+ if audio_output_data is None:
507
+ tts_status_message = "\n\n*(Audio generation failed)*"
508
+
509
+ # 5. Final State: Show all results
510
+ final_answer_with_status = final_answer + tts_status_message
511
+ current_history[-1]["content"] = final_answer_with_status # Update history with status msg too
512
+
513
+ print("--- Query Handling Complete ---")
514
  yield (
515
+ current_history,
516
+ final_answer_with_status, # Show answer + TTS status
517
  sources_html,
518
+ audio_output_data, # Output audio data (or None)
519
+ gr.Button(value="Search", interactive=True) # Re-enable button
 
520
  )
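# Stand-alone consumption sketch (for local debugging only; the Gradio wiring below is what
# actually drives this generator). The unpacking assumes the five-element tuples yielded above.
#   async def _debug_run():
#       async for update in handle_interaction("latest AI news", [], '🇺🇸 Female (Default)'):
#           history, answer, sources, audio, button = update
#           print(answer[:80])
#   # asyncio.run(_debug_run())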
521
 
522
 
523
+ # --- Gradio UI Definition ---
524
+ # (CSS remains largely the same - ensure it targets default Gradio classes if elem_classes was removed)
525
  css = """
526
+ /* ... [Your existing refined CSS, but remove selectors using .gradio-examples if you were using it] ... */
527
+ /* Example: Style examples container via its parent or default class if needed */
528
+ /* .examples-container .gradio-examples { ... } */ /* This might still work depending on structure */
529
  .gradio-container { max-width: 1200px !important; background-color: #f7f7f8 !important; }
530
  #header { text-align: center; margin-bottom: 2rem; padding: 2rem 0; background: linear-gradient(135deg, #1a1b1e, #2d2e32); border-radius: 12px; color: white; box-shadow: 0 8px 32px rgba(0,0,0,0.2); }
531
  #header h1 { color: white; font-size: 2.5rem; margin-bottom: 0.5rem; text-shadow: 0 2px 4px rgba(0,0,0,0.3); }
 
550
  .sources-container { margin-top: 0; }
551
  .source-item { display: flex; padding: 10px 0; margin: 0; border-bottom: 1px solid #f3f4f6; transition: background-color 0.2s; }
552
  .source-item:last-child { border-bottom: none; }
 
553
  .source-number { font-weight: bold; margin-right: 12px; color: #6b7280; width: 20px; text-align: right; flex-shrink: 0;}
554
  .source-content { flex: 1; min-width: 0;} /* Allow content to shrink */
555
  .source-title { color: #2563eb; font-weight: 500; text-decoration: none; display: block; margin-bottom: 4px; transition: all 0.2s; font-size: 0.95em; white-space: nowrap; overflow: hidden; text-overflow: ellipsis;}
556
  .source-title:hover { color: #1d4ed8; text-decoration: underline; }
 
557
  .source-snippet { color: #4b5563; font-size: 0.9em; line-height: 1.5; }
558
+ .chat-history { /* Style the chatbot container */ max-height: 400px; overflow-y: auto; background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 8px; margin-top: 1rem; scrollbar-width: thin; scrollbar-color: #d1d5db #f9fafb; }
559
+ .chat-history > div { padding: 1rem; } /* Add padding inside the chatbot display area */
560
  .chat-history::-webkit-scrollbar { width: 6px; }
561
  .chat-history::-webkit-scrollbar-track { background: #f9fafb; }
562
  .chat-history::-webkit-scrollbar-thumb { background-color: #d1d5db; border-radius: 20px; }
563
  .examples-container { background: #f9fafb; border-radius: 8px; padding: 1rem; margin-top: 1rem; border: 1px solid #e5e7eb; }
564
+ /* Default styling for example buttons (since elem_classes might not work) */
565
+ .examples-container button { background: white !important; border: 1px solid #d1d5db !important; color: #374151 !important; transition: all 0.2s; margin: 4px !important; font-size: 0.9em !important; padding: 6px 12px !important; border-radius: 4px !important; }
566
  .examples-container button:hover { background: #f3f4f6 !important; border-color: #adb5bd !important; }
567
  .markdown-content { color: #374151 !important; font-size: 1rem; line-height: 1.7; }
568
  .markdown-content h1, .markdown-content h2, .markdown-content h3 { color: #111827 !important; margin-top: 1.2em !important; margin-bottom: 0.6em !important; font-weight: 600; }
 
579
  .markdown-content th, .markdown-content td { padding: 8px 12px !important; border: 1px solid #d1d5db !important; text-align: left;}
580
  .markdown-content th { background: #f9fafb !important; font-weight: 600; }
581
  .accordion { background: #f9fafb !important; border: 1px solid #e5e7eb !important; border-radius: 8px !important; margin-top: 1rem !important; box-shadow: none !important; }
582
+ .accordion > .label-wrap { padding: 10px 15px !important; }
583
  .voice-selector { margin: 0; padding: 0; height: 100%; }
584
  .voice-selector div[data-testid="dropdown"] { height: 100% !important; border-radius: 0 !important;}
585
  .voice-selector select { background: white !important; color: #374151 !important; border: 1px solid #d1d5db !important; border-left: none !important; border-right: none !important; border-radius: 0 !important; height: 100% !important; padding: 0 10px !important; transition: all 0.2s; appearance: none !important; -webkit-appearance: none !important; background-image: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3e%3cpath stroke='%236b7280' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='M6 8l4 4 4-4'/%3e%3c/svg%3e") !important; background-position: right 0.5rem center !important; background-repeat: no-repeat !important; background-size: 1.5em 1.5em !important; padding-right: 2.5rem !important; }
 
592
  .no-sources { padding: 1rem; text-align: center; color: #6b7280; background: #f9fafb; border-radius: 8px; border: 1px solid #e5e7eb;}
593
  @keyframes pulse { 0% { opacity: 0.7; } 50% { opacity: 1; } 100% { opacity: 0.7; } }
594
  .searching span { animation: pulse 1.5s infinite ease-in-out; display: inline-block; }
595
+ /* Dark Mode Styles (Optional - keep if needed) */
596
  .dark .gradio-container { background-color: #111827 !important; }
597
  .dark #header { background: linear-gradient(135deg, #1f2937, #374151); }
598
  .dark #header h3 { color: #9ca3af; }
 
614
  .dark .source-title { color: #60a5fa; }
615
  .dark .source-title:hover { color: #93c5fd; }
616
  .dark .source-snippet { color: #d1d5db; }
617
+ .dark .chat-history { background: #374151; border-color: #4b5563; scrollbar-color: #4b5563 #374151; color: #d1d5db;}
618
  .dark .chat-history::-webkit-scrollbar-track { background: #374151; }
619
  .dark .chat-history::-webkit-scrollbar-thumb { background-color: #4b5563; }
620
  .dark .examples-container { background: #374151; border-color: #4b5563; }
 
631
  .dark .markdown-content th, .dark .markdown-content td { border-color: #4b5563 !important; }
632
  .dark .markdown-content th { background: #374151 !important; }
633
  .dark .accordion { background: #374151 !important; border-color: #4b5563 !important; }
634
+ .dark .accordion > .label-wrap { color: #d1d5db !important; }
635
  .dark .voice-selector select { background: #1f2937 !important; color: #d1d5db !important; border-color: #4b5563 !important; background-image: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3e%3cpath stroke='%239ca3af' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='M6 8l4 4 4-4'/%3e%3c/svg%3e") !important;}
636
  .dark .voice-selector select:focus { border-color: #3b82f6 !important; }
637
  .dark .audio-player { background: #374151 !important; border-color: #4b5563;}
638
+ .dark .audio-player audio::-webkit-media-controls-panel { background-color: #374151; }
639
  .dark .audio-player audio::-webkit-media-controls-play-button { color: #d1d5db; }
640
  .dark .audio-player audio::-webkit-media-controls-current-time-display { color: #9ca3af; }
641
  .dark .audio-player audio::-webkit-media-controls-time-remaining-display { color: #9ca3af; }
 
644
  .dark .no-sources { background: #374151; color: #9ca3af; border-color: #4b5563;}
645
  """
646
 
647
+ import sys # Needed for sys.path manipulation in TTS setup
648
+
649
  with gr.Blocks(title="AI Search Assistant", css=css, theme=gr.themes.Default(primary_hue="blue")) as demo:
650
+ # Use gr.State to store the chat history in the 'messages' format
651
+ chat_history_state = gr.State([])
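# gr.State is session-scoped: each browser session gets its own copy of this history list,
# which is passed back in as an input on every search event below.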
652
 
653
+ with gr.Column(): # Main container
654
+ # Header
655
  with gr.Column(elem_id="header"):
656
  gr.Markdown("# 🔍 AI Search Assistant")
657
  gr.Markdown("### Powered by DeepSeek & Real-time Web Results with Voice")
658
 
659
+ # Search Area
660
  with gr.Column(elem_classes="search-container"):
661
+ with gr.Row(elem_classes="search-box", equal_height=False):
662
+ search_input = gr.Textbox(label="", placeholder="Ask anything...", scale=5, container=False)
663
+ voice_select = gr.Dropdown(choices=list(VOICE_CHOICES.keys()), value=list(VOICE_CHOICES.keys())[0], label="", scale=1, min_width=180, container=False, elem_classes="voice-selector")
664
+ search_btn = gr.Button("Search", variant="primary", scale=0, min_width=100)
665
 
666
+ # Results Area
667
  with gr.Row(elem_classes="results-container", equal_height=False):
668
+ # Left Column: Answer & History
669
+ with gr.Column(scale=3):
670
+ # Chatbot display (uses 'messages' format now)
671
+ chatbot_display = gr.Chatbot(
672
+ label="Conversation",
673
+ bubble_full_width=True,
674
+ height=500,
675
+ elem_classes="chat-history",
676
+ type="messages", # Use the recommended type
677
+ avatar_images=(None, os.path.join(KOKORO_PATH, "icon.png") if os.path.exists(os.path.join(KOKORO_PATH, "icon.png")) else None) # Optional: Add avatar for assistant
678
+ )
679
+ # Separate Markdown for status/intermediate answer
680
+ answer_status_output = gr.Markdown(value="*Enter a query to start.*", elem_classes="answer-box markdown-content")
681
+ # Audio Output
682
+ audio_player = gr.Audio(label="Voice Response", type="numpy", autoplay=False, show_label=False, elem_classes="audio-player")
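# With type="numpy", gr.Audio expects a (sample_rate, np.ndarray) tuple, which matches the
# (TTS_SAMPLE_RATE, audio_np) value returned by generate_tts_speech.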
683
 
684
  # Right Column: Sources
685
+ with gr.Column(scale=2):
686
+ with gr.Column(elem_classes="sources-box"):
687
  gr.Markdown("### Sources")
688
+ sources_output_html = gr.HTML(value="<div class='no-sources'>Sources will appear here.</div>")
689
 
690
+ # Examples Area
691
  with gr.Row(elem_classes="examples-container"):
692
+ # REMOVED elem_classes from gr.Examples
693
  gr.Examples(
694
  examples=[
695
  "Latest news about renewable energy",
696
+ "Explain Large Language Models (LLMs)",
697
+ "Symptoms and prevention tips for the flu",
698
  "Compare Python and JavaScript for web development",
699
+ "Summarize the main points of the Paris Agreement",
700
  ],
701
+ inputs=search_input,
702
  label="Try these examples:",
 
703
  )
704
 
705
+ # --- Event Handling Setup ---
706
+ # Define the inputs and outputs for the Gradio event triggers
707
+ event_inputs = [search_input, chat_history_state, voice_select]
708
+ event_outputs = [
709
+ chatbot_display, # Updated chat history
710
+ answer_status_output, # Status or final answer text
711
+ sources_output_html, # Formatted sources
712
+ audio_player, # Audio data
713
+ search_btn # Button state (enabled/disabled)
714
+ ]
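# Each tuple yielded by stream_interaction_updates maps positionally onto this list:
# (chatbot_display, answer_status_output, sources_output_html, audio_player, search_btn).
# Note that chat_history_state is read as an input but is not listed here, so the stored
# history is never written back between turns; add it to the outputs (and yield it) if
# multi-turn memory across searches is wanted.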
715
+
716
+ # Create a wrapper to adapt the async generator for Gradio's streaming updates
717
+ async def stream_interaction_updates(query, history, voice_display_name):
718
+ try:
719
+ # Iterate through the states yielded by the handler
720
+ async for state_update in handle_interaction(query, history, voice_display_name):
721
+ yield state_update # Yield the tuple of output values
722
+ except Exception as e:
723
+ print(f"[Gradio Stream] Error during interaction: {e}")
724
  print(traceback.format_exc())
725
+ # Yield a final error state to the UI
726
+ error_history = history + [{"role":"user", "content":query}, {"role":"assistant", "content":f"*Error: {e}*"}]
 
727
  yield (
728
+ error_history,
729
+ f"An error occurred: {e}",
730
+ "<div class='error'>Request failed.</div>",
731
+ None,
732
+ gr.Button(value="Search", interactive=True)
733
  )
734
+ finally:
735
+ # Clear the text input after processing is complete (or errored out)
736
+ # We need to yield the final state *plus* the cleared input
737
+ # This requires adding search_input to the outputs list for the event triggers
738
+ # For now, let's not clear it automatically to avoid complexity.
739
+ # yield (*final_state_tuple, gr.Textbox(value="")) # Example if clearing input
740
+ print("[Gradio Stream] Interaction stream finished.")
741
 
742
 
743
+ # Connect the streaming function to the button click and input submit events
744
  search_btn.click(
745
+ fn=stream_interaction_updates,
746
+ inputs=event_inputs,
747
+ outputs=event_outputs
748
  )
 
749
  search_input.submit(
750
+ fn=stream_interaction_updates,
751
+ inputs=event_inputs,
752
+ outputs=event_outputs
753
  )
754
 
755
  if __name__ == "__main__":
756
  print("Starting Gradio application...")
 
757
  demo.queue(max_size=20).launch(
758
+ debug=True,
759
+ share=True,
760
+ # server_name="0.0.0.0" # Optional: Bind to all interfaces
761
  )
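# demo.queue() enables the request queue that Gradio uses to stream each intermediate yield
# from the async generator to the browser; max_size=20 caps how many requests may wait.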