"""AstroSage: a Gradio chat UI over the AstroSage-8B GGUF model.

Downloads the quantized weights from the Hugging Face Hub, serves them with
llama-cpp-python, and streams replies into a Gradio Blocks chat interface
(Gradio "messages" history format throughout).
"""

import multiprocessing
import random

import gradio as gr
import spaces
import torch
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Use every available CPU core for llama.cpp's CPU-side work.
n_cores = multiprocessing.cpu_count()

# Download the quantized model weights (cached locally by huggingface_hub).
model_path = hf_hub_download(
    repo_id="AstroMLab/AstroSage-8B-GGUF",
    filename="AstroSage-8B-Q8_0.gguf",
)

# llama.cpp parameters tuned for an A100-class GPU.
# NOTE(fix): removed `use_gpu=True` — that is not a llama-cpp-python `Llama`
# parameter; GPU offload is controlled entirely by `n_gpu_layers`.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,            # keep the context window reasonable
    n_threads=n_cores,     # use all available CPU cores
    n_batch=512,           # larger batch for faster prompt processing
    n_gpu_layers=35,       # offload most layers to the GPU
    chat_format="llama-3",
    seed=42,
    f16_kv=True,           # FP16 key/value cache
    logits_all=False,
    use_mmap=False,        # disable memory mapping for faster loading
    tensor_split=None,     # let llama.cpp handle tensor splitting
)

# Enable CUDA fast paths when a GPU is present.
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True  # TF32 matmul on Ampere+
    torch.backends.cudnn.benchmark = True         # enable the cuDNN autotuner

# One of these is shown at random when a new chat starts.
GREETING_MESSAGES = [
    "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
    "Welcome to our cosmic journey! I am AstroSage. How may I assist you in understanding the universe?",
    "AstroSage here. Ready to explore the mysteries of space and time. How may I be of assistance?",
    "The universe awaits! I'm AstroSage. What astronomical wonders shall we discuss?",
]


def user(user_message, history):
    """Append the user's message to the chat history and clear the textbox.

    Args:
        user_message: Text the user just submitted.
        history: Chat history as a list of ``{"role", "content"}`` dicts
            (Gradio "messages" format), or ``None`` for a fresh chat.

    Returns:
        Tuple of ``("", new_history)`` — the empty string resets the input box.
    """
    if history is None:
        history = []
    return "", history + [{"role": "user", "content": user_message}]


@spaces.GPU
def bot(history):
    """Generate and stream the assistant's reply for the latest user message.

    Yields successive copies of ``history`` whose final assistant message
    grows as tokens stream in from the model.
    """
    if not history:
        history = []

    # Keep only the last few turns so the prompt stays well inside n_ctx.
    recent_history = history[-5:]

    messages = [
        {
            "role": "system",
            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy.",
        }
    ]

    # Replay prior turns — everything except the message we are answering now.
    for message in recent_history[:-1]:
        messages.append({"role": message["role"], "content": message["content"]})

    # The current user message is always the last entry in the full history.
    messages.append({"role": "user", "content": history[-1]["content"]})

    # Placeholder assistant message, filled in incrementally below.
    history.append({"role": "assistant", "content": ""})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
        top_k=40,            # top-k sampling
        repeat_penalty=1.1,  # slight penalty for repetition
        mirostat_mode=2,     # Mirostat v2 sampling
        mirostat_tau=5.0,
        mirostat_eta=0.1,
    )

    for chunk in response:
        if not chunk:
            continue
        delta = chunk["choices"][0]["delta"]
        # Only update the UI when the chunk actually carries new text.
        if "content" in delta:
            history[-1]["content"] += delta["content"]
            yield history


def initial_greeting():
    """Return a one-message history containing a random assistant greeting."""
    return [{"role": "assistant", "content": random.choice(GREETING_MESSAGES)}]


# Custom CSS for a space theme.
custom_css = """
#component-0 {
    background-color: #1a1a2e;
    border-radius: 15px;
    padding: 20px;
}
.dark {
    background-color: #0f0f1a;
}
.contain {
    max-width: 1200px !important;
}
"""

# Build the Gradio interface.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
    gr.Markdown(
        """
        # 🌌 AstroSage: Your Cosmic AI Companion

        Welcome to AstroSage, an advanced AI assistant specializing in astronomy,
        astrophysics, and cosmology. Powered by the AstroSage-8B model, I'm here
        to help you explore the wonders of the universe!

        ### What Can I Help You With?
        - 🪐 Explanations of astronomical phenomena
        - 🚀 Space exploration and missions
        - ⭐ Stars, galaxies, and cosmology
        - 🌍 Planetary science and exoplanets
        - 📊 Astrophysics concepts and theories
        - 🔭 Astronomical instruments and observations

        Just type your question below and let's embark on a cosmic journey together!
        """
    )

    chatbot = gr.Chatbot(
        label="Chat with AstroSage",
        bubble_full_width=False,
        show_label=True,
        height=450,
        type="messages",
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Type your message here",
            placeholder="Ask me anything about space and astronomy...",
            scale=9,
        )
        clear = gr.Button("Clear Chat", scale=1)

    # Example questions for a quick start.
    gr.Examples(
        examples=[
            "What is a black hole and how does it form?",
            "Can you explain the life cycle of a star?",
            "What are exoplanets and how do we detect them?",
            "Tell me about the James Webb Space Telescope.",
            "What is dark matter and why is it important?",
        ],
        inputs=msg,
        label="Example Questions",
    )

    # user() echoes instantly (unqueued); bot() streams through the queue.
    # NOTE(fix): removed `batch=True, max_batch_size=4` — bot() is not written
    # as a Gradio batch function (it operates on a single history, not a list
    # of histories) and would crash under batching. Parallelism is provided by
    # the queue's concurrency limit below.
    msg.submit(
        user, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot, chatbot, chatbot, queue=True
    )

    # Clear button: returning None resets the Chatbot component.
    clear.click(lambda: None, None, chatbot, queue=False)

    # Seed a new session with a greeting.
    demo.load(initial_greeting, None, chatbot, queue=False)

# Launch the app.
if __name__ == "__main__":
    # NOTE(fix): `concurrency_count` was removed in Gradio 4 (which is required
    # for Chatbot(type="messages")); the modern equivalent of "allow 2
    # concurrent requests" is `default_concurrency_limit`.
    demo.queue(default_concurrency_limit=2)
    demo.launch()