import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import random
import spaces
import torch

# Get the number of available CPU cores
import multiprocessing
n_cores = multiprocessing.cpu_count()
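# Note: in containers (e.g. Hugging Face Spaces), cpu_count() reports the host's
# cores rather than the allocation; on Linux, len(os.sched_getaffinity(0))
# reflects the cores actually usable by this process.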

# Download the quantized model weights from the Hugging Face Hub
model_path = hf_hub_download(
    repo_id="AstroMLab/AstroSage-8B-GGUF",
    filename="AstroSage-8B-Q8_0.gguf"
)

# LLaMA parameters tuned for an A100
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Keep the context window modest to bound KV-cache memory
    n_threads=n_cores,  # Use all available CPU cores
    n_batch=512,  # Larger batch size for faster prompt processing
    n_gpu_layers=35,  # A Llama-3 8B has 32 transformer layers, so this offloads everything
    chat_format="llama-3",
    seed=42,
    f16_kv=True,  # FP16 key/value cache (newer llama-cpp-python uses type_k/type_v instead)
    logits_all=False,
    use_mmap=False,  # Disable memory mapping for faster loading
    tensor_split=None,  # Single GPU: no tensor splitting needed
)
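# Rough sizing check (assumption): Q8_0 weights for an 8B model are ~8.5 GB, and the
# FP16 KV cache at n_ctx=2048 adds a few hundred MB, so everything fits on one A100.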

# Optimize CUDA settings if available
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
    torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner
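# Note: these torch.backends flags tune PyTorch's own CUDA kernels; llama.cpp runs
# its own CUDA backend, so they only matter if torch-based ops are added to this app.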

# Greeting messages shown when a new conversation starts
GREETING_MESSAGES = [
    "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
    "Welcome to our cosmic journey! I am AstroSage. How may I assist you in understanding the universe?",
    "AstroSage here. Ready to explore the mysteries of space and time. How may I be of assistance?",
    "The universe awaits! I'm AstroSage. What astronomical wonders shall we discuss?",
]

def user(user_message, history):
    """Add user message to chat history."""
    if history is None:
        history = []
    return "", history + [{"role": "user", "content": user_message}]

@spaces.GPU
def bot(history):
    """Generate and stream the bot's response with optimized parameters."""
    if not history:
        history = []
    
    # Bound the prompt by trimming history: keep only the last 5 messages, a rough
    # proxy for reserving about half of the 2048-token context for the response
    recent_history = history[-5:]
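    # Token-accurate alternative (sketch, using the model's own tokenizer): drop the
    # oldest messages until the remaining history fits a fixed token budget.
    # def trim_to_token_budget(messages, budget=1024):
    #     while len(messages) > 1 and sum(
    #         len(llm.tokenize(m["content"].encode("utf-8"))) for m in messages
    #     ) > budget:
    #         messages = messages[1:]
    #     return messages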
    
    # Prepare the messages for the model
    messages = [
        {
            "role": "system",
            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
        }
    ]
    
    # Add the trimmed chat history (everything except the latest user message)
    for message in recent_history[:-1]:
        messages.append({"role": message["role"], "content": message["content"]})
    
    # Add the current user message
    messages.append({"role": "user", "content": history[-1]["content"]})
    
    # Start generating the response
    history.append({"role": "assistant", "content": ""})
    
    # Stream the completion with the chosen sampling parameters
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
        top_k=40,  # Add top-k sampling
        repeat_penalty=1.1,  # Slight penalty for repetition
        mirostat_mode=2,  # Enable Mirostat sampling
        mirostat_tau=5.0,
        mirostat_eta=0.1,
    )
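    # Note: with mirostat_mode=2 enabled, llama.cpp's sampler replaces the top_k/top_p
    # truncation steps, so those two settings have little or no effect here.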
    
    for chunk in response:
        if chunk and "content" in chunk["choices"][0]["delta"]:
            history[-1]["content"] += chunk["choices"][0]["delta"]["content"]
            yield history

def initial_greeting():
    """Return properly formatted initial greeting."""
    return [{"role": "assistant", "content": random.choice(GREETING_MESSAGES)}]

# Custom CSS for a space theme
custom_css = """
#component-0 {
    background-color: #1a1a2e;
    border-radius: 15px;
    padding: 20px;
}
.dark {
    background-color: #0f0f1a;
}
.contain {
    max-width: 1200px !important;
}
"""

# Create the Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
    gr.Markdown(
        """
        # 🌌 AstroSage: Your Cosmic AI Companion
        
        Welcome to AstroSage, an advanced AI assistant specializing in astronomy, astrophysics, and cosmology. 
        Powered by the AstroSage-8B model, I'm here to help you explore the wonders of the universe!
        
        ### What Can I Help You With?
        - πŸͺ Explanations of astronomical phenomena
        - πŸš€ Space exploration and missions
        - ⭐ Stars, galaxies, and cosmology
        - 🌍 Planetary science and exoplanets
        - πŸ“Š Astrophysics concepts and theories
        - πŸ”­ Astronomical instruments and observations
        
        Just type your question below and let's embark on a cosmic journey together!
        """
    )
    
    chatbot = gr.Chatbot(
        label="Chat with AstroSage",
        bubble_full_width=False,
        show_label=True,
        height=450,
        type="messages"
    )
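    # Note: bubble_full_width is deprecated in recent Gradio 4.x releases (and removed
    # in 5.x); dropping it does not change behavior there.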
    
    with gr.Row():
        msg = gr.Textbox(
            label="Type your message here",
            placeholder="Ask me anything about space and astronomy...",
            scale=9
        )
        clear = gr.Button("Clear Chat", scale=1)
    
    # Example questions for quick start
    gr.Examples(
        examples=[
            "What is a black hole and how does it form?",
            "Can you explain the life cycle of a star?",
            "What are exoplanets and how do we detect them?",
            "Tell me about the James Webb Space Telescope.",
            "What is dark matter and why is it important?"
        ],
        inputs=msg,
        label="Example Questions"
    )
    
    # Set up the message chain: clear the textbox immediately, then stream the bot reply
    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        chatbot,
        chatbot,
        queue=True  # Queue bot responses so streaming works under load
    )
    # Note: Gradio's batch=True expects a handler that takes and returns lists of
    # each argument; bot() streams a single conversation, so batching is not used here.
    
    # Clear button functionality
    clear.click(lambda: None, None, chatbot, queue=False)
    
    # Initial greeting
    demo.load(initial_greeting, None, chatbot, queue=False)

# Launch the app
if __name__ == "__main__":
    #demo.queue(default_concurrency_limit=2)  # Allow 2 concurrent requests (Gradio 4.x; 3.x used concurrency_count)
    demo.launch()