"""AstroSage: a Gradio chat UI over the AstroSage-8B GGUF model.

Downloads the quantized weights from the Hugging Face Hub, serves them with
llama-cpp-python, and streams replies into a Gradio Blocks chat interface
(Gradio "messages" history format throughout).
"""

import multiprocessing
import random

import gradio as gr
import spaces
import torch
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Use every available CPU core for llama.cpp's CPU-side work.
n_cores = multiprocessing.cpu_count()

# Download the quantized model weights (cached locally by huggingface_hub).
model_path = hf_hub_download(
    repo_id="AstroMLab/AstroSage-8B-GGUF",
    filename="AstroSage-8B-Q8_0.gguf",
)

# llama.cpp parameters tuned for an A100-class GPU.
# NOTE(fix): removed `use_gpu=True` — that is not a llama-cpp-python `Llama`
# parameter; GPU offload is controlled entirely by `n_gpu_layers`.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,            # keep the context window reasonable
    n_threads=n_cores,     # use all available CPU cores
    n_batch=512,           # larger batch for faster prompt processing
    n_gpu_layers=35,       # offload most layers to the GPU
    chat_format="llama-3",
    seed=42,
    f16_kv=True,           # FP16 key/value cache
    logits_all=False,
    use_mmap=False,        # disable memory mapping for faster loading
    tensor_split=None,     # let llama.cpp handle tensor splitting
)

# Enable CUDA fast paths when a GPU is present.
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True  # TF32 matmul on Ampere+
    torch.backends.cudnn.benchmark = True         # enable the cuDNN autotuner

# One of these is shown at random when a new chat starts.
GREETING_MESSAGES = [
    "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
    "Welcome to our cosmic journey! I am AstroSage. How may I assist you in understanding the universe?",
    "AstroSage here. Ready to explore the mysteries of space and time. How may I be of assistance?",
    "The universe awaits! I'm AstroSage. What astronomical wonders shall we discuss?",
]


def user(user_message, history):
    """Append the user's message to the chat history and clear the textbox.

    Args:
        user_message: Text the user just submitted.
        history: Chat history as a list of ``{"role", "content"}`` dicts
            (Gradio "messages" format), or ``None`` for a fresh chat.

    Returns:
        Tuple of ``("", new_history)`` — the empty string resets the input box.
    """
    if history is None:
        history = []
    return "", history + [{"role": "user", "content": user_message}]


@spaces.GPU
def bot(history):
    """Generate and stream the assistant's reply for the latest user message.

    Yields successive copies of ``history`` whose final assistant message
    grows as tokens stream in from the model.
    """
    if not history:
        history = []

    # Keep only the last few turns so the prompt stays well inside n_ctx.
    recent_history = history[-5:]

    messages = [
        {
            "role": "system",
            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy.",
        }
    ]

    # Replay prior turns — everything except the message we are answering now.
    for message in recent_history[:-1]:
        messages.append({"role": message["role"], "content": message["content"]})

    # The current user message is always the last entry in the full history.
    messages.append({"role": "user", "content": history[-1]["content"]})

    # Placeholder assistant message, filled in incrementally below.
    history.append({"role": "assistant", "content": ""})

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
        top_k=40,            # top-k sampling
        repeat_penalty=1.1,  # slight penalty for repetition
        mirostat_mode=2,     # Mirostat v2 sampling
        mirostat_tau=5.0,
        mirostat_eta=0.1,
    )

    for chunk in response:
        if not chunk:
            continue
        delta = chunk["choices"][0]["delta"]
        # Only update the UI when the chunk actually carries new text.
        if "content" in delta:
            history[-1]["content"] += delta["content"]
            yield history


def initial_greeting():
    """Return a one-message history containing a random assistant greeting."""
    return [{"role": "assistant", "content": random.choice(GREETING_MESSAGES)}]


# Custom CSS for a space theme.
custom_css = """
#component-0 {
    background-color: #1a1a2e;
    border-radius: 15px;
    padding: 20px;
}
.dark {
    background-color: #0f0f1a;
}
.contain {
    max-width: 1200px !important;
}
"""

# Build the Gradio interface.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
    gr.Markdown(
        """
        # 🌌 AstroSage: Your Cosmic AI Companion

        Welcome to AstroSage, an advanced AI assistant specializing in astronomy,
        astrophysics, and cosmology. Powered by the AstroSage-8B model, I'm here
        to help you explore the wonders of the universe!

        ### What Can I Help You With?
        - 🪐 Explanations of astronomical phenomena
        - 🚀 Space exploration and missions
        - ⭐ Stars, galaxies, and cosmology
        - 🌍 Planetary science and exoplanets
        - 📊 Astrophysics concepts and theories
        - 🔭 Astronomical instruments and observations

        Just type your question below and let's embark on a cosmic journey together!
        """
    )

    chatbot = gr.Chatbot(
        label="Chat with AstroSage",
        bubble_full_width=False,
        show_label=True,
        height=450,
        type="messages",
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Type your message here",
            placeholder="Ask me anything about space and astronomy...",
            scale=9,
        )
        clear = gr.Button("Clear Chat", scale=1)

    # Example questions for a quick start.
    gr.Examples(
        examples=[
            "What is a black hole and how does it form?",
            "Can you explain the life cycle of a star?",
            "What are exoplanets and how do we detect them?",
            "Tell me about the James Webb Space Telescope.",
            "What is dark matter and why is it important?",
        ],
        inputs=msg,
        label="Example Questions",
    )

    # user() echoes instantly (unqueued); bot() streams through the queue.
    # NOTE(fix): removed `batch=True, max_batch_size=4` — bot() is not written
    # as a Gradio batch function (it operates on a single history, not a list
    # of histories) and would crash under batching. Parallelism is provided by
    # the queue's concurrency limit below.
    msg.submit(
        user, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot, chatbot, chatbot, queue=True
    )

    # Clear button: returning None resets the Chatbot component.
    clear.click(lambda: None, None, chatbot, queue=False)

    # Seed a new session with a greeting.
    demo.load(initial_greeting, None, chatbot, queue=False)

# Launch the app.
if __name__ == "__main__":
    # NOTE(fix): `concurrency_count` was removed in Gradio 4 (which is required
    # for Chatbot(type="messages")); the modern equivalent of "allow 2
    # concurrent requests" is `default_concurrency_limit`.
    demo.queue(default_concurrency_limit=2)
    demo.launch()