import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import random
import spaces
import torch
# Get the number of available CPU cores
import multiprocessing
n_cores = multiprocessing.cpu_count()
# Initialize model with optimized parameters
model_path = hf_hub_download(
    repo_id="AstroMLab/AstroSage-8B-GGUF",
    filename="AstroSage-8B-Q8_0.gguf"
)
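# hf_hub_download caches the GGUF file locally (under ~/.cache/huggingface by default),
# so the multi-gigabyte download only happens on the first startup.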
# Optimized LLaMA parameters for A100
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Keep context window reasonable
    n_threads=n_cores,  # Use all available CPU cores
    n_batch=512,  # Increase batch size for faster processing
    n_gpu_layers=35,  # Offload more layers to GPU
    chat_format="llama-3",
    seed=42,
    f16_kv=True,  # Use FP16 for key/value cache
    logits_all=False,
    use_mmap=False,  # Disable memory mapping for faster loading
    use_gpu=True,
    tensor_split=None,  # Let the model handle tensor splitting
)
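# Note: keyword support varies across llama-cpp-python releases; arguments the
# installed version does not recognize (e.g. use_gpu) may simply be ignored.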
# Optimize CUDA settings if available
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
    torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner
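# These torch flags only affect PyTorch operations; GPU offload for the llama.cpp
# backend itself is controlled by the n_gpu_layers setting above.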
# Placeholder responses for when context is empty
GREETING_MESSAGES = [
"Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
"Welcome to our cosmic journey! I am AstroSage. How may I assist you in understanding the universe?",
"AstroSage here. Ready to explore the mysteries of space and time. How may I be of assistance?",
"The universe awaits! I'm AstroSage. What astronomical wonders shall we discuss?",
]
def user(user_message, history):
"""Add user message to chat history."""
if history is None:
history = []
return "", history + [{"role": "user", "content": user_message}]
@spaces.GPU
def bot(history):
"""Generate and stream the bot's response with optimized parameters."""
if not history:
history = []
# Optimize context by limiting history
    max_history_tokens = 1024  # Reserve half of the context for the response (informational only; history is trimmed by message count below)
    recent_history = history[-5:]  # Keep only last 5 messages for context
    # Prepare the messages for the model
    messages = [
        {
            "role": "system",
            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
        }
    ]
    # Add optimized chat history
    for message in recent_history[:-1]:
        messages.append({"role": message["role"], "content": message["content"]})
    # Add the current user message
    messages.append({"role": "user", "content": history[-1]["content"]})
    # Start generating the response
    history.append({"role": "assistant", "content": ""})
    # Optimized streaming parameters
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
        top_k=40,  # Add top-k sampling
        repeat_penalty=1.1,  # Slight penalty for repetition
        mirostat_mode=2,  # Enable Mirostat sampling
        mirostat_tau=5.0,
        mirostat_eta=0.1,
    )
    for chunk in response:
        if chunk and "content" in chunk["choices"][0]["delta"]:
            history[-1]["content"] += chunk["choices"][0]["delta"]["content"]
            yield history
def initial_greeting():
"""Return properly formatted initial greeting."""
return [{"role": "assistant", "content": random.choice(GREETING_MESSAGES)}]
# Custom CSS for a space theme
custom_css = """
#component-0 {
    background-color: #1a1a2e;
    border-radius: 15px;
    padding: 20px;
}
.dark {
    background-color: #0f0f1a;
}
.contain {
    max-width: 1200px !important;
}
"""
# Create the Gradio interface with optimized queue settings
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
    gr.Markdown(
        """
        # AstroSage: Your Cosmic AI Companion

        Welcome to AstroSage, an advanced AI assistant specializing in astronomy, astrophysics, and cosmology.
        Powered by the AstroSage-8B model, I'm here to help you explore the wonders of the universe!

        ### What Can I Help You With?
        - Explanations of astronomical phenomena
        - Space exploration and missions
        - Stars, galaxies, and cosmology
        - Planetary science and exoplanets
        - Astrophysics concepts and theories
        - Astronomical instruments and observations

        Just type your question below and let's embark on a cosmic journey together!
        """
    )
    chatbot = gr.Chatbot(
        label="Chat with AstroSage",
        bubble_full_width=False,
        show_label=True,
        height=450,
        type="messages"
    )
    with gr.Row():
        msg = gr.Textbox(
            label="Type your message here",
            placeholder="Ask me anything about space and astronomy...",
            scale=9
        )
        clear = gr.Button("Clear Chat", scale=1)
    # Example questions for quick start
    gr.Examples(
        examples=[
            "What is a black hole and how does it form?",
            "Can you explain the life cycle of a star?",
            "What are exoplanets and how do we detect them?",
            "Tell me about the James Webb Space Telescope.",
            "What is dark matter and why is it important?"
        ],
        inputs=msg,
        label="Example Questions"
    )
    # Set up the message chain with optimized queuing
    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        chatbot,
        chatbot,
        queue=True  # Enable queuing for bot responses; bot() streams a single conversation, so no request batching here
    )
    # Clear button functionality
    clear.click(lambda: None, None, chatbot, queue=False)
    # Initial greeting
    demo.load(initial_greeting, None, chatbot, queue=False)
# Launch the app with optimized settings
if __name__ == "__main__":
    # Gradio 4+ replaced queue(concurrency_count=...) with default_concurrency_limit
    demo.queue(default_concurrency_limit=2)  # Allow 2 concurrent requests
    demo.launch()
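# To run outside Hugging Face Spaces, the `spaces` import and the @spaces.GPU decorator
# can be removed (or the `spaces` package installed); the rest of the app only needs
# gradio, llama-cpp-python, torch, and huggingface_hub.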