"""Gradio chat front-end for the AstroSage-8B GGUF model run through llama.cpp."""

import random
from typing import Iterator, List, Optional

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# One chatbot row is [user_text, assistant_text]; either side may be None.
ChatHistory = List[List[Optional[str]]]

# Initialize model
model_path = hf_hub_download(
    repo_id="AstroMLab/AstroSage-8B-GGUF",
    filename="AstroSage-8B-Q8_0.gguf",
)

llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=4,
    chat_format="llama-3",
    seed=42,
    f16_kv=True,
    logits_all=False,
    use_mmap=True,
    # llama-cpp-python has no `use_gpu` argument; GPU offload is requested
    # through `n_gpu_layers` (-1 asks for every layer to be offloaded).
    n_gpu_layers=-1,
)

# Placeholder responses for when context is empty
GREETING_MESSAGES = [
    "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
    "Welcome to our cosmic journey! I am AstroSage. How may I assist you in understanding the universe?",
    "AstroSage here. Ready to explore the mysteries of space and time. How may I be of assistance?",
    "The universe awaits! I'm AstroSage. What astronomical wonders shall we discuss?",
]

SYSTEM_MESSAGE = (
    "Assume the role of AstroSage, a helpful chatbot designed to answer user "
    "queries about astronomy, astrophysics, and cosmology."
)


def respond_stream(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message. A falsy value ends the generator
            without yielding anything.
        history: Earlier turns as ``(user, assistant)`` pairs. Either side of
            a pair may be ``None``/empty (e.g. the greeting row has no user
            text); such sides are left out of the prompt.

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text. If generation raises, a single
        ``"Error during generation: ..."`` string is yielded instead.
    """
    if not message:
        return

    messages = [{"role": "system", "content": SYSTEM_MESSAGE}]
    for user, assistant in history:
        # The greeting row is [None, greeting]; never send a None user turn.
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    try:
        past_tokens = ""  # Everything generated so far.
        for chunk in llm.create_chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stream=True,
        ):
            delta = chunk["choices"][0]["delta"]
            # Some chunks carry only a role, or no/empty content; skip those.
            new_tokens = delta.get("content")
            if new_tokens:
                past_tokens += new_tokens
                # Yield the whole reply so the UI can redraw it in place.
                yield past_tokens
    except Exception as e:  # UI boundary: show the failure instead of crashing.
        yield f"Error during generation: {e}"


def _add_user_message(message: Optional[str], history: Optional[ChatHistory]):
    """Append the submitted text as a pending row and clear the input box."""
    history = list(history or [])
    if message and message.strip():
        history.append([message, None])
    return "", history


def _stream_bot_reply(history: Optional[ChatHistory]) -> Iterator[ChatHistory]:
    """Fill in the assistant side of the last pending row as tokens arrive."""
    history = list(history or [])
    # Nothing to answer: no rows, a row without user text, or already answered.
    if not history or not history[-1][0] or history[-1][1] is not None:
        yield history
        return

    message = history[-1][0]
    earlier_turns = history[:-1]
    for partial in respond_stream(message, earlier_turns):
        # Replace the row rather than mutate it, so tuple rows also work.
        history[-1] = [message, partial]
        yield history


initial_message = random.choice(GREETING_MESSAGES)

# `.style()` is deprecated in Gradio 3.x and removed in 4.x; the height is a
# constructor argument.
chatbot = gr.Chatbot([[None, initial_message]], height=750)

with gr.Blocks() as demo:
    with gr.Row():
        # Integer scales with the same 80/20 ratio the layout originally used.
        with gr.Column(scale=4):
            chatbot.render()
            msg = gr.Textbox(
                show_label=False,
                placeholder="Ask AstroSage about the cosmos...",
            )
        with gr.Column(scale=1):
            clear = gr.Button("Clear")

    # Show the user's message immediately, then stream the model's reply.
    msg.submit(
        _add_user_message, [msg, chatbot], [msg, chatbot], queue=False
    ).then(_stream_bot_reply, chatbot, chatbot)
    clear.click(lambda: [], None, chatbot, queue=False)

demo.queue().launch()