Spaces:

AstroMLab
/

AstroSage-8B

Runtime error

App Files Files Community

Tijmen2 committed on Nov 18, 2024

Commit

fec3834

verified ·

1 Parent(s): 9780084

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -40

app.py CHANGED Viewed

@@ -1,17 +1,20 @@
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
-import torch
 import random
-model_name = "AstroMLab/AstroSage-8B"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-streamer = TextStreamer(tokenizer)
-# Load the model with 8-bit quantization using bitsandbytes
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.bfloat16,
-    load_in_8bit=True,
 )
 # Placeholder responses for when context is empty
@@ -30,43 +33,42 @@ def user(user_message, history):
 @spaces.GPU(duration=20)
 def bot(history):
-    """Generate the chatbot response."""
     if not history:
         history = []
-    # Prepare input prompt for the model
-    system_prompt = (
-        "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and cosmology. "
-        "Provide accurate, scientific information while making complex concepts accessible. "
-        "You're enthusiastic about space exploration and maintain a sense of wonder about the cosmos."
-    )
-    # Construct the chat history as a single input string
-    prompt = system_prompt + "\n\n"
-    for message in history:
-        if message["role"] == "user":
-            prompt += f"User: {message['content']}\n"
-        else:
-            prompt += f"AstroSage: {message['content']}\n"
-    prompt += "AstroSage: "
-    # Generate response
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=512,
         temperature=0.7,
         top_p=0.95,
-        do_sample=True,
-        streamer=streamer
     )
-    # Decode the generated output and update history
-    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    response_text = response_text[len(prompt):].strip()
-    history.append({"role": "assistant", "content": response_text})
-    yield history
 def initial_greeting():
     """Return properly formatted initial greeting."""

 import spaces
 import gradio as gr
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
 import random
+# Fetch the Q8_0 GGUF weights file from the Hub repo; the returned value is
+# passed to Llama below as the path of the model file to load.
+model_path = hf_hub_download(
+    repo_id="AstroMLab/AstroSage-8B-GGUF",
+    filename="AstroSage-8B-Q8_0.gguf"
+)
+# Build the llama.cpp model once at module import so bot() can reuse it.
+# NOTE(review): this runs at import, outside any @spaces.GPU-decorated
+# function -- confirm a GPU is actually available at this point.
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2048,
+    chat_format="llama-3",
+    n_gpu_layers=-1,  # ensure all layers are on GPU
+    flash_attn=True,
 )
 # Placeholder responses for when context is empty
 @spaces.GPU(duration=20)
 def bot(history):
+    """Stream the assistant's reply to the most recent message in ``history``.
+
+    Args:
+        history: Chat transcript as a list of ``{"role", "content"}`` dicts
+            whose last entry is the message to answer. ``None`` or an empty
+            list is accepted.
+
+    Yields:
+        The same ``history`` list after each streamed chunk, with the text
+        generated so far accumulated in a trailing assistant entry. For an
+        empty transcript a single empty list is yielded and nothing is
+        generated.
+    """
     if not history:
+        # There is no message to answer: indexing history[-1] below would
+        # raise IndexError, so hand back an empty transcript and stop.
+        yield []
+        return
+    # The system prompt always leads the conversation sent to the model.
+    messages = [
+        {
+            "role": "system",
+            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and cosmology. Provide accurate, scientific information while making complex concepts accessible. You're enthusiastic about space exploration and maintain a sense of wonder about the cosmos."
+        }
+    ]
+    # Earlier turns are forwarded with their recorded roles.
+    messages.extend(
+        {"role": message["role"], "content": message["content"]}
+        for message in history[:-1]
+    )
+    # The final entry is the message being answered; it is always sent as
+    # the user turn regardless of its recorded role.
+    messages.append({"role": "user", "content": history[-1]["content"]})
+    # Placeholder assistant entry that the streamed text is appended to.
+    history.append({"role": "assistant", "content": ""})
+    response = llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
         temperature=0.7,
         top_p=0.95,
+        stream=True,
     )
+    for chunk in response:
+        if not chunk:
+            continue
+        # A delta may have no "content" key; only text-bearing chunks
+        # update the transcript and trigger a UI refresh.
+        piece = chunk["choices"][0]["delta"].get("content")
+        if piece is None:
+            continue
+        history[-1]["content"] += piece
+        yield history
 def initial_greeting():
     """Return properly formatted initial greeting."""