import gradio as gr
from unsloth import FastLanguageModel
from transformers import AutoTokenizer
import torch
# Load the model and tokenizer
model_name_or_path = "michailroussos/model_llama_8d"
max_seq_length = 2048
dtype = None
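# dtype=None lets Unsloth auto-detect the best precision for the GPU
# (bfloat16 on newer cards, float16 otherwise)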
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,  # 4-bit quantization keeps the model within a single small GPU
)
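# Switch Unsloth into its optimized inference mode (no gradients, faster generation)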
FastLanguageModel.for_inference(model)
print("Model loaded successfully!")
# Define response function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
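    # gr.ChatInterface calls this as respond(message, history, *additional_inputs);
    # the extra arguments arrive in the same order as the widgets declared below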
    try:
        # Debug: print inputs
        print("\n[DEBUG] Incoming user message:", message)
        print("[DEBUG] Chat history before appending:", history)

        # Prepare messages: system prompt, then alternating user/assistant turns
        messages = [{"role": "system", "content": system_message}]
        for user, assistant in history:
            if user:
                messages.append({"role": "user", "content": user})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
        messages.append({"role": "user", "content": message})

        # Debug: print prepared messages
        print("[DEBUG] Prepared messages:", messages)
        # Tokenize with the model's chat template; return_dict=True is required
        # so the inputs["input_ids"] / inputs["attention_mask"] lookups below work
        # (without it, apply_chat_template returns a bare tensor)
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to("cuda")

        # Debug: print tokenized inputs
        print("[DEBUG] Tokenized inputs:", inputs)
        # Generate the response; do_sample=True is needed for
        # temperature/top_p to have any effect
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            use_cache=True,
        )
        # Decode only the newly generated tokens; decoding all of output_ids
        # would echo the prompt (system message and history) back to the user
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(
            output_ids[0][prompt_length:], skip_special_tokens=True
        ).strip()
        print("[DEBUG] Decoded response:", response)

        # gr.ChatInterface tracks the history itself, so return only the reply
        return response
    except Exception as e:
        print("[ERROR] Exception in respond function:", str(e))
        return f"Error: {str(e)}"
# Create ChatInterface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
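# Optional: demo.queue() serializes concurrent requests so simultaneous
# users do not contend for the GPU during generate()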
# Launch the app
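# share=True requests a temporary public gradio.live URL, handy for local
# runs; a hosted Space is already reachable without it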
if __name__ == "__main__":
    demo.launch(share=True)