import gradio as gr
from unsloth import FastLanguageModel
import torch
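# NOTE: unsloth and 4-bit loading rely on a CUDA GPU (via bitsandbytes);
# on a CPU-only machine the import or the model load below is expected to fail.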
# Load the model and tokenizer locally
max_seq_length = 2048
dtype = None  # None lets unsloth auto-detect (float16 or bfloat16)
model_name_or_path = "michailroussos/model_llama_8d"

# Load model and tokenizer using unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model) # Enable optimized inference
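# load_in_4bit quantizes the weights via bitsandbytes, cutting VRAM use
# roughly 4x versus fp16; for_inference (above) switches on unsloth's
# faster generation path.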
# Define the response function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Combine system message and conversation history
    messages = [{"role": "system", "content": system_message}]
    # With type="messages", Gradio passes history as a list of
    # {"role": ..., "content": ...} dicts, not (user, assistant) tuples
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": message})

    # Tokenize inputs
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    # A single unpadded sequence, so the attention mask is all ones
    attention_mask = torch.ones_like(inputs)

    # Generate response tokens
    generated_tokens = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_tokens,
        use_cache=True,
        do_sample=True,  # required for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,
    )

    # Decode only the newly generated tokens, slicing off the echoed prompt
    response = tokenizer.decode(generated_tokens[0][inputs.shape[-1]:], skip_special_tokens=True)

    # Yield the reply; ChatInterface renders a plain string as the assistant message
    yield response
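# Optional: a streaming variant (a sketch, not part of the original app). It
# assumes transformers' TextIteratorStreamer plus a background thread so that
# partial text can be yielded while generation runs; pass it to
# gr.ChatInterface in place of `respond` to stream tokens instead of waiting
# for the full reply.
from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    # skip_prompt drops the echoed input; the streamer yields decoded text chunks
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs,
            attention_mask=torch.ones_like(inputs),
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            use_cache=True,
        ),
    ).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # ChatInterface re-renders the growing assistant reply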
# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    type="messages",  # history and replies use OpenAI-style role/content dicts
)
if __name__ == "__main__":
    demo.launch(share=True)
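# Note: when hosted on Hugging Face Spaces, share=True has no effect (the app
# is already publicly served); the flag only matters when running this script
# locally.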