import gradio as gr
from unsloth import FastLanguageModel
import torch
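
# Assumed dependencies: `gradio`, `torch`, and `unsloth` (e.g. `pip install unsloth gradio`);
# Unsloth's 4-bit loading expects a CUDA-capable NVIDIA GPU.
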
# Load the model and tokenizer locally with Unsloth
max_seq_length = 2048
model_name_or_path = "unsloth/Llama-3.2-3B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # 4-bit quantization to cut GPU memory usage
)
FastLanguageModel.for_inference(model)  # Enable Unsloth's optimized inference mode
# Define the response function used by the Gradio ChatInterface
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Build the conversation: system prompt, previous turns, then the new user message.
    # With type="messages", Gradio passes history as a list of
    # {"role": ..., "content": ...} dicts, so it can be reused directly.
    messages = [{"role": "system", "content": system_message}]
    if history:
        messages.extend(history)
    # Add the user's new message to the list
    messages.append({"role": "user", "content": message})
    # Apply the chat template and tokenize the prompt
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")
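
    # For reference, the Llama-3-family chat template renders the messages roughly as
    # (illustrative only; the exact special tokens come from the tokenizer's own template):
    #   <|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|>
    #   <|start_header_id|>user<|end_header_id|>\n\n{user message}<|eot_id|>
    #   <|start_header_id|>assistant<|end_header_id|>\n\n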
    # Single unpadded prompt, so the attention mask is all ones
    attention_mask = torch.ones_like(inputs)
    # Generate the response; do_sample=True is needed for temperature/top_p to take effect
    generated_tokens = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_tokens,
        use_cache=True,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    # Decode only the newly generated tokens, skipping the prompt
    response = tokenizer.decode(
        generated_tokens[0][inputs.shape[-1]:], skip_special_tokens=True
    )
    # ChatInterface manages the chat history itself, so only the new reply is returned
    return response
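
# Note: ChatInterface passes the value of each component in additional_inputs to
# `respond` as extra positional arguments after (message, history), in the order listed.
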
# Define the Gradio interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    type="messages",
)
if __name__ == "__main__":
    demo.launch(share=False)  # Use share=False for local testing