import gradio as gr
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer locally
max_seq_length = 2048
model_name_or_path = "michailroussos/model_llama_8d"

# Load model and tokenizer with Unsloth (4-bit quantization reduces VRAM usage)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # Enable optimized inference

# Define the response function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # With type="messages", Gradio passes history as a list of
    # {"role": ..., "content": ...} dicts, matching the chat-template format.
    messages = [{"role": "system", "content": system_message}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": message})
    
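    # apply_chat_template renders the conversation into the model's prompt
    # format; with tokenize=True and return_tensors="pt" it returns the
    # token ids as a [1, seq_len] tensor.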
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    
    # A single-sequence batch has no padding (and Llama tokenizers often have
    # no pad token), so the attention mask is simply all ones.
    attention_mask = torch.ones_like(inputs)
    
    generated_tokens = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_tokens,
        use_cache=True,
        do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,
    )
    # Decode only the newly generated tokens; the prompt occupies the first
    # inputs.shape[1] positions of the output sequence.
    response = tokenizer.decode(
        generated_tokens[0][inputs.shape[1]:], skip_special_tokens=True
    )

    # ChatInterface manages the history itself; return just the reply text.
    return response

# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    type="messages",
)

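# Run with `python app.py`; Gradio serves the UI at http://localhost:7860 by default.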
if __name__ == "__main__":
    demo.launch(share=False)  # Set share=False for local testing