import gradio as gr
from unsloth import FastLanguageModel
import torch
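# NOTE: unsloth and 4-bit loading rely on a CUDA GPU (via bitsandbytes);
# on a CPU-only machine the import or the model load below is expected to fail.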
# Load the model and tokenizer locally
max_seq_length = 2048
dtype = None  # None lets unsloth auto-detect (float16 or bfloat16)
model_name_or_path = "michailroussos/model_llama_8d"

# Load model and tokenizer using unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model) # Enable optimized inference
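# load_in_4bit quantizes the weights via bitsandbytes, cutting VRAM use
# roughly 4x versus fp16; for_inference (above) switches on unsloth's
# faster generation path.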
# Define the response function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Combine system message and conversation history
    messages = [{"role": "system", "content": system_message}]
    # With type="messages", Gradio passes history as a list of
    # {"role": ..., "content": ...} dicts, not (user, assistant) tuples
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": message})

    # Tokenize inputs
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    # A single unpadded sequence, so the attention mask is all ones
    attention_mask = torch.ones_like(inputs)

    # Generate response tokens
    generated_tokens = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=max_tokens,
        use_cache=True,
        do_sample=True,  # required for temperature/top_p to take effect
        temperature=temperature,
        top_p=top_p,
    )

    # Decode only the newly generated tokens, slicing off the echoed prompt
    response = tokenizer.decode(generated_tokens[0][inputs.shape[-1]:], skip_special_tokens=True)

    # Yield the reply; ChatInterface renders a plain string as the assistant message
    yield response
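# Optional: a streaming variant (a sketch, not part of the original app). It
# assumes transformers' TextIteratorStreamer plus a background thread so that
# partial text can be yielded while generation runs; pass it to
# gr.ChatInterface in place of `respond` to stream tokens instead of waiting
# for the full reply.
from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    # skip_prompt drops the echoed input; the streamer yields decoded text chunks
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs,
            attention_mask=torch.ones_like(inputs),
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            use_cache=True,
        ),
    ).start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # ChatInterface re-renders the growing assistant reply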
# Define the Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
    type="messages",  # history and replies use OpenAI-style role/content dicts
)
if __name__ == "__main__":
    demo.launch(share=True)
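# Note: when hosted on Hugging Face Spaces, share=True has no effect (the app
# is already publicly served); the flag only matters when running this script
# locally.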