import gradio as gr
from unsloth import FastLanguageModel

# Define constants
max_seq_length = 2048
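# dtype=None lets Unsloth auto-detect the compute dtype (float16 on older GPUs, bfloat16 on Ampere+)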
dtype = None
model_name_or_path = "michailroussos/model_llama_8d"

# Load the model and tokenizer
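# load_in_4bit=True loads 4-bit quantized weights (bitsandbytes) to reduce VRAM usage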
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)

# Switch the model into Unsloth's faster inference mode
FastLanguageModel.for_inference(model)

# Respond to one chat turn. gr.ChatInterface calls this with the new message and
# the running history as a list of (user, assistant) pairs, and expects the
# assistant's reply string back.
def chat_with_model(user_message, chat_history):
    try:
        # Rebuild the full conversation so the model sees earlier turns
        messages = []
        for user_turn, assistant_turn in chat_history or []:
            messages.append({"role": "user", "content": user_turn})
            messages.append({"role": "assistant", "content": assistant_turn})
        messages.append({"role": "user", "content": user_message})

        # Tokenize with the model's chat template; return_dict=True returns both
        # input_ids and attention_mask (plain tokenize=True would return only a tensor)
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to("cuda")

        # Generate the response
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            use_cache=True,
            do_sample=True,  # sampling must be enabled for temperature/min_p to apply
            temperature=1.5,
            min_p=0.1,
        )

        # Decode only the newly generated tokens, not the echoed prompt
        prompt_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error: {e}"

# Create the chat interface
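# gr.ChatInterface manages the Chatbot state itself: it passes (message, history)
# to fn and appends fn's returned string as the assistant's turn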
demo = gr.ChatInterface(
    fn=chat_with_model,
    chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
    title="Hugging Face Chat Model",
    description="Chat with a Hugging Face model using FastLanguageModel.",
)

# Launch the app
if __name__ == "__main__":
    demo.launch()