import gradio as gr
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer in 4-bit via Unsloth
model_name_or_path = "michailroussos/model_llama_8d"
max_seq_length = 2048
dtype = None  # None lets Unsloth auto-detect (float16 on older GPUs, bfloat16 on Ampere+)

print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's optimized inference path
print("Model loaded successfully!")

# Define response function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    try:
        # Debug: Print inputs
        print("\n[DEBUG] Incoming user message:", message)
        print("[DEBUG] Chat history before appending:", history)

        # Prepare messages
        messages = [{"role": "system", "content": system_message}]
        for user, assistant in history:
            if user:
                messages.append({"role": "user", "content": user})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})
        messages.append({"role": "user", "content": message})

        # Debug: Print prepared messages
        print("[DEBUG] Prepared messages:", messages)

        # Tokenize via the chat template; return_dict=True gives input_ids and attention_mask
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to("cuda")

        # Debug: Print tokenized inputs
        print("[DEBUG] Tokenized inputs:", inputs)

        # Generate response (do_sample=True so temperature and top_p take effect)
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            use_cache=True,
        )

        # Decode only the newly generated tokens, skipping the prompt portion
        new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        print("[DEBUG] Decoded response:", response)

        # gr.ChatInterface manages the chat history itself, so only the reply is returned
        return response

    except Exception as e:
        print("[ERROR] Exception in respond function:", str(e))
        return f"Error: {str(e)}", history


# Create ChatInterface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
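
# The additional_inputs above are passed positionally to respond() after
# (message, history): system_message, max_tokens, temperature, top_p.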

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)