import gradio as gr
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer locally
max_seq_length = 2048
model_name_or_path = "michailroussos/model_llama_8d"

# Load model and tokenizer using unsloth
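# Note: load_in_4bit=True assumes a CUDA-capable GPU with bitsandbytes installed;
# set it to False to load the model in 16-bit precision instead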
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # Enable optimized inference

# Define the response function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    print("\n" + "="*50)
    print("===== RESPOND FUNCTION CALLED =====")
    print("="*50)
    
    # Print input parameters
    print(f"Input Message: {message}")
    print(f"System Message: {system_message}")
    print(f"Max Tokens: {max_tokens}")
    print(f"Temperature: {temperature}")
    print(f"Top-p: {top_p}")
    
    # Debug history
    print("\n--- Current History ---")
    print(f"History Type: {type(history)}")
    print(f"History Content: {history}")
    
    # Prepare the messages for the model, starting with the system prompt
    messages = [{"role": "system", "content": system_message}]
    try:
        # With type="messages", history arrives as a list of
        # {"role": ..., "content": ...} dicts, so its entries can be reused as-is
        if history:
            print("\n--- Processing Existing History ---")
            for entry in history:
                messages.append({"role": entry["role"], "content": entry["content"]})
        
        # Add the current user message
        print("\n--- Adding Current Message ---")
        messages.append({"role": "user", "content": message})
        
        # Debug messages before tokenization
        print("\n--- Messages Before Tokenization ---")
        for msg in messages:
            print(f"Role: {msg['role']}, Content: {msg['content']}")
        
        # Tokenize the input
        print("\n--- Tokenizing Input ---")
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        
        print(f"Tokenized Inputs Shape: {inputs.shape}")
        print(f"Tokenized Inputs Device: {inputs.device}")
        
        # Generate the response. The prompt is a single unpadded sequence, so an
        # all-ones attention mask is correct and also avoids errors when the
        # tokenizer has no pad_token_id set
        attention_mask = torch.ones_like(inputs)
        
        try:
            generated_tokens = model.generate(
                input_ids=inputs,
                attention_mask=attention_mask,
                max_new_tokens=max_tokens,
                use_cache=True,
                do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
                temperature=temperature,
                top_p=top_p,
            )
            
            # Decode only the newly generated tokens (the prompt occupies the
            # first inputs.shape[-1] positions of the output sequence)
            new_tokens = generated_tokens[0][inputs.shape[-1]:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print("\n--- Generated Response ---")
            print(f"Raw Response: {response}")
            
            # Check and filter response
            #if "system" in response.lower():
            #    print("WARNING: System message detected in response")
            #    response = "Hello! How can I assist you today?"
            
            # gr.ChatInterface manages the conversation history itself, so only
            # the new assistant reply should be returned; returning the full
            # history would duplicate earlier turns in the chat window
            print("\n--- Returning Response ---")
            print(f"Response Preview: {response[:100]}...")
            return response
        
        except Exception as gen_error:
            print("\n--- GENERATION ERROR ---")
            print(f"Error during model generation: {gen_error}")
            return "Sorry, an error occurred while generating the response."
    
    except Exception as prep_error:
        print("\n--- PREPARATION ERROR ---")
        print(f"Error during message preparation: {prep_error}")
        return "Sorry, an error occurred while preparing the conversation."

# Define the Gradio interface
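# ChatInterface passes the value of each additional_input to respond() after
# (message, history), in the order listed below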
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    type="messages"  # Explicitly set to messages type
)

if __name__ == "__main__":
    demo.launch(share=False)  # Use share=False for local testing