michailroussos committed on
Commit 99b9339 · 1 Parent(s): be78dc3
Files changed (1)
  1. app.py +97 -61
app.py CHANGED
@@ -16,72 +16,107 @@ FastLanguageModel.for_inference(model) # Enable optimized inference
 
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Print the inputs at the start
-    print("===== Respond Function Called =====")
-    print(f"Received message: {message}")
-    print(f"Current history: {history}")
+    # Extensive debugging print statements
+    print("\n" + "="*50)
+    print("===== RESPOND FUNCTION CALLED =====")
+    print("="*50)
+
+    # Print input parameters
+    print(f"Input Message: {message}")
+    print(f"System Message: {system_message}")
+    print(f"Max Tokens: {max_tokens}")
+    print(f"Temperature: {temperature}")
+    print(f"Top-p: {top_p}")
+
+    # Debug history
+    print("\n--- Current History ---")
+    print(f"History Type: {type(history)}")
+    print(f"History Content: {history}")
 
     # Prepare the messages for the model
     messages = []
-    if history:
-        print("Adding previous messages to the history...")
-        for entry in history:
-            messages.append({"role": "user", "content": entry[0]})
-            messages.append({"role": "assistant", "content": entry[1]})
-
-    # Add the current user message
-    print(f"Adding current user message: {message}")
-    messages.append({"role": "user", "content": message})
-
-    # Print the messages list before tokenization
-    print("Messages before tokenization:", messages)
-
-    # Tokenize the input (prepare the data for the model)
-    print("Preparing the input for the model...")
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_tensors="pt",
-    ).to("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Print the tokenized inputs
-    print(f"Tokenized inputs: {inputs}")
-
-    # Generate the response
-    attention_mask = inputs.ne(tokenizer.pad_token_id).long()
-    print(f"Attention mask: {attention_mask}")
-
     try:
-        generated_tokens = model.generate(
-            input_ids=inputs,
-            attention_mask=attention_mask,
-            max_new_tokens=max_tokens,
-            use_cache=True,
-            temperature=temperature,
-            top_p=top_p,
-        )
-    except Exception as e:
-        print(f"Error during model generation: {e}")
-        return []
-
-    # Decode the generated response
-    response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-    print(f"Generated response: {response}")
+        if history:
+            print("\n--- Processing Existing History ---")
+            for entry in history:
+                print(f"Processing entry: {entry}")
+                # Ensure entry is a dictionary with 'user' and 'assistant' keys
+                if isinstance(entry, dict):
+                    messages.append({"role": "user", "content": entry.get('user', '')})
+                    messages.append({"role": "assistant", "content": entry.get('assistant', '')})
+                elif isinstance(entry, list) and len(entry) == 2:
+                    # Handle case where history might be a list of [user, assistant]
+                    messages.append({"role": "user", "content": entry[0]})
+                    messages.append({"role": "assistant", "content": entry[1]})
+                else:
+                    print(f"WARNING: Unexpected history entry format: {entry}")
+
+        # Add the current user message
+        print("\n--- Adding Current Message ---")
+        messages.append({"role": "user", "content": message})
+
+        # Debug messages before tokenization
+        print("\n--- Messages Before Tokenization ---")
+        for msg in messages:
+            print(f"Role: {msg['role']}, Content: {msg['content']}")
+
+        # Tokenize the input
+        print("\n--- Tokenizing Input ---")
+        inputs = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+        ).to("cuda" if torch.cuda.is_available() else "cpu")
+
+        print(f"Tokenized Inputs Shape: {inputs.shape}")
+        print(f"Tokenized Inputs Device: {inputs.device}")
+
+        # Generate response
+        attention_mask = inputs.ne(tokenizer.pad_token_id).long()
+
+        try:
+            generated_tokens = model.generate(
+                input_ids=inputs,
+                attention_mask=attention_mask,
+                max_new_tokens=max_tokens,
+                use_cache=True,
+                temperature=temperature,
+                top_p=top_p,
+            )
+
+            # Decode the generated response
+            response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+            print("\n--- Generated Response ---")
+            print(f"Raw Response: {response}")
+
+            # Check and filter response
+            if "system" in response.lower():
+                print("WARNING: System message detected in response")
+                response = "Hello! How can I assist you today?"
+
+            # Prepare return history
+            return_history = (history or []) + [
+                {"user": message, "assistant": response}
+            ]
+
+            print("\n--- Return History ---")
+            print(f"Return History Length: {len(return_history)}")
+            for entry in return_history:
+                print(f"User: {entry['user']}")
+                print(f"Assistant: {entry['assistant'][:100]}...")  # Truncate long responses
+
+            return return_history
+
+        except Exception as gen_error:
+            print("\n--- GENERATION ERROR ---")
+            print(f"Error during model generation: {gen_error}")
+            return []
 
-    # Check and filter out unwanted system-level messages or metadata
-    if "system" in response.lower():
-        print("System message detected. Replacing with fallback response.")
-        response = "Hello! How can I assist you today?"
-
-    # Prepare the return format for Gradio (list of [user_message, assistant_message])
-    if history is None:
-        history = []
-
-    # Append the new conversation turn
-    history.append([message, response])
-
-    return history
+    except Exception as prep_error:
+        print("\n--- PREPARATION ERROR ---")
+        print(f"Error during message preparation: {prep_error}")
+        return []
 
 # Define the Gradio interface
 demo = gr.ChatInterface(
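A note on the decode step in the hunk above: for a decoder-only model, `model.generate` returns the prompt tokens followed by the new tokens, so `tokenizer.decode(generated_tokens[0], ...)` yields the entire rendered chat template (system prompt included), not just the assistant's reply; that is likely what keeps triggering the `"system" in response.lower()` fallback. Below is a minimal sketch of decoding only the continuation, using a hypothetical helper name (`decode_new_tokens`, not part of this commit) and the `inputs`/`generated_tokens` shapes from the diff:

```python
import torch

def decode_new_tokens(tokenizer, inputs: torch.Tensor, generated: torch.Tensor) -> str:
    """Decode only the tokens generated after the prompt.

    inputs:    (1, prompt_len) tensor from tokenizer.apply_chat_template
    generated: (1, prompt_len + new_len) tensor from model.generate
    """
    prompt_len = inputs.shape[-1]  # length of the encoded prompt
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
```

A related caveat: `inputs.ne(tokenizer.pad_token_id)` raises a TypeError when the tokenizer defines no pad token (common for Llama-family tokenizers); falling back to `tokenizer.eos_token_id` when `pad_token_id` is None is the usual guard.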
@@ -92,6 +127,7 @@ demo = gr.ChatInterface(
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
     ],
+    type="messages"  # Explicitly set to messages type
 )
 
 if __name__ == "__main__":
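For context on the `type="messages"` flag added in the last hunk: Gradio's messages format exchanges chat history as OpenAI-style dicts with `role` and `content` keys, which is a different shape from the `{"user": ..., "assistant": ...}` entries that `return_history` builds above. A minimal sketch of the expected shape, with `to_messages` as a hypothetical adapter (not part of this commit):

```python
# History as gr.ChatInterface(type="messages") exchanges it:
history = [
    {"role": "user", "content": "Hi there"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
]

# Hypothetical adapter: one {"user": ..., "assistant": ...} turn
# (the shape return_history uses above) becomes two messages-format dicts.
def to_messages(entry: dict) -> list[dict]:
    return [
        {"role": "user", "content": entry.get("user", "")},
        {"role": "assistant", "content": entry.get("assistant", "")},
    ]
```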