michailroussos committed
Commit · 99b9339 · 1 Parent(s): be78dc3
more
app.py
CHANGED
@@ -16,72 +16,107 @@ FastLanguageModel.for_inference(model) # Enable optimized inference

# Define the response function
def respond(message, history, system_message, max_tokens, temperature, top_p):
-    #
-    print("
-    print(
-    print(
+    # Extensive debugging print statements
+    print("\n" + "="*50)
+    print("===== RESPOND FUNCTION CALLED =====")
+    print("="*50)
+
+    # Print input parameters
+    print(f"Input Message: {message}")
+    print(f"System Message: {system_message}")
+    print(f"Max Tokens: {max_tokens}")
+    print(f"Temperature: {temperature}")
+    print(f"Top-p: {top_p}")
+
+    # Debug history
+    print("\n--- Current History ---")
+    print(f"History Type: {type(history)}")
+    print(f"History Content: {history}")

    # Prepare the messages for the model
    messages = []
-    if history:
-        print("Adding previous messages to the history...")
-        for entry in history:
-            messages.append({"role": "user", "content": entry[0]})
-            messages.append({"role": "assistant", "content": entry[1]})
-
-    # Add the current user message
-    print(f"Adding current user message: {message}")
-    messages.append({"role": "user", "content": message})
-
-    # Print the messages list before tokenization
-    print("Messages before tokenization:", messages)
-
-    # Tokenize the input (prepare the data for the model)
-    print("Preparing the input for the model...")
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_tensors="pt",
-    ).to("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Print the tokenized inputs
-    print(f"Tokenized inputs: {inputs}")
-
-    # Generate the response
-    attention_mask = inputs.ne(tokenizer.pad_token_id).long()
-    print(f"Attention mask: {attention_mask}")
-
    try:
-    [old lines 56-70: removed code inside the try block, not captured in the page extraction]
+        if history:
+            print("\n--- Processing Existing History ---")
+            for entry in history:
+                print(f"Processing entry: {entry}")
+                # Ensure entry is a dictionary with 'user' and 'assistant' keys
+                if isinstance(entry, dict):
+                    messages.append({"role": "user", "content": entry.get('user', '')})
+                    messages.append({"role": "assistant", "content": entry.get('assistant', '')})
+                elif isinstance(entry, list) and len(entry) == 2:
+                    # Handle case where history might be a list of [user, assistant]
+                    messages.append({"role": "user", "content": entry[0]})
+                    messages.append({"role": "assistant", "content": entry[1]})
+                else:
+                    print(f"WARNING: Unexpected history entry format: {entry}")
+
+        # Add the current user message
+        print("\n--- Adding Current Message ---")
+        messages.append({"role": "user", "content": message})
+
+        # Debug messages before tokenization
+        print("\n--- Messages Before Tokenization ---")
+        for msg in messages:
+            print(f"Role: {msg['role']}, Content: {msg['content']}")
+
+        # Tokenize the input
+        print("\n--- Tokenizing Input ---")
+        inputs = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+        ).to("cuda" if torch.cuda.is_available() else "cpu")
+
+        print(f"Tokenized Inputs Shape: {inputs.shape}")
+        print(f"Tokenized Inputs Device: {inputs.device}")
+
+        # Generate response
+        attention_mask = inputs.ne(tokenizer.pad_token_id).long()
+
+        try:
+            generated_tokens = model.generate(
+                input_ids=inputs,
+                attention_mask=attention_mask,
+                max_new_tokens=max_tokens,
+                use_cache=True,
+                temperature=temperature,
+                top_p=top_p,
+            )
+
+            # Decode the generated response
+            response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+            print("\n--- Generated Response ---")
+            print(f"Raw Response: {response}")
+
+            # Check and filter response
+            if "system" in response.lower():
+                print("WARNING: System message detected in response")
+                response = "Hello! How can I assist you today?"
+
+            # Prepare return history
+            return_history = (history or []) + [
+                {"user": message, "assistant": response}
+            ]
+
+            print("\n--- Return History ---")
+            print(f"Return History Length: {len(return_history)}")
+            for entry in return_history:
+                print(f"User: {entry['user']}")
+                print(f"Assistant: {entry['assistant'][:100]}...") # Truncate long responses
+
+            return return_history
+
+        except Exception as gen_error:
+            print("\n--- GENERATION ERROR ---")
+            print(f"Error during model generation: {gen_error}")
+            return []

-
-
-    print("
-
-
-    # Prepare the return format for Gradio (list of [user_message, assistant_message])
-    if history is None:
-        history = []
-
-    # Append the new conversation turn
-    history.append([message, response])
-
-    return history
+    except Exception as prep_error:
+        print("\n--- PREPARATION ERROR ---")
+        print(f"Error during message preparation: {prep_error}")
+        return []

# Define the Gradio interface
demo = gr.ChatInterface(

@@ -92,6 +127,7 @@ demo = gr.ChatInterface(
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
+    type="messages" # Explicitly set to messages type
)

if __name__ == "__main__":
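For reference, a minimal standalone sketch of the history-normalization branches this commit adds to respond(), showing that the new dict-style entries and the previous [user, assistant] pairs produce the same chat-template-ready message list. The helper name normalize_history and the sample data are hypothetical and not part of app.py; no model, tokenizer, or Gradio objects are needed to run it.

# Hypothetical sketch mirroring the new history-handling branches in respond();
# normalize_history is used here only for illustration and is not defined in app.py.
def normalize_history(history):
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            # New format: {"user": ..., "assistant": ...} entries built by return_history
            messages.append({"role": "user", "content": entry.get("user", "")})
            messages.append({"role": "assistant", "content": entry.get("assistant", "")})
        elif isinstance(entry, list) and len(entry) == 2:
            # Previous format: [user_message, assistant_message] pairs
            messages.append({"role": "user", "content": entry[0]})
            messages.append({"role": "assistant", "content": entry[1]})
        else:
            print(f"WARNING: Unexpected history entry format: {entry}")
    return messages

# Both formats yield the same role/content message list
assert normalize_history([["hi", "hello!"]]) == normalize_history(
    [{"user": "hi", "assistant": "hello!"}]
)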