Ais committed
Update app/main.py

app/main.py  +52 -107  CHANGED
@@ -19,7 +19,7 @@ app.add_middleware(
 )
 
 # === Load API Key from Hugging Face Secrets ===
-API_KEY = os.getenv("API_KEY", "undefined")
+API_KEY = os.getenv("API_KEY", "undefined")  # Add API_KEY in your HF Space Secrets
 
 # === Model Settings ===
 BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
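A note on the `"undefined"` fallback: if the `API_KEY` secret is never set, the server still boots, and a client that literally sends `Bearer undefined` passes the key comparison below. A stricter sketch (stdlib only, not the committed code) that fails fast at startup instead:

```python
import os

API_KEY = os.getenv("API_KEY")
if not API_KEY:
    # Refuse to start without a real secret rather than falling
    # back to the guessable sentinel value "undefined".
    raise RuntimeError("API_KEY is not set; add it under the Space's Settings -> Secrets.")
```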
@@ -53,7 +53,7 @@ async def chat(request: Request):
     auth_header = request.headers.get("Authorization", "")
     if not auth_header.startswith("Bearer "):
         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
-
+
     token = auth_header.replace("Bearer ", "").strip()
     if token != API_KEY:
         return JSONResponse(status_code=401, content={"error": "Invalid API key."})
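For reference, a request that satisfies both checks. The route path and Space URL are assumptions, since the route decorator is not shown in this diff; substitute your actual values:

```python
import requests

# Hypothetical URL and path; replace with your Space's endpoint
resp = requests.post(
    "https://your-space.hf.space/chat",
    headers={"Authorization": "Bearer <your-api-key>"},
    json={"messages": [{"role": "user", "content": "Hello!"}]},
    timeout=60,
)
print(resp.status_code, resp.json())
```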
@@ -65,9 +65,22 @@ async def chat(request: Request):
         if not messages or not isinstance(messages, list):
             raise ValueError("Invalid or missing 'messages' field.")
 
-
-
+        # Extract system and user messages
+        system_message = ""
+        user_messages = []
+
+        for msg in messages:
+            if msg.get("role") == "system":
+                system_message = msg.get("content", "")
+            elif msg.get("role") in ["user", "assistant"]:
+                user_messages.append(msg)
+
+        # Get the last user message
+        if not user_messages:
+            raise ValueError("No user messages found.")
+
+        user_prompt = user_messages[-1]["content"]
 
     except Exception as e:
         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
 
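One observation on the new extraction logic: assistant turns are appended to `user_messages` as well, so `user_messages[-1]` assumes the conversation always ends with a user turn, and everything before the last message is effectively dropped. A minimal illustration of what the loop reduces a payload to:

```python
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Explain repetition_penalty."},
]
# After the loop above:
#   system_message == "You are a concise assistant."
#   user_messages  == [{"role": "user", "content": "Explain repetition_penalty."}]
#   user_prompt    == "Explain repetition_penalty."
```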
@@ -74,24 +87,6 @@ async def chat(request: Request):
-    # ✅ FIXED:
-
-
-    # ✅ Build clean conversation prompt
-    formatted_prompt = ""
-
-    for message in recent_messages:
-        role = message.get("role", "")
-        content = message.get("content", "")
-
-        if role == "system":
-            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
-        elif role == "user":
-            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
-        elif role == "assistant":
-            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
-
-    # Add the assistant start token for generation
-    formatted_prompt += "<|im_start|>assistant\n"
-
-    print(f"🤖 Processing {len(recent_messages)} recent messages")
+    # ✅ FIXED: Simplified prompt formatting - no system message in prompt
+    # The system message is handled by the frontend logic, not in the model prompt
+    formatted_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
 
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
 
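Hand-building the ChatML string works for Qwen2, but recent `transformers` tokenizers already carry the model's chat template, which yields the same markers without hard-coded literals. An equivalent sketch:

```python
# Same prompt via the tokenizer's built-in chat template;
# add_generation_prompt=True appends the trailing "<|im_start|>assistant\n"
formatted_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": user_prompt}],
    tokenize=False,
    add_generation_prompt=True,
)
```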
@@ -98,14 +93,15 @@ async def chat(request: Request):
-    # ✅ Generate Response
+    # ✅ Generate Response with better settings for small model
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=
-            temperature=
+            max_new_tokens=400,  # Reduced for more focused responses
+            temperature=0.7,
             top_p=0.9,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
-
+            repetition_penalty=1.1,  # Prevent repetition
+            length_penalty=1.0
         )
 
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
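One caveat for the extraction step in the next hunk: with `skip_special_tokens=True`, tokens registered as special (Qwen2's tokenizer registers `<|im_start|>` and `<|im_end|>`) are dropped from `decoded`, so splitting on `"<|im_start|>assistant\n"` can silently match nothing and return the whole text, prompt included. Slicing the prompt tokens off before decoding sidesteps string matching entirely; a sketch:

```python
# Decode only the newly generated tokens; the prompt occupies the
# first input_ids.shape[-1] positions of outputs[0].
prompt_len = inputs["input_ids"].shape[-1]
final_answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
```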
@@ -112,86 +108,35 @@ async def chat(request: Request):
-    # ✅
-
-
-
-
-
-
-
-
-    # Remove end token
-    if "<|im_end|>" in final_answer:
-        final_answer = final_answer.split("<|im_end|>")[0].strip()
-
-    # ✅ CRITICAL: Remove conversation artifacts that leak through
-    # Remove user/assistant role labels that appear in content
-    final_answer = final_answer.replace("user\n", "").replace("assistant\n", "")
-
-    # Remove repeated questions and conversation artifacts
-    lines = final_answer.split('\n')
-    cleaned_lines = []
-    seen_content = set()
-    found_answer = False
-
-    for line in lines:
-        line = line.strip()
+    # ✅ FIXED: Better extraction - remove the prompt part completely
+    final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()
+
+    # ✅ Additional cleaning to prevent system message leakage
+    if final_answer.lower().startswith(("you are a helpful", "i am a helpful", "as a helpful")):
+        # If the response starts with system-like text, try to extract actual content
+        lines = final_answer.split('\n')
+        cleaned_lines = []
+        found_content = False
 
-
-
-
-
-        # Skip if this exact line was seen before (removes repeats)
-        if line in seen_content:
-            continue
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
 
-
-
-
-
+            # Skip system-like phrases
+            if any(phrase in line.lower() for phrase in [
+                "you are a helpful", "i am a helpful", "as a helpful assistant",
+                "how can i help", "what can i help", "i'm here to help"
+            ]):
+                continue
 
-
-
-
-        # Skip conversation tokens
-        if '<|im_start|>' in line or '<|im_end|>' in line:
-            continue
+            # This looks like actual content
+            found_content = True
+            cleaned_lines.append(line)
 
-
-
-        cleaned_lines.append(line)
-        seen_content.add(line)
-
-    final_answer = '\n'.join(cleaned_lines).strip()
-
-    # Remove VS Code context if it leaked through
-    if "[VS Code Context:" in final_answer:
-        context_lines = final_answer.split('\n')
-        cleaned_context_lines = [line for line in context_lines if not line.strip().startswith('[VS Code Context:')]
-        final_answer = '\n'.join(cleaned_context_lines).strip()
-
-    # Remove system prompts that leaked through
-    system_indicators = [
-        "Guidelines:",
-        "Response format:",
-        "You are a helpful",
-        "I'm here to help",
-        "system\n",
-        "assistant\n",
-        "user\n"
-    ]
+        if found_content:
+            final_answer = '\n'.join(cleaned_lines)
 
-
-
-
-
-    # Clean up extra whitespace
-    final_answer = final_answer.replace('\n\n\n', '\n\n').strip()
-
-    # Ensure we have some content
-    if not final_answer or len(final_answer.strip()) < 3:
-        final_answer = "I apologize, but I couldn't generate a proper response. Please try again."
-
-    print(f"✅ Clean response: {final_answer[:100]}...")
+    # ✅ Fallback if response is too short or looks like system message
+    if len(final_answer.strip()) < 10 or final_answer.lower().startswith(("system", "user", "assistant")):
+        final_answer = "I understand your question. Let me help you with that."
 
     # ✅ OpenAI-style Response
     return {
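The returned dict is truncated in this diff. For context, an OpenAI-style chat completion body typically has the shape sketched below; the field values are illustrative, not the literal dict from this commit:

```python
# Illustrative OpenAI-compatible shape, not the committed code
response = {
    "object": "chat.completion",
    "model": BASE_MODEL,
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": final_answer},
            "finish_reason": "stop",
        }
    ],
}
```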