Ais committed on
Commit 45afec6 · verified · 1 Parent(s): 9c1b824

Update app/main.py

Files changed (1): app/main.py +52 -107
app/main.py CHANGED
@@ -19,7 +19,7 @@ app.add_middleware(
 )
 
 # === Load API Key from Hugging Face Secrets ===
-API_KEY = os.getenv("API_KEY", "undefined")
+API_KEY = os.getenv("API_KEY", "undefined")  # Add API_KEY in your HF Space Secrets
 
 # === Model Settings ===
 BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
@@ -53,7 +53,7 @@ async def chat(request: Request):
     auth_header = request.headers.get("Authorization", "")
     if not auth_header.startswith("Bearer "):
         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
-
+
     token = auth_header.replace("Bearer ", "").strip()
     if token != API_KEY:
         return JSONResponse(status_code=401, content={"error": "Invalid API key."})
@@ -65,133 +65,78 @@ async def chat(request: Request):
         if not messages or not isinstance(messages, list):
             raise ValueError("Invalid or missing 'messages' field.")
 
-        temperature = body.get("temperature", 0.7)
-        max_tokens = body.get("max_tokens", 512)
+        # Extract system and user messages
+        system_message = ""
+        user_messages = []
+
+        for msg in messages:
+            if msg.get("role") == "system":
+                system_message = msg.get("content", "")
+            elif msg.get("role") in ["user", "assistant"]:
+                user_messages.append(msg)
+
+        # Get the last user message
+        if not user_messages:
+            raise ValueError("No user messages found.")
+
+        user_prompt = user_messages[-1]["content"]
 
     except Exception as e:
         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
 
-    # ✅ FIXED: Only use last 4 messages to prevent stacking
-    recent_messages = messages[-4:] if len(messages) > 4 else messages
-
-    # ✅ Build clean conversation prompt
-    formatted_prompt = ""
-
-    for message in recent_messages:
-        role = message.get("role", "")
-        content = message.get("content", "")
-
-        if role == "system":
-            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
-        elif role == "user":
-            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
-        elif role == "assistant":
-            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
-
-    # Add the assistant start token for generation
-    formatted_prompt += "<|im_start|>assistant\n"
-
-    print(f"🤖 Processing {len(recent_messages)} recent messages")
+    # ✅ FIXED: Simplified prompt formatting - no system message in prompt
+    # The system message is handled by the frontend logic, not in the model prompt
+    formatted_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
 
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
 
-    # ✅ Generate Response
+    # ✅ Generate Response with better settings for small model
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
+            max_new_tokens=400,  # Reduced for more focused responses
+            temperature=0.7,
             top_p=0.9,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id
+            repetition_penalty=1.1,  # Prevent repetition
+            length_penalty=1.0
         )
 
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # ✅ MUCH BETTER: Extract only the final assistant response
-    if "<|im_start|>assistant\n" in decoded:
-        # Get everything after the LAST assistant token
-        parts = decoded.split("<|im_start|>assistant\n")
-        final_answer = parts[-1].strip()
-    else:
-        # Fallback if no assistant token found
-        final_answer = decoded.strip()
-
-    # Remove end token
-    if "<|im_end|>" in final_answer:
-        final_answer = final_answer.split("<|im_end|>")[0].strip()
-
-    # ✅ CRITICAL: Remove conversation artifacts that leak through
-    # Remove user/assistant role labels that appear in content
-    final_answer = final_answer.replace("user\n", "").replace("assistant\n", "")
-
-    # Remove repeated questions and conversation artifacts
-    lines = final_answer.split('\n')
-    cleaned_lines = []
-    seen_content = set()
-    found_answer = False
-
-    for line in lines:
-        line = line.strip()
+    # ✅ FIXED: Better extraction - remove the prompt part completely
+    final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()
+
+    # Additional cleaning to prevent system message leakage
+    if final_answer.lower().startswith(("you are a helpful", "i am a helpful", "as a helpful")):
+        # If the response starts with system-like text, try to extract actual content
+        lines = final_answer.split('\n')
+        cleaned_lines = []
+        found_content = False
 
-        # Skip empty lines at the start
-        if not line and not found_answer:
-            continue
-
-        # Skip if this exact line was seen before (removes repeats)
-        if line in seen_content:
-            continue
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
 
-        # Skip lines that look like user prompts being repeated
-        if line.endswith('?') and len(line) < 100 and not found_answer:
-            print(f"🚫 Skipping repeated question: {line}")
-            continue
+            # Skip system-like phrases
+            if any(phrase in line.lower() for phrase in [
+                "you are a helpful", "i am a helpful", "as a helpful assistant",
+                "how can i help", "what can i help", "i'm here to help"
+            ]):
+                continue
 
-        # Skip role indicators
-        if line in ['user', 'assistant', 'system']:
-            continue
-
-        # Skip conversation tokens
-        if '<|im_start|>' in line or '<|im_end|>' in line:
-            continue
+            # This looks like actual content
+            found_content = True
+            cleaned_lines.append(line)
 
-        # If we get here, this looks like actual content
-        found_answer = True
-        cleaned_lines.append(line)
-        seen_content.add(line)
-
-    final_answer = '\n'.join(cleaned_lines).strip()
-
-    # Remove VS Code context if it leaked through
-    if "[VS Code Context:" in final_answer:
-        context_lines = final_answer.split('\n')
-        cleaned_context_lines = [line for line in context_lines if not line.strip().startswith('[VS Code Context:')]
-        final_answer = '\n'.join(cleaned_context_lines).strip()
-
-    # Remove system prompts that leaked through
-    system_indicators = [
-        "Guidelines:",
-        "Response format:",
-        "You are a helpful",
-        "I'm here to help",
-        "system\n",
-        "assistant\n",
-        "user\n"
-    ]
+        if found_content:
+            final_answer = '\n'.join(cleaned_lines)
 
-    for indicator in system_indicators:
-        if indicator in final_answer:
-            final_answer = final_answer.split(indicator)[0].strip()
-
-    # Clean up extra whitespace
-    final_answer = final_answer.replace('\n\n\n', '\n\n').strip()
-
-    # Ensure we have some content
-    if not final_answer or len(final_answer.strip()) < 3:
-        final_answer = "I apologize, but I couldn't generate a proper response. Please try again."
-
-    print(f"✅ Clean response: {final_answer[:100]}...")
+    # Fallback if response is too short or looks like system message
+    if len(final_answer.strip()) < 10 or final_answer.lower().startswith(("system", "user", "assistant")):
+        final_answer = "I understand your question. Let me help you with that."
 
     # ✅ OpenAI-style Response
     return {
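
Note: the new prompt is hand-rolled ChatML for a single user turn. As a side remark (not part of this commit), recent transformers tokenizers, including Qwen2's, can build an equivalent prompt via apply_chat_template; a minimal sketch, assuming the same BASE_MODEL tokenizer:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Single user turn, ending with the assistant start token so generation continues there.
# NOTE: some chat templates (Qwen2 included) may prepend a default system message,
# so the result is close to, but not necessarily byte-identical with, the f-string in the diff.
messages = [{"role": "user", "content": "Explain what a FastAPI middleware does."}]
formatted_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a string rather than token IDs
    add_generation_prompt=True,  # append "<|im_start|>assistant\n"
)
print(formatted_prompt)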
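
For reference, calling the updated endpoint from a client could look like the sketch below. The Bearer-token header and the "messages" payload follow from the diff; the route path ("/chat" here), host/port, and the exact keys of the OpenAI-style response body are not visible in this diff and are assumptions.

import requests

# Hypothetical values: the diff does not show the @app.post(...) route or the Space URL.
URL = "http://localhost:7860/chat"
API_KEY = "your-space-secret"  # must match the API_KEY secret read via os.getenv("API_KEY")

payload = {
    "messages": [
        {"role": "system", "content": "You are a concise coding assistant."},
        {"role": "user", "content": "Summarize what this endpoint does."},
    ]
}

resp = requests.post(
    URL,
    json=payload,
    headers={"Authorization": f"Bearer {API_KEY}"},  # missing/invalid token -> 401
    timeout=120,
)
resp.raise_for_status()
data = resp.json()

# Assuming an OpenAI-style body; adjust the keys if the actual return dict differs.
print(data.get("choices", [{}])[0].get("message", {}).get("content", data))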