Ais committed
Update app/main.py

app/main.py CHANGED (+26 -11)
@@ -19,7 +19,7 @@ app.add_middleware(
 )
 
 # === Load API Key from Hugging Face Secrets ===
-API_KEY = os.getenv("API_KEY", "undefined")
+API_KEY = os.getenv("API_KEY", "undefined")
 
 # === Model Settings ===
 BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
@@ -65,17 +65,19 @@ async def chat(request: Request):
         if not messages or not isinstance(messages, list):
             raise ValueError("Invalid or missing 'messages' field.")
 
-        # ✅ FIXED: Process full conversation history, not just last message
         temperature = body.get("temperature", 0.7)
         max_tokens = body.get("max_tokens", 512)
 
     except Exception as e:
         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
 
-    # ✅ FIXED:
+    # ✅ FIXED: Only use last 4 messages to prevent stacking
+    recent_messages = messages[-4:] if len(messages) > 4 else messages
+
+    # ✅ Build clean conversation prompt
     formatted_prompt = ""
 
-    for message in messages:
+    for message in recent_messages:
         role = message.get("role", "")
         content = message.get("content", "")
 
@@ -89,9 +91,8 @@ async def chat(request: Request):
     # Add the assistant start token for generation
    formatted_prompt += "<|im_start|>assistant\n"
 
-    print(f"🤖 Processing
-
-
+    print(f"🤖 Processing {len(recent_messages)} recent messages")
+
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
 
     # ✅ Generate Response
@@ -102,19 +103,33 @@ async def chat(request: Request):
         temperature=temperature,
         top_p=0.9,
         do_sample=True,
-        pad_token_id=tokenizer.eos_token_id
+        pad_token_id=tokenizer.eos_token_id,
+        eos_token_id=tokenizer.eos_token_id
     )
 
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # ✅ FIXED:
+    # ✅ FIXED: Extract ONLY the new assistant response
     final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()
 
-    # Remove any
+    # Remove any end tokens or artifacts
     if "<|im_end|>" in final_answer:
         final_answer = final_answer.split("<|im_end|>")[0].strip()
 
-
+    # Remove any repeated system prompts or guidelines that leaked through
+    if "Guidelines:" in final_answer:
+        final_answer = final_answer.split("Guidelines:")[0].strip()
+
+    if "Response format:" in final_answer:
+        final_answer = final_answer.split("Response format:")[0].strip()
+
+    # Remove VS Code context if it leaked through
+    if "[VS Code Context:" in final_answer:
+        lines = final_answer.split('\n')
+        cleaned_lines = [line for line in lines if not line.strip().startswith('[VS Code Context:')]
+        final_answer = '\n'.join(cleaned_lines).strip()
+
+    print(f"✅ Clean response: {final_answer[:100]}...")
 
     # ✅ OpenAI-style Response
     return {
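For reference, a runnable standalone sketch of the prompt-building step this commit changes. The build_prompt helper and the demo messages are illustrative, and the ChatML line inside the loop is inferred from the <|im_start|>/<|im_end|> tags the handler splits on; only the trimming, the role/content extraction, and the assistant start tag appear verbatim in the hunks above.

# Sketch of the prompt building in app/main.py after this commit.
# build_prompt is a hypothetical helper; the loop body's ChatML line is an
# assumption inferred from the tags the handler splits on later.
def build_prompt(messages: list) -> str:
    # Only use last 4 messages to prevent stacking (as in the diff)
    recent_messages = messages[-4:] if len(messages) > 4 else messages

    formatted_prompt = ""
    for message in recent_messages:
        role = message.get("role", "")
        content = message.get("content", "")
        # Assumed ChatML turn format, consistent with the tags in the diff
        formatted_prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"

    # Add the assistant start token for generation (verbatim from the diff)
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt

demo = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "One"},
    {"role": "user", "content": "Two"},
    {"role": "user", "content": "Three"},
]
print(build_prompt(demo))  # the oldest message is trimmed away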
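The extraction and cleanup path added above can likewise be read as a pure function, shown here so it can be tested against canned strings. clean_response is a hypothetical name; the logic mirrors the added lines.

# Hypothetical helper wrapping the cleanup added in this commit.
def clean_response(decoded: str) -> str:
    # Keep only the text after the last assistant start tag
    final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()

    # Remove any end tokens or artifacts
    if "<|im_end|>" in final_answer:
        final_answer = final_answer.split("<|im_end|>")[0].strip()

    # Remove repeated system prompts or guidelines that leaked through
    for marker in ("Guidelines:", "Response format:"):
        if marker in final_answer:
            final_answer = final_answer.split(marker)[0].strip()

    # Remove VS Code context lines if they leaked through
    if "[VS Code Context:" in final_answer:
        lines = final_answer.split('\n')
        cleaned = [ln for ln in lines if not ln.strip().startswith('[VS Code Context:')]
        final_answer = '\n'.join(cleaned).strip()

    return final_answer

assert clean_response(
    "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\nHello!<|im_end|>"
) == "Hello!"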
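Finally, a hedged example of calling the endpoint. The /chat path, the port, and the X-API-Key header are assumptions, since neither the route decorator nor the key check appears in these hunks; the messages, temperature, and max_tokens fields match what the handler reads from the request body.

import requests

resp = requests.post(
    "http://localhost:7860/chat",           # assumed route and port
    headers={"X-API-Key": "your-api-key"},  # assumed auth header
    json={
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,  # handler default when omitted
        "max_tokens": 512,   # handler default when omitted
    },
)
print(resp.json())  # OpenAI-style response dict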