Ais committed
Update app/main.py

app/main.py  +52 -107  CHANGED
@@ -19,7 +19,7 @@ app.add_middleware(
 )
 
 # === Load API Key from Hugging Face Secrets ===
-API_KEY = os.getenv("API_KEY", "undefined")
+API_KEY = os.getenv("API_KEY", "undefined")  # Add API_KEY in your HF Space Secrets
 
 # === Model Settings ===
 BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
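A note on the `"undefined"` fallback: if the `API_KEY` secret is never set, the server still boots, and a client that literally sends `Bearer undefined` passes the key comparison below. A stricter sketch (stdlib only, not the committed code) that fails fast at startup instead:

```python
import os

API_KEY = os.getenv("API_KEY")
if not API_KEY:
    # Refuse to start without a real secret rather than falling
    # back to the guessable sentinel value "undefined".
    raise RuntimeError("API_KEY is not set; add it under the Space's Settings -> Secrets.")
```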
@@ -53,7 +53,7 @@ async def chat(request: Request):
     auth_header = request.headers.get("Authorization", "")
     if not auth_header.startswith("Bearer "):
         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
-
+
     token = auth_header.replace("Bearer ", "").strip()
     if token != API_KEY:
         return JSONResponse(status_code=401, content={"error": "Invalid API key."})
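For reference, a request that satisfies both checks. The route path and Space URL are assumptions, since the route decorator is not shown in this diff; substitute your actual values:

```python
import requests

# Hypothetical URL and path; replace with your Space's endpoint
resp = requests.post(
    "https://your-space.hf.space/chat",
    headers={"Authorization": "Bearer <your-api-key>"},
    json={"messages": [{"role": "user", "content": "Hello!"}]},
    timeout=60,
)
print(resp.status_code, resp.json())
```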
@@ -65,9 +65,22 @@ async def chat(request: Request):
         if not messages or not isinstance(messages, list):
             raise ValueError("Invalid or missing 'messages' field.")
 
-
-
+        # Extract system and user messages
+        system_message = ""
+        user_messages = []
+
+        for msg in messages:
+            if msg.get("role") == "system":
+                system_message = msg.get("content", "")
+            elif msg.get("role") in ["user", "assistant"]:
+                user_messages.append(msg)
+
+        # Get the last user message
+        if not user_messages:
+            raise ValueError("No user messages found.")
+
+        user_prompt = user_messages[-1]["content"]
 
     except Exception as e:
         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
 
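One observation on the new extraction logic: assistant turns are appended to `user_messages` as well, so `user_messages[-1]` assumes the conversation always ends with a user turn, and everything before the last message is effectively dropped. A minimal illustration of what the loop reduces a payload to:

```python
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Explain repetition_penalty."},
]
# After the loop above:
#   system_message == "You are a concise assistant."
#   user_messages  == [{"role": "user", "content": "Explain repetition_penalty."}]
#   user_prompt    == "Explain repetition_penalty."
```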
@@ -74,24 +87,6 @@ async def chat(request: Request):
-    # ✅ FIXED:
-
-
-    # ✅ Build clean conversation prompt
-    formatted_prompt = ""
-
-    for message in recent_messages:
-        role = message.get("role", "")
-        content = message.get("content", "")
-
-        if role == "system":
-            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
-        elif role == "user":
-            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
-        elif role == "assistant":
-            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
-
-    # Add the assistant start token for generation
-    formatted_prompt += "<|im_start|>assistant\n"
-
-    print(f"🤖 Processing {len(recent_messages)} recent messages")
+    # ✅ FIXED: Simplified prompt formatting - no system message in prompt
+    # The system message is handled by the frontend logic, not in the model prompt
+    formatted_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
 
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
 
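Hand-building the ChatML string works for Qwen2, but recent `transformers` tokenizers already carry the model's chat template, which yields the same markers without hard-coded literals. An equivalent sketch:

```python
# Same prompt via the tokenizer's built-in chat template;
# add_generation_prompt=True appends the trailing "<|im_start|>assistant\n"
formatted_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": user_prompt}],
    tokenize=False,
    add_generation_prompt=True,
)
```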
@@ -98,14 +93,15 @@ async def chat(request: Request):
-    # ✅ Generate Response
+    # ✅ Generate Response with better settings for small model
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=
-            temperature=
+            max_new_tokens=400,  # Reduced for more focused responses
+            temperature=0.7,
             top_p=0.9,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
-
+            repetition_penalty=1.1,  # Prevent repetition
+            length_penalty=1.0
         )
 
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
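One caveat for the extraction step in the next hunk: with `skip_special_tokens=True`, tokens registered as special (Qwen2's tokenizer registers `<|im_start|>` and `<|im_end|>`) are dropped from `decoded`, so splitting on `"<|im_start|>assistant\n"` can silently match nothing and return the whole text, prompt included. Slicing the prompt tokens off before decoding sidesteps string matching entirely; a sketch:

```python
# Decode only the newly generated tokens; the prompt occupies the
# first input_ids.shape[-1] positions of outputs[0].
prompt_len = inputs["input_ids"].shape[-1]
final_answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
```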
@@ -112,86 +108,35 @@ async def chat(request: Request):
-    # ✅
-
-
-
-
-
-
-
-
-    # Remove end token
-    if "<|im_end|>" in final_answer:
-        final_answer = final_answer.split("<|im_end|>")[0].strip()
-
-    # ✅ CRITICAL: Remove conversation artifacts that leak through
-    # Remove user/assistant role labels that appear in content
-    final_answer = final_answer.replace("user\n", "").replace("assistant\n", "")
-
-    # Remove repeated questions and conversation artifacts
-    lines = final_answer.split('\n')
-    cleaned_lines = []
-    seen_content = set()
-    found_answer = False
-
-    for line in lines:
-        line = line.strip()
+    # ✅ FIXED: Better extraction - remove the prompt part completely
+    final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()
+
+    # ✅ Additional cleaning to prevent system message leakage
+    if final_answer.lower().startswith(("you are a helpful", "i am a helpful", "as a helpful")):
+        # If the response starts with system-like text, try to extract actual content
+        lines = final_answer.split('\n')
+        cleaned_lines = []
+        found_content = False
 
-
-
-
-
-        # Skip if this exact line was seen before (removes repeats)
-        if line in seen_content:
-            continue
+        for line in lines:
+            line = line.strip()
+            if not line:
+                continue
 
-
-
-
-
+            # Skip system-like phrases
+            if any(phrase in line.lower() for phrase in [
+                "you are a helpful", "i am a helpful", "as a helpful assistant",
+                "how can i help", "what can i help", "i'm here to help"
+            ]):
+                continue
 
-
-
-
-        # Skip conversation tokens
-        if '<|im_start|>' in line or '<|im_end|>' in line:
-            continue
+            # This looks like actual content
+            found_content = True
+            cleaned_lines.append(line)
 
-
-
-        cleaned_lines.append(line)
-        seen_content.add(line)
-
-    final_answer = '\n'.join(cleaned_lines).strip()
-
-    # Remove VS Code context if it leaked through
-    if "[VS Code Context:" in final_answer:
-        context_lines = final_answer.split('\n')
-        cleaned_context_lines = [line for line in context_lines if not line.strip().startswith('[VS Code Context:')]
-        final_answer = '\n'.join(cleaned_context_lines).strip()
-
-    # Remove system prompts that leaked through
-    system_indicators = [
-        "Guidelines:",
-        "Response format:",
-        "You are a helpful",
-        "I'm here to help",
-        "system\n",
-        "assistant\n",
-        "user\n"
-    ]
+        if found_content:
+            final_answer = '\n'.join(cleaned_lines)
 
-
-
-
-
-    # Clean up extra whitespace
-    final_answer = final_answer.replace('\n\n\n', '\n\n').strip()
-
-    # Ensure we have some content
-    if not final_answer or len(final_answer.strip()) < 3:
-        final_answer = "I apologize, but I couldn't generate a proper response. Please try again."
-
-    print(f"✅ Clean response: {final_answer[:100]}...")
+    # ✅ Fallback if response is too short or looks like system message
+    if len(final_answer.strip()) < 10 or final_answer.lower().startswith(("system", "user", "assistant")):
+        final_answer = "I understand your question. Let me help you with that."
 
     # ✅ OpenAI-style Response
     return {
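The returned dict is truncated in this diff. For context, an OpenAI-style chat completion body typically has the shape sketched below; the field values are illustrative, not the literal dict from this commit:

```python
# Illustrative OpenAI-compatible shape, not the committed code
response = {
    "object": "chat.completion",
    "model": BASE_MODEL,
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": final_answer},
            "finish_reason": "stop",
        }
    ],
}
```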