Spaces:

Ais203
/

aigen

Sleeping

App Files Files Community

Ais committed 14 days ago

Commit

3afe501

verified ·

1 Parent(s): 45afec6

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +107 -56

app/main.py CHANGED Viewed

@@ -41,6 +41,57 @@ model.eval()
 print("✅ Model and adapter loaded successfully.")
 # === Root Route ===
 @app.get("/")
 def root():
@@ -64,79 +115,74 @@ async def chat(request: Request):
         messages = body.get("messages", [])
         if not messages or not isinstance(messages, list):
             raise ValueError("Invalid or missing 'messages' field.")
-        # Extract system and user messages
-        system_message = ""
-        user_messages = []
-        for msg in messages:
-            if msg.get("role") == "system":
-                system_message = msg.get("content", "")
-            elif msg.get("role") in ["user", "assistant"]:
-                user_messages.append(msg)
-        # Get the last user message
-        if not user_messages:
-            raise ValueError("No user messages found.")
-        user_prompt = user_messages[-1]["content"]
     except Exception as e:
         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
-    # ✅ FIXED: Simplified prompt formatting - no system message in prompt
-    # The system message is handled by the frontend logic, not in the model prompt
-    formatted_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
-    # ✅ Generate Response with better settings for small model
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=400,  # Reduced for more focused responses
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
-            repetition_penalty=1.1,  # Prevent repetition
-            length_penalty=1.0
         )
-    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # ✅ FIXED: Better extraction - remove the prompt part completely
-    final_answer = decoded.split("<|im_start|>assistant\n")[-1].strip()
-    # ✅ Additional cleaning to prevent system message leakage
-    if final_answer.lower().startswith(("you are a helpful", "i am a helpful", "as a helpful")):
-        # If the response starts with system-like text, try to extract actual content
-        lines = final_answer.split('\n')
-        cleaned_lines = []
-        found_content = False
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-            # Skip system-like phrases
-            if any(phrase in line.lower() for phrase in [
-                "you are a helpful", "i am a helpful", "as a helpful assistant",
-                "how can i help", "what can i help", "i'm here to help"
-            ]):
-                continue
-            # This looks like actual content
-            found_content = True
-            cleaned_lines.append(line)
-        if found_content:
-            final_answer = '\n'.join(cleaned_lines)
-    # ✅ Fallback if response is too short or looks like system message
-    if len(final_answer.strip()) < 10 or final_answer.lower().startswith(("system", "user", "assistant")):
-        final_answer = "I understand your question. Let me help you with that."
     # ✅ OpenAI-style Response
     return {
@@ -152,5 +198,10 @@ async def chat(request: Request):
                 },
                 "finish_reason": "stop"
             }
-        ]
     }

 print("✅ Model and adapter loaded successfully.")
def clean_response(raw_response) -> str:
    """
    Clean the model response by removing chat-template artifacts while
    preserving the actual answer.

    Args:
        raw_response: Decoded model output. May be ``None`` or empty.

    Returns:
        The cleaned answer text. If the input is ``None``, empty or
        whitespace-only, an apology message is returned. If cleaning removes
        everything (the response consisted solely of template artifacts), a
        generic fallback message is returned. Short but genuine answers such
        as "Yes" or "4" are returned unchanged.
    """
    if not raw_response or not raw_response.strip():
        return "I apologize, but I couldn't generate a proper response. Please try again."

    # Role labels, template markers and default system prompts that might leak
    # through at the start of the response. Matching is case-insensitive.
    prefixes_to_remove = (
        "system\n", "user\n", "assistant\n",
        "System:", "User:", "Assistant:",
        "<|im_start|>", "<|im_end|>",
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
        "You are a helpful assistant.",
        "I am a helpful assistant.",
        "As a helpful assistant,",
    )

    cleaned = raw_response.strip()

    # Removing one prefix can expose another that sits earlier in the list
    # (e.g. "<|im_start|>Assistant: ..."), so repeat until a full pass strips
    # nothing. Every removal shortens the string, so the loop terminates.
    stripped_any = True
    while stripped_any:
        stripped_any = False
        for prefix in prefixes_to_remove:
            if cleaned[:len(prefix)].lower() == prefix.lower():
                cleaned = cleaned[len(prefix):].strip()
                stripped_any = True

    # Drop lines that consist solely of a bare role name. Leading blank lines
    # are removed by the final strip().
    role_markers = {"system", "user", "assistant"}
    kept_lines = [
        line for line in cleaned.split('\n')
        if line.strip() not in role_markers
    ]
    cleaned = '\n'.join(kept_lines).strip()

    # Any remaining content is a real answer, however short.
    if cleaned:
        return cleaned

    # Fallback only if truly empty
    return "I understand your question. Let me help you with that."
 # === Root Route ===
 @app.get("/")
 def root():
         messages = body.get("messages", [])
         if not messages or not isinstance(messages, list):
             raise ValueError("Invalid or missing 'messages' field.")
     except Exception as e:
         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
+    # ✅ FIXED: Use proper Qwen2.5 chat template formatting
+    try:
+        # Use the tokenizer's built-in chat template - this is the correct way!
+        formatted_prompt = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        print(f"🔍 Formatted prompt: {formatted_prompt}")
+    except Exception as e:
+        print(f"❌ Chat template error: {e}")
+        # Fallback to manual formatting if needed
+        formatted_prompt = ""
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+            if role == "system":
+                formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
+            elif role == "user":
+                formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
+            elif role == "assistant":
+                formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
+        formatted_prompt += "<|im_start|>assistant\n"
     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
+    # ✅ Generate Response with optimized settings
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=512,  # Increased for better responses
             temperature=0.7,
             top_p=0.9,
             do_sample=True,
             pad_token_id=tokenizer.eos_token_id,
+            repetition_penalty=1.05,  # Slightly reduced
+            length_penalty=1.0,
+            early_stopping=True
         )
+    # ✅ FIXED: Better response extraction
+    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    print(f"🔍 Full generated response: {full_response}")
+    # Extract only the new generated part (after the prompt)
+    if formatted_prompt in full_response:
+        generated_part = full_response.split(formatted_prompt)[-1].strip()
+    else:
+        # If we can't find the exact prompt, try to extract the assistant's response
+        assistant_marker = "<|im_start|>assistant\n"
+        if assistant_marker in full_response:
+            parts = full_response.split(assistant_marker)
+            generated_part = parts[-1].split("<|im_end|>")[0].strip() if len(parts) > 1 else full_response
+        else:
+            generated_part = full_response
+    print(f"🔍 Extracted generated part: {generated_part}")
+    # ✅ Clean the response but keep it intact
+    final_answer = clean_response(generated_part)
+    print(f"🔍 Final cleaned answer: {final_answer}")
     # ✅ OpenAI-style Response
     return {
                 },
                 "finish_reason": "stop"
             }
+        ],
+        "usage": {
+            "prompt_tokens": len(inputs.input_ids[0]),
+            "completion_tokens": len(outputs[0]) - len(inputs.input_ids[0]),
+            "total_tokens": len(outputs[0])
+        }
     }