Ais
committed
Update app/main.py
app/main.py  +208 -187
app/main.py
CHANGED
@@ -1,190 +1,211 @@
- … (old app/main.py, lines 1–64: Apollo AI VS Code extension client, TypeScript; content not shown)
-   static getHistory() {
-     return [...this.conversationHistory];
-   }
- }
-
- export async function askAI(prompt: string, options: {
-   temperature?: number;
-   maxTokens?: number;
-   includeContext?: boolean;
-   retries?: number;
-   forceMode?: boolean;
- } = {}): Promise<string> {
-   const {
-     temperature = 0.7,
-     maxTokens = 1500,
-     includeContext = true,
-     retries = MAX_RETRIES,
-     forceMode = false
-   } = options;
-
-   console.log('🤖 Apollo AI: Starting request for prompt:', prompt.substring(0, 100) + '...');
-   console.log('🔧 Force mode:', forceMode);
-
-   // Build messages array for proper conversation
-   const messages = [];
-
-   // ✅ FIXED: Much simpler system messages
-   if (forceMode) {
-     messages.push({
-       role: 'system',
-       content: 'Give direct, brief answers only. No explanations.'
-     });
-   } else {
-     messages.push({
-       role: 'system',
-       content: 'You are a helpful assistant.'
-     });
-   }
-
-   // Add conversation history (only if includeContext is true and we have history)
-   if (includeContext && ApolloAI.getHistory().length > 0) {
-     const history = ApolloAI.getHistory().slice(-2); // Last 2 messages only
-     for (const msg of history) {
-       messages.push({
-         role: msg.role,
-         content: msg.content
-       });
-     }
-   }
-
-   // Add current user message
-   messages.push({
-     role: 'user',
-     content: prompt
-   });
-
-   // Add VS Code context if available (but not in conversation history)
-   const editor = vscode.window.activeTextEditor;
-   if (includeContext && editor && !forceMode) {
-     const fileName = editor.document.fileName.split(/[/\\]/).pop();
-     const language = editor.document.languageId;
-     messages[messages.length - 1].content += `\n\n[VS Code Context: Editing ${fileName} (${language})]`;
-   }
-
-   const headers = {
-     'Authorization': `Bearer ${API_KEY}`,
-     'Content-Type': 'application/json',
-     'User-Agent': 'Apollo-AI-VSCode-Extension/1.2.0'
-   };
-
-   const body = {
-     messages: messages,
-     temperature: forceMode ? 0.3 : temperature, // Lower temperature for force mode
-     max_tokens: forceMode ? 200 : maxTokens,    // Much shorter responses for force mode
-     stream: false
-   };
-
-   for (let attempt = 1; attempt <= retries; attempt++) {
-     try {
-       const fetchImpl = await getFetch();
-
-       console.log(`🚀 Apollo AI: Attempt ${attempt}/${retries}, sending request to API...`);
-       console.log('📤 Request body:', JSON.stringify(body, null, 2));
-
-       const controller = new AbortController();
-       const timeoutId = setTimeout(() => controller.abort(), TIMEOUT_MS);
-
-       const res = await fetchImpl(API_URL, {
-         method: 'POST',
-         headers,
-         body: JSON.stringify(body),
-         signal: controller.signal
-       });
-
-       clearTimeout(timeoutId);
-
-       console.log('📨 Apollo AI: Received response, status:', res.status);
-
-       if (!res.ok) {
-         const errorText = await res.text().catch(() => 'Unable to read error response');
-         console.error(`❌ Apollo AI: API Error ${res.status}: ${errorText}`);
-         // … (old lines 166–168 not shown)
-       } else if (res.status === 401) {
-         throw new Error('🔑 Authentication failed. Please check your API key.');
-       } else if (res.status >= 500) {
-         throw new Error('🔧 Server error. The AI service is temporarily unavailable.');
-       }
- … (old lines 174–190: remainder of askAI not shown)
+ import os
+ import torch
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+ from starlette.middleware.cors import CORSMiddleware
+
+ # === Setup FastAPI ===
+ app = FastAPI()
+
+ # === CORS (optional for frontend access) ===
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # === Load API Key from Hugging Face Secrets ===
+ API_KEY = os.getenv("API_KEY", "undefined")
+
+ # === Model Settings ===
+ BASE_MODEL = "Qwen/Qwen2-0.5B-Instruct"
+ ADAPTER_PATH = "adapter"
+
+ print("🔧 Loading tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+
+ print("🧠 Loading base model on CPU...")
+ base_model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL,
+     trust_remote_code=True,
+     torch_dtype=torch.float32
+ ).cpu()
+
+ print("🔗 Applying LoRA adapter...")
+ model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).cpu()
+ model.eval()
+
+ print("✅ Model and adapter loaded successfully.")
+
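Since inference runs entirely on CPU, one optional variation on the loading step above (a sketch of an alternative, not something this commit does) is to fold the LoRA weights into the base model once at startup with PEFT's merge_and_unload(), so each forward pass skips the adapter indirection:

    # Alternative sketch (not part of this commit): merge the LoRA weights into the
    # base model at startup. merge_and_unload() returns a plain model with the adapter
    # weights baked in, avoiding per-call adapter overhead on CPU.
    model = PeftModel.from_pretrained(base_model, ADAPTER_PATH).merge_and_unload()
    model.eval()
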
+ # === Root Route ===
+ @app.get("/")
+ def root():
+     return {"message": "🧠 Qwen2.5-0.5B-Instruct API is running on CPU!"}
+
+ # === Chat Completion API ===
+ @app.post("/v1/chat/completions")
+ async def chat(request: Request):
+     # ✅ API Key Authorization
+     auth_header = request.headers.get("Authorization", "")
+     if not auth_header.startswith("Bearer "):
+         return JSONResponse(status_code=401, content={"error": "Missing Bearer token in Authorization header."})
+
+     token = auth_header.replace("Bearer ", "").strip()
+     if token != API_KEY:
+         return JSONResponse(status_code=401, content={"error": "Invalid API key."})
+
+     # ✅ Parse Request
+     try:
+         body = await request.json()
+         messages = body.get("messages", [])
+         if not messages or not isinstance(messages, list):
+             raise ValueError("Invalid or missing 'messages' field.")
+
+         temperature = body.get("temperature", 0.7)
+         max_tokens = body.get("max_tokens", 512)
+
+     except Exception as e:
+         return JSONResponse(status_code=400, content={"error": f"Bad request: {str(e)}"})
+
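A small hardening option for the key check above (an assumption on my part, not part of this diff) is a constant-time comparison, so the comparison itself cannot leak key prefixes through timing:

    import secrets

    # Hardening sketch (not in this commit): constant-time API-key comparison.
    if not secrets.compare_digest(token, API_KEY):
        return JSONResponse(status_code=401, content={"error": "Invalid API key."})
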
+     # ✅ FIXED: Only use last 4 messages to prevent stacking
+     recent_messages = messages[-4:] if len(messages) > 4 else messages
+
+     # ✅ Build clean conversation prompt
+     formatted_prompt = ""
+
+     for message in recent_messages:
+         role = message.get("role", "")
+         content = message.get("content", "")
+
+         if role == "system":
+             formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
+         elif role == "user":
+             formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
+         elif role == "assistant":
+             formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
+
+     # Add the assistant start token for generation
+     formatted_prompt += "<|im_start|>assistant\n"
+
+     print(f"🤖 Processing {len(recent_messages)} recent messages")
+
+     inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cpu")
+
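The string building above hand-rolls Qwen2's ChatML format. If the tokenizer ships a chat template (Qwen instruct tokenizers normally do), the same prompt can be produced with transformers' apply_chat_template; a sketch of the equivalent call, not what this commit uses:

    # Sketch (not in this commit): let the tokenizer render the ChatML prompt.
    prompt_text = tokenizer.apply_chat_template(
        recent_messages,             # same list of {"role": ..., "content": ...} dicts
        tokenize=False,              # return a string instead of token ids
        add_generation_prompt=True   # appends the trailing "<|im_start|>assistant\n"
    )
    inputs = tokenizer(prompt_text, return_tensors="pt")
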
+     # ✅ Generate Response
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_tokens,
+             temperature=temperature,
+             top_p=0.9,
+             do_sample=True,
+             pad_token_id=tokenizer.eos_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
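Because generate() returns the prompt tokens followed by the continuation, the decoded string still contains the ChatML scaffolding, which is what the cleanup below has to strip. A common alternative (a sketch, not part of this commit) is to decode only the newly generated tokens:

    # Sketch (not in this commit): decode only the tokens produced after the prompt,
    # so the prompt text and role markers never appear in the output string.
    prompt_len = inputs["input_ids"].shape[-1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
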
+     # ✅ MUCH BETTER: Extract only the final assistant response
+     if "<|im_start|>assistant\n" in decoded:
+         # Get everything after the LAST assistant token
+         parts = decoded.split("<|im_start|>assistant\n")
+         final_answer = parts[-1].strip()
+     else:
+         # Fallback if no assistant token found
+         final_answer = decoded.strip()
+
+     # Remove end token
+     if "<|im_end|>" in final_answer:
+         final_answer = final_answer.split("<|im_end|>")[0].strip()
+
+     # ✅ CRITICAL: Remove conversation artifacts that leak through
+     # Remove user/assistant role labels that appear in content
+     final_answer = final_answer.replace("user\n", "").replace("assistant\n", "")
+
+     # Remove repeated questions and conversation artifacts
+     lines = final_answer.split('\n')
+     cleaned_lines = []
+     seen_content = set()
+     found_answer = False
+
+     for line in lines:
+         line = line.strip()
+
+         # Skip empty lines at the start
+         if not line and not found_answer:
+             continue
+
+         # Skip if this exact line was seen before (removes repeats)
+         if line in seen_content:
+             continue
+
+         # Skip lines that look like user prompts being repeated
+         if line.endswith('?') and len(line) < 100 and not found_answer:
+             print(f"🚫 Skipping repeated question: {line}")
+             continue
+
+         # Skip role indicators
+         if line in ['user', 'assistant', 'system']:
+             continue
+
+         # Skip conversation tokens
+         if '<|im_start|>' in line or '<|im_end|>' in line:
+             continue
+
+         # If we get here, this looks like actual content
+         found_answer = True
+         cleaned_lines.append(line)
+         seen_content.add(line)
+
+     final_answer = '\n'.join(cleaned_lines).strip()
+
+     # Remove VS Code context if it leaked through
+     if "[VS Code Context:" in final_answer:
+         context_lines = final_answer.split('\n')
+         cleaned_context_lines = [line for line in context_lines if not line.strip().startswith('[VS Code Context:')]
+         final_answer = '\n'.join(cleaned_context_lines).strip()
+
+     # Remove system prompts that leaked through
+     system_indicators = [
+         "Guidelines:",
+         "Response format:",
+         "You are a helpful",
+         "I'm here to help",
+         "system\n",
+         "assistant\n",
+         "user\n"
+     ]
+
+     for indicator in system_indicators:
+         if indicator in final_answer:
+             final_answer = final_answer.split(indicator)[0].strip()
+
+     # Clean up extra whitespace
+     final_answer = final_answer.replace('\n\n\n', '\n\n').strip()
+
+     # Ensure we have some content
+     if not final_answer or len(final_answer.strip()) < 3:
+         final_answer = "I apologize, but I couldn't generate a proper response. Please try again."
+
+     print(f"✅ Clean response: {final_answer[:100]}...")
+
+     # ✅ OpenAI-style Response
+     return {
+         "id": "chatcmpl-local-001",
+         "object": "chat.completion",
+         "model": "Qwen2.5-0.5B-Instruct-LoRA",
+         "choices": [
+             {
+                 "index": 0,
+                 "message": {
+                     "role": "assistant",
+                     "content": final_answer
+                 },
+                 "finish_reason": "stop"
+             }
+         ]
+     }
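For reference, the new endpoint accepts the same OpenAI-style shape the removed TypeScript client was already sending: a Bearer token plus a JSON body with messages, temperature and max_tokens. A minimal client sketch (the base URL and key below are placeholders, not values from the repo):

    import requests

    BASE_URL = "http://localhost:7860"   # placeholder: wherever the Space/app is served
    API_KEY = "your-api-key"             # placeholder: the real value of the API_KEY secret

    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What does a LoRA adapter do?"}
            ],
            "temperature": 0.7,
            "max_tokens": 256,
        },
        timeout=120,
    )
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])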