Echo-ai committed
Commit b839d79 · verified · 1 parent: f023e65

Update app.py

Files changed (1)
  1. app.py +100 -51
app.py CHANGED
@@ -1,15 +1,18 @@
 import os
 import requests
+import time
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import HTMLResponse
+from fastapi.responses import StreamingResponse, HTMLResponse
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
+from typing import Generator
+import threading
 
 # Configuration
-MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
-MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
+MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"  # Changed to Q4 for faster inference
+MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
 
@@ -33,8 +36,8 @@ else:
 # Initialize FastAPI
 app = FastAPI(
     title="DeepSeek-R1 OpenAI-Compatible API",
-    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
-    version="1.0.0"
+    description="Optimized OpenAI-compatible API with streaming support",
+    version="2.0.0"
 )
 
 # CORS Configuration
@@ -45,36 +48,68 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Load the model
-print("Loading model...")
+# Global model loader with optimized settings
+print("Loading model with optimized settings...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048,
-        n_threads=4,
+        n_ctx=1024,      # Reduced context window for faster processing
+        n_threads=8,     # Increased threads for better CPU utilization
+        n_batch=512,     # Larger batch size for improved throughput
         n_gpu_layers=0,
+        use_mlock=True,  # Prevent swapping to disk
         verbose=False
     )
-    print("Model loaded successfully!")
+    print("Model loaded with optimized settings!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
 
-# Root endpoint with documentation
+# Streaming generator
+def generate_stream(prompt: str, max_tokens: int, temperature: float, top_p: float) -> Generator[str, None, None]:
+    start_time = time.time()
+    stream = llm.create_completion(
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stop=["</s>"],
+        stream=True
+    )
+
+    for chunk in stream:
+        delta = chunk['choices'][0]['text']
+        yield f"data: {delta}\n\n"
+
+        # Early stopping if taking too long
+        if time.time() - start_time > 30:  # 30s timeout
+            break
+
+# OpenAI-Compatible Request Schema
+class ChatCompletionRequest(BaseModel):
+    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+    messages: list[dict]
+    max_tokens: int = 256
+    temperature: float = 0.7
+    top_p: float = 0.9
+    stream: bool = False
+
+# Enhanced root endpoint with performance info
 @app.get("/", response_class=HTMLResponse)
 async def root():
     return f"""
     <html>
     <head>
-        <title>DeepSeek-R1 OpenAI API</title>
+        <title>DeepSeek-R1 Optimized API</title>
         <style>
             body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
             .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
+            .info {{ color: #0c5460; background: #d1ecf1; padding: 15px; border-radius: 5px; }}
            a {{ color: #007bff; text-decoration: none; }}
            code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
        </style>
    </head>
    <body>
-        <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
+        <h1>DeepSeek-R1 Optimized API</h1>
 
        <div class="warning">
            <h3>⚠️ Important Notice</h3>
@@ -84,29 +119,29 @@ async def root():
            3. Set visibility to Private</p>
        </div>
 
+        <div class="info">
+            <h3>⚡ Performance Optimizations</h3>
+            <ul>
+                <li>Quantization: Q4_K_M (optimized speed/quality balance)</li>
+                <li>Batch processing: 512 tokens/chunk</li>
+                <li>Streaming support with 30s timeout</li>
+                <li>8 CPU threads utilization</li>
+            </ul>
+        </div>
+
        <h2>API Documentation</h2>
        <ul>
            <li><a href="/docs">Interactive Swagger Documentation</a></li>
            <li><a href="/redoc">ReDoc Documentation</a></li>
        </ul>
 
-        <h2>API Endpoints</h2>
-        <h3>Chat Completion</h3>
-        <p><code>POST /v1/chat/completions</code></p>
-        <p>Parameters:</p>
-        <ul>
-            <li><strong>messages</strong>: List of message objects</li>
-            <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
-            <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
-            <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
-        </ul>
-
-        <h2>Example Request</h2>
+        <h2>Example Streaming Request</h2>
        <pre>
-curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
+curl -N -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
    -H "Content-Type: application/json" \\
    -d '{{
        "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
+        "stream": true,
        "max_tokens": 150
    }}'
        </pre>
@@ -114,30 +149,26 @@ curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/co
    </html>
    """
 
-# OpenAI-Compatible Request Schema
-class ChatCompletionRequest(BaseModel):
-    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
-    messages: list[dict]
-    max_tokens: int = 128
-    temperature: float = 0.7
-    top_p: float = 0.9
-    stream: bool = False
-
-# OpenAI-Compatible Response Schema
-class ChatCompletionResponse(BaseModel):
-    id: str = "chatcmpl-12345"
-    object: str = "chat.completion"
-    created: int = 1693161600
-    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
-    choices: list[dict]
-    usage: dict
-
+# Async endpoint handler
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
     try:
         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
         prompt += "\nassistant:"
 
+        if request.stream:
+            return StreamingResponse(
+                generate_stream(
+                    prompt=prompt,
+                    max_tokens=request.max_tokens,
+                    temperature=request.temperature,
+                    top_p=request.top_p
+                ),
+                media_type="text/event-stream"
+            )
+
+        # Non-streaming response
+        start_time = time.time()
         response = llm(
             prompt=prompt,
             max_tokens=request.max_tokens,
@@ -146,8 +177,12 @@ async def chat_completion(request: ChatCompletionRequest):
             stop=["</s>"]
         )
 
-        return ChatCompletionResponse(
-            choices=[{
+        return {
+            "id": f"chatcmpl-{int(time.time())}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": request.model,
+            "choices": [{
                 "index": 0,
                 "message": {
                     "role": "assistant",
@@ -155,18 +190,32 @@ async def chat_completion(request: ChatCompletionRequest):
                 },
                 "finish_reason": "stop"
             }],
-            usage={
+            "usage": {
                 "prompt_tokens": len(prompt),
                 "completion_tokens": len(response['choices'][0]['text']),
                 "total_tokens": len(prompt) + len(response['choices'][0]['text'])
             }
-        )
+        }
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/health")
-def health_check():
-    return {"status": "healthy"}
+async def health_check():
+    return {
+        "status": "healthy",
+        "model_loaded": True,
+        "performance_settings": {
+            "n_threads": llm.params.n_threads,
+            "n_ctx": llm.params.n_ctx,
+            "n_batch": llm.params.n_batch
+        }
+    }
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=7860,
+        timeout_keep_alive=300  # Keep alive for streaming connections
+    )
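
For quick testing of the updated endpoints, a minimal client sketch follows. It is not part of the commit: the base URL and the chat / chat_stream helper names are illustrative, and it assumes the server emits plain "data: <text>" SSE lines exactly as generate_stream above does (raw text deltas rather than OpenAI-style JSON chunks), so it reads the stream with requests directly instead of an OpenAI client.

# Hypothetical client sketch for the endpoints added in this commit (not part of app.py).
# Assumes a local uvicorn instance; point BASE_URL at the Space host when deployed.
import requests

BASE_URL = "http://localhost:7860"  # assumption: server from this diff running locally

def chat(prompt: str, max_tokens: int = 150) -> str:
    """Non-streaming call; the handler returns an OpenAI-shaped chat.completion body."""
    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
        },
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

def chat_stream(prompt: str, max_tokens: int = 150):
    """Streaming call; yields raw text deltas from the 'data: <text>' SSE lines."""
    with requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "stream": True,
        },
        stream=True,
        timeout=120,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if line and line.startswith("data: "):
                yield line[len("data: "):]

if __name__ == "__main__":
    print(chat("Explain quantum computing in one sentence."))
    for delta in chat_stream("Explain quantum computing"):
        print(delta, end="", flush=True)
    print()

Note that the usage block in the non-streaming response counts characters (len of the prompt and completion strings), not model tokens, so treat those numbers as rough estimates.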