Spaces:

Echo-AI-official
/

Deepseek-R1-1.5b-API

Running

App Files Community

Echo-ai committed on Feb 3

Commit

f023e65

verified ·

1 Parent(s): 3ddd5b6

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -16

app.py CHANGED Viewed

@@ -2,20 +2,21 @@ import os
 import requests
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
 MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
 MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
 os.makedirs(MODEL_DIR, exist_ok=True)
 if not os.path.exists(MODEL_PATH):
     print(f"Downloading model from {MODEL_URL}...")
     response = requests.get(MODEL_URL, stream=True)
@@ -29,8 +30,12 @@ if not os.path.exists(MODEL_PATH):
 else:
     print("Model already exists. Skipping download.")
-app = FastAPI(title="DeepSeek-R1 OpenAI-Compatible API")
 # CORS Configuration
 app.add_middleware(
@@ -40,21 +45,76 @@ app.add_middleware(
     allow_headers=["*"],
 )
 print("Loading model...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048,
-        n_threads=4,
-        n_gpu_layers=0,
         verbose=False
     )
     print("Model loaded successfully!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
 class ChatCompletionRequest(BaseModel):
     model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
     messages: list[dict]
@@ -63,7 +123,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: float = 0.9
     stream: bool = False
 class ChatCompletionResponse(BaseModel):
     id: str = "chatcmpl-12345"
     object: str = "chat.completion"
@@ -72,14 +132,12 @@ class ChatCompletionResponse(BaseModel):
     choices: list[dict]
     usage: dict
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
     try:
         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
         prompt += "\nassistant:"
         response = llm(
             prompt=prompt,
             max_tokens=request.max_tokens,
@@ -88,7 +146,6 @@ async def chat_completion(request: ChatCompletionRequest):
             stop=["</s>"]
         )
         return ChatCompletionResponse(
             choices=[{
                 "index": 0,
@@ -107,11 +164,9 @@ async def chat_completion(request: ChatCompletionRequest):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/health")
 def health_check():
     return {"status": "healthy"}
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import requests
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
+# Configuration
 MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
 MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
+# Create model directory if it doesn't exist
 os.makedirs(MODEL_DIR, exist_ok=True)
+# Download the model if it doesn't exist
 if not os.path.exists(MODEL_PATH):
     print(f"Downloading model from {MODEL_URL}...")
     response = requests.get(MODEL_URL, stream=True)
 else:
     print("Model already exists. Skipping download.")
+# Initialize FastAPI
+app = FastAPI(
+    title="DeepSeek-R1 OpenAI-Compatible API",
+    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
+    version="1.0.0"
+)
 # CORS Configuration
 app.add_middleware(
     allow_headers=["*"],
 )
+# Load the model
 print("Loading model...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
+        n_ctx=2048,
+        n_threads=4,
+        n_gpu_layers=0,
         verbose=False
     )
     print("Model loaded successfully!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
+# Root endpoint with documentation
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    return f"""
+    <html>
+        <head>
+            <title>DeepSeek-R1 OpenAI API</title>
+            <style>
+                body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
+                .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
+                a {{ color: #007bff; text-decoration: none; }}
+                code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
+            </style>
+        </head>
+        <body>
+            <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
+            <div class="warning">
+                <h3>⚠️ Important Notice</h3>
+                <p>For private use, please duplicate this space:<br>
+                1. Click your profile picture in the top-right<br>
+                2. Select "Duplicate Space"<br>
+                3. Set visibility to Private</p>
+            </div>
+            <h2>API Documentation</h2>
+            <ul>
+                <li><a href="/docs">Interactive Swagger Documentation</a></li>
+                <li><a href="/redoc">ReDoc Documentation</a></li>
+            </ul>
+            <h2>API Endpoints</h2>
+            <h3>Chat Completion</h3>
+            <p><code>POST /v1/chat/completions</code></p>
+            <p>Parameters:</p>
+            <ul>
+                <li><strong>messages</strong>: List of message objects</li>
+                <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
+                <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
+                <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
+            </ul>
+            <h2>Example Request</h2>
+            <pre>
+curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
+-H "Content-Type: application/json" \\
+-d '{{
+  "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
+  "max_tokens": 150
+}}'
+            </pre>
+        </body>
+    </html>
+    """
+# OpenAI-Compatible Request Schema
 class ChatCompletionRequest(BaseModel):
     model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
     messages: list[dict]
     top_p: float = 0.9
     stream: bool = False
+# OpenAI-Compatible Response Schema
 class ChatCompletionResponse(BaseModel):
     id: str = "chatcmpl-12345"
     object: str = "chat.completion"
     choices: list[dict]
     usage: dict
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
     try:
         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
         prompt += "\nassistant:"
         response = llm(
             prompt=prompt,
             max_tokens=request.max_tokens,
             stop=["</s>"]
         )
         return ChatCompletionResponse(
             choices=[{
                 "index": 0,
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/health")
 def health_check():
     return {"status": "healthy"}
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)