import os
import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel
import uvicorn
# Configuration
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
# Create model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)
# Download the model if it doesn't exist
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")
# Initialize FastAPI
app = FastAPI(
title="DeepSeek-R1 OpenAI-Compatible API",
description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
version="1.0.0"
)
# CORS Configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Load the model
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,
        verbose=False
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")
# Root endpoint with documentation
@app.get("/", response_class=HTMLResponse)
async def root():
return f"""
<html>
<head>
<title>DeepSeek-R1 OpenAI API</title>
<style>
body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
.warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
a {{ color: #007bff; text-decoration: none; }}
code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
</style>
</head>
<body>
<h1>DeepSeek-R1 OpenAI-Compatible API</h1>
<div class="warning">
<h3>⚠️ Important Notice</h3>
<p>For private use, please duplicate this space:<br>
1. Click your profile picture in the top-right<br>
2. Select "Duplicate Space"<br>
3. Set visibility to Private</p>
</div>
<h2>API Documentation</h2>
<ul>
<li><a href="/docs">Interactive Swagger Documentation</a></li>
<li><a href="/redoc">ReDoc Documentation</a></li>
</ul>
<h2>API Endpoints</h2>
<h3>Chat Completion</h3>
<p><code>POST /v1/chat/completions</code></p>
<p>Parameters:</p>
<ul>
<li><strong>messages</strong>: List of message objects</li>
<li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
<li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
<li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
</ul>
<h2>Example Request</h2>
<pre>
curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
-H "Content-Type: application/json" \\
-d '{{
"messages": [{{"role": "user", "content": "Explain quantum computing"}}],
"max_tokens": 150
}}'
</pre>
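            <p>Equivalent Python example (a minimal sketch using the <code>requests</code>
            package, which this app already imports; adjust the URL if your deployment
            differs):</p>
            <pre>
import requests

resp = requests.post(
    "{base_url}/v1/chat/completions",
    json={{
        "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
        "max_tokens": 150
    }},
)
print(resp.json()["choices"][0]["message"]["content"])
            </pre>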
        </body>
    </html>
    """
# OpenAI-Compatible Request Schema
class ChatCompletionRequest(BaseModel):
model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
messages: list[dict]
max_tokens: int = 128
temperature: float = 0.7
top_p: float = 0.9
stream: bool = False
# OpenAI-Compatible Response Schema
class ChatCompletionResponse(BaseModel):
id: str = "chatcmpl-12345"
object: str = "chat.completion"
created: int = 1693161600
model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
choices: list[dict]
usage: dict
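# Example request body accepted by the endpoint below (a hypothetical illustration;
# every field except "messages" falls back to the defaults declared above):
#   {
#     "messages": [{"role": "user", "content": "Hello"}],
#     "max_tokens": 64,
#     "temperature": 0.7,
#     "top_p": 0.9
#   }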
@app.post("/v1/chat/completions")
async def chat_completion(request: ChatCompletionRequest):
    try:
        # Flatten the chat history into a simple role-prefixed prompt for the model.
        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
        prompt += "\nassistant:"
        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=["</s>"]
        )
        return ChatCompletionResponse(
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response['choices'][0]['text'].strip()
                },
                "finish_reason": "stop"
            }],
            # Prefer the token counts reported by llama.cpp; fall back to rough
            # character-based estimates if they are missing.
            usage=response.get("usage") or {
                "prompt_tokens": len(prompt),
                "completion_tokens": len(response['choices'][0]['text']),
                "total_tokens": len(prompt) + len(response['choices'][0]['text'])
            }
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
def health_check():
return {"status": "healthy"}
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)