import os

import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel
import uvicorn

# Configuration
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

# Create the model directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Download the model if it doesn't exist
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")

# Initialize FastAPI
app = FastAPI(
    title="DeepSeek-R1 OpenAI-Compatible API",
    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
    version="1.0.0",
)

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=4,
        n_gpu_layers=0,  # CPU-only inference
        verbose=False,
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")


# Root endpoint with documentation
@app.get("/", response_class=HTMLResponse)
async def root():
    return f"""
    <html>
    <body>
    <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
    <p>For private use, please duplicate this space:</p>
    <ol>
        <li>Click your profile picture in the top-right</li>
        <li>Select "Duplicate Space"</li>
        <li>Set visibility to Private</li>
    </ol>
    <h2>Endpoint</h2>
    <p><code>POST /v1/chat/completions</code></p>
    <p>Parameters:</p>
    <ul>
        <li><code>messages</code>: list of chat messages, each with a "role" and "content" (required)</li>
        <li><code>max_tokens</code>: maximum number of tokens to generate (default 128)</li>
        <li><code>temperature</code>: sampling temperature (default 0.7)</li>
        <li><code>top_p</code>: nucleus sampling threshold (default 0.9)</li>
    </ul>
curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\ -H "Content-Type: application/json" \\ -d '{{ "messages": [{{"role": "user", "content": "Explain quantum computing"}}], "max_tokens": 150 }}'""" # OpenAI-Compatible Request Schema class ChatCompletionRequest(BaseModel): model: str = "DeepSeek-R1-Distill-Qwen-1.5B" messages: list[dict] max_tokens: int = 128 temperature: float = 0.7 top_p: float = 0.9 stream: bool = False # OpenAI-Compatible Response Schema class ChatCompletionResponse(BaseModel): id: str = "chatcmpl-12345" object: str = "chat.completion" created: int = 1693161600 model: str = "DeepSeek-R1-Distill-Qwen-1.5B" choices: list[dict] usage: dict @app.post("/v1/chat/completions") async def chat_completion(request: ChatCompletionRequest): try: prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages]) prompt += "\nassistant:" response = llm( prompt=prompt, max_tokens=request.max_tokens, temperature=request.temperature, top_p=request.top_p, stop=[""] ) return ChatCompletionResponse( choices=[{ "index": 0, "message": { "role": "assistant", "content": response['choices'][0]['text'].strip() }, "finish_reason": "stop" }], usage={ "prompt_tokens": len(prompt), "completion_tokens": len(response['choices'][0]['text']), "total_tokens": len(prompt) + len(response['choices'][0]['text']) } ) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @app.get("/health") def health_check(): return {"status": "healthy"} if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=7860)