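# FastAPI server that wraps a llama.cpp (GGUF) build of DeepSeek-R1-Distill-Qwen-1.5B
# behind an OpenAI-compatible /v1/chat/completions endpoint.
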
import os
import time
import uuid

import requests
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from llama_cpp import Llama
from pydantic import BaseModel, Field


MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)


os.makedirs(MODEL_DIR, exist_ok=True)


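# Fetch the quantized GGUF weights on first startup; later runs reuse the cached file.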
if not os.path.exists(MODEL_PATH):
    print(f"Downloading model from {MODEL_URL}...")
    response = requests.get(MODEL_URL, stream=True)
    if response.status_code == 200:
        with open(MODEL_PATH, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Model downloaded successfully!")
    else:
        raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
else:
    print("Model already exists. Skipping download.")


app = FastAPI(title="DeepSeek-R1 OpenAI-Compatible API")

# CORS Configuration
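# Wildcard origins keep the demo easy to call from anywhere; tighten these for production use.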
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,       # context window size in tokens
        n_threads=4,      # CPU threads used for inference
        n_gpu_layers=0,   # 0 = run entirely on CPU
        verbose=False,
    )
    print("Model loaded successfully!")
except Exception as e:
    raise RuntimeError(f"Failed to load model: {str(e)}")


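# Minimal request/response schemas mirroring the OpenAI chat-completions payload shapes.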
class ChatCompletionRequest(BaseModel):
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    messages: list[dict]
    max_tokens: int = 128
    temperature: float = 0.7
    top_p: float = 0.9
    stream: bool = False  # accepted for OpenAI-client compatibility; streaming is not implemented here


class ChatCompletionResponse(BaseModel):
    # Generate a fresh id and timestamp per response rather than hard-coded placeholders
    id: str = Field(default_factory=lambda: f"chatcmpl-{uuid.uuid4().hex}")
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    choices: list[dict]
    usage: dict

@app.post("/v1/chat/completions")
async def chat_completion(request: ChatCompletionRequest):
    try:
        # Flatten the chat history into a simple role-prefixed prompt
        prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in request.messages)
        prompt += "\nassistant:"

        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=["</s>"]
        )

        
        # Repackage the llama.cpp completion into an OpenAI-style response
        choice = response['choices'][0]
        return ChatCompletionResponse(
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": choice['text'].strip()
                },
                "finish_reason": choice.get('finish_reason', 'stop')
            }],
            # Use llama-cpp-python's own token accounting instead of character lengths
            usage=response['usage']
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
def health_check():
    return {"status": "healthy"}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
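
# Example request against the running server (a sketch; assumes localhost and port 7860):
#
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 64}'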