Echo-ai committed
Commit 3ddd5b6 · verified · 1 Parent(s): 3ba6e2c

Create app.py

Files changed (1):
  app.py +117 -0
app.py ADDED
@@ -0,0 +1,117 @@
+ import os
+ import requests
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from llama_cpp import Llama
+ from pydantic import BaseModel
+ import uvicorn
+
+
+ MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
+ MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
+ MODEL_DIR = "model"
+ MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
+
+
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+
+ # Download the GGUF weights on first start; skip if already cached on disk.
+ if not os.path.exists(MODEL_PATH):
+     print(f"Downloading model from {MODEL_URL}...")
+     response = requests.get(MODEL_URL, stream=True, timeout=60)  # timeout added for robustness
+     if response.status_code == 200:
+         with open(MODEL_PATH, "wb") as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 f.write(chunk)
+         print("Model downloaded successfully!")
+     else:
+         raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
+ else:
+     print("Model already exists. Skipping download.")
+
+
+ app = FastAPI(title="DeepSeek-R1 OpenAI-Compatible API")
+
+ # CORS configuration: allow any origin so browser clients can reach the API.
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ # Load the model once at startup: CPU-only (n_gpu_layers=0) with a 2048-token context.
+ print("Loading model...")
+ try:
+     llm = Llama(
+         model_path=MODEL_PATH,
+         n_ctx=2048,
+         n_threads=4,
+         n_gpu_layers=0,
+         verbose=False
+     )
+     print("Model loaded successfully!")
+ except Exception as e:
+     raise RuntimeError(f"Failed to load model: {str(e)}")
+
+
+ class ChatCompletionRequest(BaseModel):
+     model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+     messages: list[dict]
+     max_tokens: int = 128
+     temperature: float = 0.7
+     top_p: float = 0.9
+     stream: bool = False  # accepted for client compatibility; streaming is not implemented
+
+
+ class ChatCompletionResponse(BaseModel):
+     id: str = "chatcmpl-12345"  # static placeholder ID
+     object: str = "chat.completion"
+     created: int = 1693161600  # static placeholder timestamp
+     model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+     choices: list[dict]
+     usage: dict
+
+
+ @app.post("/v1/chat/completions")
+ async def chat_completion(request: ChatCompletionRequest):
+     try:
+         # Flatten the chat history into a plain role-prefixed prompt.
+         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
+         prompt += "\nassistant:"
+
+         response = llm(
+             prompt=prompt,
+             max_tokens=request.max_tokens,
+             temperature=request.temperature,
+             top_p=request.top_p,
+             stop=["</s>"]
+         )
+
+         return ChatCompletionResponse(
+             choices=[{
+                 "index": 0,
+                 "message": {
+                     "role": "assistant",
+                     "content": response['choices'][0]['text'].strip()
+                 },
+                 "finish_reason": "stop"
+             }],
+             # Note: these are character counts, not true token counts.
+             usage={
+                 "prompt_tokens": len(prompt),
+                 "completion_tokens": len(response['choices'][0]['text']),
+                 "total_tokens": len(prompt) + len(response['choices'][0]['text'])
+             }
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/health")
+ def health_check():
+     return {"status": "healthy"}
+
+
+ if __name__ == "__main__":
+     # Port 7860 is the default for Hugging Face Spaces.
+     uvicorn.run(app, host="0.0.0.0", port=7860)