Echo-ai committed
Commit b839d79 · verified · 1 parent: f023e65

Update app.py

Files changed (1)
  1. app.py +100 -51
app.py CHANGED
@@ -1,15 +1,18 @@
 import os
 import requests
+import time
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import HTMLResponse
+from fastapi.responses import StreamingResponse, HTMLResponse
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
+from typing import Generator
+import threading
 
 # Configuration
-MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
-MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
+MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"  # Changed to Q4 for faster inference
+MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
 
@@ -33,8 +36,8 @@ else:
 # Initialize FastAPI
 app = FastAPI(
     title="DeepSeek-R1 OpenAI-Compatible API",
-    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
-    version="1.0.0"
+    description="Optimized OpenAI-compatible API with streaming support",
+    version="2.0.0"
 )
 
 # CORS Configuration
@@ -45,36 +48,68 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Load the model
-print("Loading model...")
+# Global model loader with optimized settings
+print("Loading model with optimized settings...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=2048,
-        n_threads=4,
+        n_ctx=1024,      # Reduced context window for faster processing
+        n_threads=8,     # Increased threads for better CPU utilization
+        n_batch=512,     # Larger batch size for improved throughput
         n_gpu_layers=0,
+        use_mlock=True,  # Prevent swapping to disk
         verbose=False
     )
-    print("Model loaded successfully!")
+    print("Model loaded with optimized settings!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
 
-# Root endpoint with documentation
+# Streaming generator
+def generate_stream(prompt: str, max_tokens: int, temperature: float, top_p: float) -> Generator[str, None, None]:
+    start_time = time.time()
+    stream = llm.create_completion(
+        prompt=prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stop=["</s>"],
+        stream=True
+    )
+
+    for chunk in stream:
+        delta = chunk['choices'][0]['text']
+        yield f"data: {delta}\n\n"
+
+        # Early stopping if taking too long
+        if time.time() - start_time > 30:  # 30s timeout
+            break
+
+# OpenAI-Compatible Request Schema
+class ChatCompletionRequest(BaseModel):
+    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+    messages: list[dict]
+    max_tokens: int = 256
+    temperature: float = 0.7
+    top_p: float = 0.9
+    stream: bool = False
+
+# Enhanced root endpoint with performance info
 @app.get("/", response_class=HTMLResponse)
 async def root():
     return f"""
     <html>
     <head>
-        <title>DeepSeek-R1 OpenAI API</title>
+        <title>DeepSeek-R1 Optimized API</title>
         <style>
             body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
             .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
+            .info {{ color: #0c5460; background: #d1ecf1; padding: 15px; border-radius: 5px; }}
            a {{ color: #007bff; text-decoration: none; }}
            code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
        </style>
    </head>
    <body>
-        <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
+        <h1>DeepSeek-R1 Optimized API</h1>
 
        <div class="warning">
            <h3>⚠️ Important Notice</h3>
@@ -84,29 +119,29 @@ async def root():
            3. Set visibility to Private</p>
        </div>
 
+        <div class="info">
+            <h3>⚡ Performance Optimizations</h3>
+            <ul>
+                <li>Quantization: Q4_K_M (optimized speed/quality balance)</li>
+                <li>Batch processing: 512 tokens/chunk</li>
+                <li>Streaming support with 30s timeout</li>
+                <li>8 CPU threads utilization</li>
+            </ul>
+        </div>
+
        <h2>API Documentation</h2>
        <ul>
            <li><a href="/docs">Interactive Swagger Documentation</a></li>
            <li><a href="/redoc">ReDoc Documentation</a></li>
        </ul>
 
-        <h2>API Endpoints</h2>
-        <h3>Chat Completion</h3>
-        <p><code>POST /v1/chat/completions</code></p>
-        <p>Parameters:</p>
-        <ul>
-            <li><strong>messages</strong>: List of message objects</li>
-            <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
-            <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
-            <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
-        </ul>
-
-        <h2>Example Request</h2>
+        <h2>Example Streaming Request</h2>
        <pre>
-curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
+curl -N -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
    -H "Content-Type: application/json" \\
    -d '{{
        "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
+        "stream": true,
        "max_tokens": 150
    }}'
        </pre>
@@ -114,30 +149,26 @@ curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/co
    </html>
    """
 
-# OpenAI-Compatible Request Schema
-class ChatCompletionRequest(BaseModel):
-    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
-    messages: list[dict]
-    max_tokens: int = 128
-    temperature: float = 0.7
-    top_p: float = 0.9
-    stream: bool = False
-
-# OpenAI-Compatible Response Schema
-class ChatCompletionResponse(BaseModel):
-    id: str = "chatcmpl-12345"
-    object: str = "chat.completion"
-    created: int = 1693161600
-    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
-    choices: list[dict]
-    usage: dict
-
+# Async endpoint handler
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
     try:
         prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
         prompt += "\nassistant:"
 
+        if request.stream:
+            return StreamingResponse(
+                generate_stream(
+                    prompt=prompt,
+                    max_tokens=request.max_tokens,
+                    temperature=request.temperature,
+                    top_p=request.top_p
+                ),
+                media_type="text/event-stream"
+            )
+
+        # Non-streaming response
+        start_time = time.time()
         response = llm(
             prompt=prompt,
             max_tokens=request.max_tokens,
@@ -146,8 +177,12 @@ async def chat_completion(request: ChatCompletionRequest):
             stop=["</s>"]
         )
 
-        return ChatCompletionResponse(
-            choices=[{
+        return {
+            "id": f"chatcmpl-{int(time.time())}",
+            "object": "chat.completion",
+            "created": int(time.time()),
+            "model": request.model,
+            "choices": [{
                 "index": 0,
                 "message": {
                     "role": "assistant",
@@ -155,18 +190,32 @@ async def chat_completion(request: ChatCompletionRequest):
                 },
                 "finish_reason": "stop"
             }],
-            usage={
+            "usage": {
                 "prompt_tokens": len(prompt),
                 "completion_tokens": len(response['choices'][0]['text']),
                 "total_tokens": len(prompt) + len(response['choices'][0]['text'])
             }
-        )
+        }
+
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/health")
-def health_check():
-    return {"status": "healthy"}
+async def health_check():
+    return {
+        "status": "healthy",
+        "model_loaded": True,
+        "performance_settings": {
+            "n_threads": llm.params.n_threads,
+            "n_ctx": llm.params.n_ctx,
+            "n_batch": llm.params.n_batch
+        }
+    }
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=7860,
+        timeout_keep_alive=300  # Keep alive for streaming connections
+    )
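
For quick testing of the updated endpoints, a minimal client sketch follows. It is not part of the commit: the base URL and the chat / chat_stream helper names are illustrative, and it assumes the server emits plain "data: <text>" SSE lines exactly as generate_stream above does (raw text deltas rather than OpenAI-style JSON chunks), so it reads the stream with requests directly instead of an OpenAI client.

# Hypothetical client sketch for the endpoints added in this commit (not part of app.py).
# Assumes a local uvicorn instance; point BASE_URL at the Space host when deployed.
import requests

BASE_URL = "http://localhost:7860"  # assumption: server from this diff running locally

def chat(prompt: str, max_tokens: int = 150) -> str:
    """Non-streaming call; the handler returns an OpenAI-shaped chat.completion body."""
    resp = requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
        },
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

def chat_stream(prompt: str, max_tokens: int = 150):
    """Streaming call; yields raw text deltas from the 'data: <text>' SSE lines."""
    with requests.post(
        f"{BASE_URL}/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": max_tokens,
            "stream": True,
        },
        stream=True,
        timeout=120,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if line and line.startswith("data: "):
                yield line[len("data: "):]

if __name__ == "__main__":
    print(chat("Explain quantum computing in one sentence."))
    for delta in chat_stream("Explain quantum computing"):
        print(delta, end="", flush=True)
    print()

Note that the usage block in the non-streaming response counts characters (len of the prompt and completion strings), not model tokens, so treat those numbers as rough estimates.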