Echo-ai committed
Commit f9e8a03 · verified · 1 Parent(s): b839d79

Update app.py

Files changed (1):
  app.py  +51 -100
app.py CHANGED
@@ -1,18 +1,15 @@
 import os
 import requests
-import time
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse, HTMLResponse
+from fastapi.responses import HTMLResponse
 from llama_cpp import Llama
 from pydantic import BaseModel
 import uvicorn
-from typing import Generator
-import threading
 
 # Configuration
-MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf" # Changed to Q4 for faster inference
-MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
+MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
+MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
 MODEL_DIR = "model"
 MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)
 
@@ -36,8 +33,8 @@ else:
 # Initialize FastAPI
 app = FastAPI(
     title="DeepSeek-R1 OpenAI-Compatible API",
-    description="Optimized OpenAI-compatible API with streaming support",
-    version="2.0.0"
+    description="OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
+    version="1.0.0"
 )
 
 # CORS Configuration
@@ -48,68 +45,36 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Global model loader with optimized settings
-print("Loading model with optimized settings...")
+# Load the model
+print("Loading model...")
 try:
     llm = Llama(
         model_path=MODEL_PATH,
-        n_ctx=1024,       # Reduced context window for faster processing
-        n_threads=8,      # Increased threads for better CPU utilization
-        n_batch=512,      # Larger batch size for improved throughput
+        n_ctx=2048,
+        n_threads=4,
         n_gpu_layers=0,
-        use_mlock=True,   # Prevent swapping to disk
         verbose=False
     )
-    print("Model loaded with optimized settings!")
+    print("Model loaded successfully!")
 except Exception as e:
     raise RuntimeError(f"Failed to load model: {str(e)}")
 
-# Streaming generator
-def generate_stream(prompt: str, max_tokens: int, temperature: float, top_p: float) -> Generator[str, None, None]:
-    start_time = time.time()
-    stream = llm.create_completion(
-        prompt=prompt,
-        max_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        stop=["</s>"],
-        stream=True
-    )
-
-    for chunk in stream:
-        delta = chunk['choices'][0]['text']
-        yield f"data: {delta}\n\n"
-
-        # Early stopping if taking too long
-        if time.time() - start_time > 30: # 30s timeout
-            break
-
-# OpenAI-Compatible Request Schema
-class ChatCompletionRequest(BaseModel):
-    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
-    messages: list[dict]
-    max_tokens: int = 256
-    temperature: float = 0.7
-    top_p: float = 0.9
-    stream: bool = False
-
-# Enhanced root endpoint with performance info
+# Root endpoint with documentation
 @app.get("/", response_class=HTMLResponse)
 async def root():
     return f"""
     <html>
     <head>
-        <title>DeepSeek-R1 Optimized API</title>
+        <title>DeepSeek-R1 OpenAI API</title>
         <style>
             body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
             .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
-            .info {{ color: #0c5460; background: #d1ecf1; padding: 15px; border-radius: 5px; }}
            a {{ color: #007bff; text-decoration: none; }}
            code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
        </style>
    </head>
    <body>
-        <h1>DeepSeek-R1 Optimized API</h1>
+        <h1>DeepSeek-R1 OpenAI-Compatible API</h1>
 
        <div class="warning">
            <h3>⚠️ Important Notice</h3>
@@ -119,29 +84,29 @@ async def root():
            3. Set visibility to Private</p>
        </div>
 
-        <div class="info">
-            <h3>⚡ Performance Optimizations</h3>
-            <ul>
-                <li>Quantization: Q4_K_M (optimized speed/quality balance)</li>
-                <li>Batch processing: 512 tokens/chunk</li>
-                <li>Streaming support with 30s timeout</li>
-                <li>8 CPU threads utilization</li>
-            </ul>
-        </div>
-
        <h2>API Documentation</h2>
        <ul>
            <li><a href="/docs">Interactive Swagger Documentation</a></li>
            <li><a href="/redoc">ReDoc Documentation</a></li>
        </ul>
 
-        <h2>Example Streaming Request</h2>
+        <h2>API Endpoints</h2>
+        <h3>Chat Completion</h3>
+        <p><code>POST /v1/chat/completions</code></p>
+        <p>Parameters:</p>
+        <ul>
+            <li><strong>messages</strong>: List of message objects</li>
+            <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
+            <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
+            <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
+        </ul>
+
+        <h2>Example Request</h2>
        <pre>
-curl -N -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
+curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
 -H "Content-Type: application/json" \\
 -d '{{
     "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
-    "stream": true,
    "max_tokens": 150
 }}'
        </pre>
@@ -149,26 +114,30 @@ curl -N -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat
    </html>
    """
 
-# Async endpoint handler
+# OpenAI-Compatible Request Schema
+class ChatCompletionRequest(BaseModel):
+    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+    messages: list[dict]
+    max_tokens: int = 128
+    temperature: float = 0.7
+    top_p: float = 0.9
+    stream: bool = False
+
+# OpenAI-Compatible Response Schema
+class ChatCompletionResponse(BaseModel):
+    id: str = "chatcmpl-12345"
+    object: str = "chat.completion"
+    created: int = 1693161600
+    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
+    choices: list[dict]
+    usage: dict
+
 @app.post("/v1/chat/completions")
 async def chat_completion(request: ChatCompletionRequest):
    try:
        prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in request.messages])
        prompt += "\nassistant:"
 
-        if request.stream:
-            return StreamingResponse(
-                generate_stream(
-                    prompt=prompt,
-                    max_tokens=request.max_tokens,
-                    temperature=request.temperature,
-                    top_p=request.top_p
-                ),
-                media_type="text/event-stream"
-            )
-
-        # Non-streaming response
-        start_time = time.time()
        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
@@ -177,12 +146,8 @@ async def chat_completion(request: ChatCompletionRequest):
            stop=["</s>"]
        )
 
-        return {
-            "id": f"chatcmpl-{int(time.time())}",
-            "object": "chat.completion",
-            "created": int(time.time()),
-            "model": request.model,
-            "choices": [{
+        return ChatCompletionResponse(
+            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
@@ -190,32 +155,18 @@ async def chat_completion(request: ChatCompletionRequest):
                },
                "finish_reason": "stop"
            }],
-            "usage": {
+            usage={
                "prompt_tokens": len(prompt),
                "completion_tokens": len(response['choices'][0]['text']),
                "total_tokens": len(prompt) + len(response['choices'][0]['text'])
            }
-        }
-
+        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/health")
-async def health_check():
-    return {
-        "status": "healthy",
-        "model_loaded": True,
-        "performance_settings": {
-            "n_threads": llm.params.n_threads,
-            "n_ctx": llm.params.n_ctx,
-            "n_batch": llm.params.n_batch
-        }
-    }
+def health_check():
+    return {"status": "healthy"}
 
 if __name__ == "__main__":
-    uvicorn.run(
-        app,
-        host="0.0.0.0",
-        port=7860,
-        timeout_keep_alive=300 # Keep alive for streaming connections
-    )
+    uvicorn.run(app, host="0.0.0.0", port=7860)
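For reference, a minimal client sketch for the non-streaming endpoint this commit leaves in place. It is not part of the commit: the base URL is an assumption (replace it with your Space's host, as in the curl example above), and the payload fields simply mirror the ChatCompletionRequest schema defined in app.py. It uses requests, which app.py already depends on.

# Hypothetical client sketch (not part of this commit): calls the
# POST /v1/chat/completions endpoint defined in app.py above.
import requests

BASE_URL = "http://localhost:7860"  # assumption: replace with your Space's URL

payload = {
    "model": "DeepSeek-R1-Distill-Qwen-1.5B",
    "messages": [{"role": "user", "content": "Explain quantum computing"}],
    "max_tokens": 150,
    "temperature": 0.7,
    "top_p": 0.9,
}

resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120)
resp.raise_for_status()
data = resp.json()

# The server returns an OpenAI-style body; the reply text lives at
# choices[0].message.content.
print(data["choices"][0]["message"]["content"])

Note that the "usage" block returned by this version counts characters (len(prompt), len(text)) rather than tokenizer tokens, so treat those numbers as rough indicators only.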