m1k3wn committed
Commit 42717bf · verified · 1 Parent(s): 5380536

Update app.py

Files changed (1):
  1. app.py +40 -250
app.py CHANGED
@@ -16,224 +16,7 @@ import psutil
  # Initialize FastAPI
  app = FastAPI()

- # Debugging logging with detailed formatting
- # logging.basicConfig(
- # level=logging.DEBUG,
- # format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- # )
- # logger = logging.getLogger(__name__)
-
- # Get HF token
- HF_TOKEN = os.environ.get("HF_TOKEN")
- if not HF_TOKEN:
-     logger.warning("No HF_TOKEN found in environment variables")
-
- MODELS = {
-     "nidra-v1": "m1k3wn/nidra-v1",
-     "nidra-v2": "m1k3wn/nidra-v2"
- }
-
- DEFAULT_GENERATION_CONFIGS = {
-     "nidra-v1": {
-         "max_length": 300,
-         "min_length": 150,
-         "num_beams": 8,
-         "temperature": 0.55,
-         "do_sample": True,
-         "top_p": 0.95,
-         "repetition_penalty": 4.5,
-         "no_repeat_ngram_size": 4,
-         "early_stopping": True,
-         "length_penalty": 1.2,
-     },
-     "nidra-v2": {
-         "max_length": 300,
-         "min_length": 150,
-         "num_beams": 8,
-         "temperature": 0.4,
-         "do_sample": True,
-         "top_p": 0.95,
-         "repetition_penalty": 3.5,
-         "no_repeat_ngram_size": 4,
-         "early_stopping": True,
-         "length_penalty": 1.2,
-     }
- }
-
- class ModelManager:
-     _instances: ClassVar[Dict[str, tuple]] = {}
-     _lock = asyncio.Lock()  # Add lock for thread safety
-
-     @classmethod
-     async def get_model_and_tokenizer(cls, model_name: str):
-         async with cls._lock:
-             if model_name not in cls._instances:
-                 try:
-                     model_path = MODELS[model_name]
-                     logger.debug(f"Attempting to load tokenizer from {model_path}")
-
-                     try:
-                         tokenizer = T5Tokenizer.from_pretrained(
-                             model_path,
-                             token=HF_TOKEN,
-                             # local_files_only=False
-                         )
-                         logger.debug("Tokenizer loaded successfully")
-                     except Exception as e:
-                         logger.error(f"Detailed tokenizer error: {str(e)}")
-                         logger.error(f"HF_TOKEN present: {bool(HF_TOKEN)}")
-                         raise
-
-                     logger.debug("Attempting to load model")
-                     model = T5ForConditionalGeneration.from_pretrained(
-                         model_path,
-                         token=HF_TOKEN,
-                         # local_files_only=False,
-                         low_cpu_mem_usage=True,
-                         torch_dtype=torch.float32
-                     )
-                     logger.debug("Model loaded successfully")
-
-                     model.eval()
-                     torch.set_num_threads(8)
-
-                     cls._instances[model_name] = (model, tokenizer)
-
-                 except Exception as e:
-                     logger.error(f"Error loading {model_name}: {str(e)}")
-                     raise
-
-             return cls._instances[model_name]
-
- class PredictionRequest(BaseModel):
-     inputs: str
-     model: str = "nidra-v1"
-     parameters: Optional[Dict[str, Any]] = None
-
- class PredictionResponse(BaseModel):
-     generated_text: str
-     selected_model: str  # Changed from model_used to avoid namespace conflict
-
- @app.get("/debug/memory")
- async def memory_usage():
-     process = psutil.Process()
-     memory_info = process.memory_info()
-     return {
-         "memory_used_mb": memory_info.rss / 1024 / 1024,
-         "memory_percent": process.memory_percent(),
-         "cpu_percent": process.cpu_percent()
-     }
-
- @app.get("/version")
- async def version():
-     return {
-         "python_version": sys.version,
-         "models_available": list(MODELS.keys())
-     }
-
- @app.get("/health")
- async def health():
-     try:
-         await ModelManager.get_model_and_tokenizer("nidra-v1")
-         return {
-             "status": "healthy",
-             "loaded_models": list(ModelManager._instances.keys())
-         }
-     except Exception as e:
-         logger.error(f"Health check failed: {str(e)}")
-         return {
-             "status": "unhealthy",
-             "error": str(e)
-         }
-
- @app.post("/predict", response_model=PredictionResponse)
- async def predict(request: PredictionRequest, background_tasks: BackgroundTasks):
-     try:
-         if request.model not in MODELS:
-             raise HTTPException(
-                 status_code=400,
-                 detail=f"Invalid model. Available models: {list(MODELS.keys())}"
-             )
-
-         model, tokenizer = await ModelManager.get_model_and_tokenizer(request.model)
-         generation_params = DEFAULT_GENERATION_CONFIGS[request.model].copy()
-
-         try:
-             model_generation_config = model.generation_config
-             generation_params.update({
-                 k: v for k, v in model_generation_config.to_dict().items()
-                 if v is not None
-             })
-         except Exception as config_load_error:
-             logger.warning(f"Using default generation config: {config_load_error}")
-
-         if request.parameters:
-             generation_params.update(request.parameters)
-
-         logger.debug(f"Final generation parameters: {generation_params}")
-
-         full_input = "Interpret this dream: " + request.inputs
-         inputs = tokenizer(
-             full_input,
-             return_tensors="pt",
-             truncation=True,
-             max_length=512,
-             padding=True,
-             return_attention_mask=True
-         )
-
-         async def generate():
-             return model.generate(
-                 **inputs,
-                 **{k: v for k, v in generation_params.items() if k in [
-                     'max_length', 'min_length', 'do_sample', 'temperature',
-                     'top_p', 'top_k', 'num_beams', 'no_repeat_ngram_size',
-                     'repetition_penalty', 'early_stopping'
-                 ]}
-             )
-
-         with torch.inference_mode():
-             outputs = await asyncio.wait_for(generate(), timeout=70.0)
-
-         result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-         background_tasks.add_task(cleanup_memory)
-
-         return PredictionResponse(
-             generated_text=result,
-             selected_model=request.model
-         )
-
-     except Exception as e:
-         error_msg = f"Error during prediction: {str(e)}\n{traceback.format_exc()}"
-         logger.error(error_msg)
-         raise HTTPException(status_code=500, detail=error_msg)
-
- def cleanup_memory():
-     gc.collect()
-     torch.cuda.empty_cache() if torch.cuda.is_available() else None
-
- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=7860)
-
- import torch
- from fastapi import FastAPI, HTTPException
- from pydantic import BaseModel
- from transformers import T5Tokenizer, T5ForConditionalGeneration, GenerationConfig
- from typing import Optional, Dict, Any, ClassVar
- import logging
- import os
- import sys
- import traceback
- from functools import lru_cache
- import gc
- import asyncio
- from fastapi import BackgroundTasks
- import psutil
-
- # Initialize FastAPI
- app = FastAPI()
-
- # Set up logging with more detailed formatting
+ # Debugging logs
  logging.basicConfig(
      level=logging.DEBUG,
      format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
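For reference, the /predict handler shown in the removed block above (and apparently unchanged in the copy kept further down app.py, outside these hunks) layers any per-request `parameters` on top of DEFAULT_GENERATION_CONFIGS before calling model.generate. A minimal client sketch, assuming the `requests` package and a placeholder base URL (the app binds to port 7860 via uvicorn):

    # Hypothetical client for the /predict endpoint; BASE_URL is an assumption.
    import requests

    BASE_URL = "http://localhost:7860"  # assumption: replace with the deployed Space URL

    payload = {
        "inputs": "I was flying over a city made of glass.",
        "model": "nidra-v2",
        # optional per-request overrides, merged over DEFAULT_GENERATION_CONFIGS
        "parameters": {"temperature": 0.7, "max_length": 200},
    }

    resp = requests.post(f"{BASE_URL}/predict", json=payload, timeout=120)
    resp.raise_for_status()
    data = resp.json()
    print(data["selected_model"], data["generated_text"])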
@@ -279,39 +62,36 @@ DEFAULT_GENERATION_CONFIGS = {

  class ModelManager:
      _instances: ClassVar[Dict[str, tuple]] = {}
-     _lock = asyncio.Lock()  # Add lock for thread safety

      @classmethod
      async def get_model_and_tokenizer(cls, model_name: str):
-         async with cls._lock:
-             if model_name not in cls._instances:
-                 try:
-                     model_path = MODELS[model_name]
-                     tokenizer = T5Tokenizer.from_pretrained(
-                         model_path,
-                         token=HF_TOKEN,
-                         local_files_only=True  # Cache after first load
-                     )
-
-                     model = T5ForConditionalGeneration.from_pretrained(
-                         model_path,
-                         token=HF_TOKEN,
-                         local_files_only=True,
-                         low_cpu_mem_usage=True,
-                         torch_dtype=torch.float32
-                     )
-
-                     # Enable parallel processing
-                     model.eval()
-                     torch.set_num_threads(8)  # Use all CPU cores
-
-                     cls._instances[model_name] = (model, tokenizer)
-
-                 except Exception as e:
-                     logger.error(f"Error loading {model_name}: {str(e)}")
-                     raise
-
-             return cls._instances[model_name]
+         if model_name not in cls._instances:
+             try:
+                 model_path = MODELS[model_name]
+                 logger.debug(f"Loading tokenizer and model from {model_path}")
+
+                 # Simplified tokenizer loading
+                 tokenizer = T5Tokenizer.from_pretrained(
+                     model_path,
+                     token=HF_TOKEN,
+                     use_fast=True  # Added this
+                 )
+
+                 # Simplified model loading
+                 model = T5ForConditionalGeneration.from_pretrained(
+                     model_path,
+                     token=HF_TOKEN,
+                     torch_dtype=torch.float32
+                 )
+
+                 model.eval()
+                 cls._instances[model_name] = (model, tokenizer)
+
+             except Exception as e:
+                 logger.error(f"Error loading {model_name}: {str(e)}")
+                 raise
+
+         return cls._instances[model_name]

  class PredictionRequest(BaseModel):
      inputs: str
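The simplified get_model_and_tokenizer above still memoizes per model name in cls._instances, but with the asyncio.Lock removed two concurrent first requests could each trigger a load before the cache is populated. A minimal sketch of the caching behaviour, assuming it runs inside the Space next to app.py (the `from app import ModelManager` path is an assumption):

    # Repeated lookups return the same objects once the first call has
    # populated ModelManager._instances.
    import asyncio
    from app import ModelManager  # assumption: app.py is importable as "app"

    async def main():
        m1, t1 = await ModelManager.get_model_and_tokenizer("nidra-v1")
        m2, t2 = await ModelManager.get_model_and_tokenizer("nidra-v1")
        assert m1 is m2 and t1 is t2  # second call is served from the in-process cache

    asyncio.run(main())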
@@ -322,6 +102,7 @@ class PredictionResponse(BaseModel):
      generated_text: str
      selected_model: str  # Changed from model_used to avoid namespace conflict

+ # Memory debug endpoint
  @app.get("/debug/memory")
  async def memory_usage():
      process = psutil.Process()
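The /debug/memory endpoint reports the process RSS, so it can be polled around a prediction to see roughly what a model load costs. A small sketch, reusing the placeholder BASE_URL assumed above:

    # Hypothetical check of the memory footprint before and after a prediction.
    import requests

    BASE_URL = "http://localhost:7860"  # assumption, as in the /predict sketch

    before = requests.get(f"{BASE_URL}/debug/memory", timeout=10).json()
    requests.post(f"{BASE_URL}/predict", json={"inputs": "a short dream"}, timeout=120)
    after = requests.get(f"{BASE_URL}/debug/memory", timeout=10).json()
    print(f"RSS: {before['memory_used_mb']:.0f} MB -> {after['memory_used_mb']:.0f} MB")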
@@ -332,6 +113,7 @@ async def memory_usage():
          "cpu_percent": process.cpu_percent()
      }

+ # Version check
  @app.get("/version")
  async def version():
      return {
@@ -339,16 +121,24 @@ async def version():
          "models_available": list(MODELS.keys())
      }

+ # Healthcheck endpoint
  @app.get("/health")
  async def health():
      try:
-         await ModelManager.get_model_and_tokenizer("nidra-v1")
+         logger.debug("Health check started")
+         logger.debug(f"HF_TOKEN present: {bool(HF_TOKEN)}")
+         logger.debug(f"Available models: {MODELS}")
+
+         result = await ModelManager.get_model_and_tokenizer("nidra-v1")
+         logger.debug("Model and tokenizer loaded successfully")
+
          return {
              "status": "healthy",
              "loaded_models": list(ModelManager._instances.keys())
          }
      except Exception as e:
-         logger.error(f"Health check failed: {str(e)}")
+         error_msg = f"Health check failed: {str(e)}\n{traceback.format_exc()}"
+         logger.error(error_msg)
          return {
              "status": "unhealthy",
              "error": str(e)
 