Update README.md
Browse files
README.md
CHANGED
@@ -153,10 +153,8 @@ Memory: %(memory).2fMB
|
|
153 |
# Add memory usage information
|
154 |
if not hasattr(record, 'memory'):
|
155 |
record.memory = psutil.Process().memory_info().rss / (1024 * 1024)
|
156 |
-
|
157 |
log_fmt = self.FORMATS.get(record.levelno)
|
158 |
formatter = logging.Formatter(log_fmt, datefmt='%Y-%m-%d %H:%M:%S')
|
159 |
-
|
160 |
# Add performance metrics if available
|
161 |
if hasattr(record, 'duration'):
|
162 |
record.message = f"{record.message}\nDuration: {record.duration:.2f}s"
|
@@ -169,13 +167,10 @@ def setup_logging(log_dir: str = "logs") -> logging.Logger:
|
|
169 |
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
170 |
log_path = (Path(log_dir) / f"l_{timestamp}")
|
171 |
log_path.mkdir(exist_ok=True)
|
172 |
-
|
173 |
-
# Create logger
|
174 |
logger = logging.getLogger("InsuranceLLM")
|
175 |
# Clear any existing handlers
|
176 |
logger.handlers.clear()
|
177 |
logger.setLevel(logging.DEBUG)
|
178 |
-
|
179 |
# Create handlers with level-specific files
|
180 |
handlers = {
|
181 |
'debug': (logging.FileHandler(log_path / f"debug_{timestamp}.log"), logging.DEBUG),
|
@@ -189,15 +184,11 @@ def setup_logging(log_dir: str = "logs") -> logging.Logger:
|
|
189 |
enable_link_path=True
|
190 |
), logging.INFO)
|
191 |
}
|
192 |
-
|
193 |
-
# Configure handlers
|
194 |
formatter = CustomFormatter()
|
195 |
for (handler, level) in handlers.values():
|
196 |
handler.setLevel(level)
|
197 |
handler.setFormatter(formatter)
|
198 |
logger.addHandler(handler)
|
199 |
-
|
200 |
-
# Log startup information (will now appear only once)
|
201 |
logger.info(f"Starting new session {timestamp}")
|
202 |
logger.info(f"Log directory: {log_dir}")
|
203 |
return logger
|
@@ -213,20 +204,16 @@ class PerformanceMetrics:
|
|
213 |
self.tokens = 0
|
214 |
self.response_times = []
|
215 |
self.last_reset = self.start_time
|
216 |
-
|
217 |
def reset_timer(self):
|
218 |
"""Reset the timer for individual response measurements"""
|
219 |
self.last_reset = time.time()
|
220 |
-
|
221 |
def update(self, tokens: int):
|
222 |
self.tokens += tokens
|
223 |
response_time = time.time() - self.last_reset
|
224 |
self.response_times.append(response_time)
|
225 |
-
|
226 |
@property
|
227 |
def elapsed_time(self) -> float:
|
228 |
return time.time() - self.start_time
|
229 |
-
|
230 |
@property
|
231 |
def last_response_time(self) -> float:
|
232 |
return self.response_times[-1] if self.response_times else 0
|
@@ -317,7 +304,6 @@ class InsuranceLLM:
|
|
317 |
"Assistant:"
|
318 |
)
|
319 |
|
320 |
-
|
321 |
def generate_response(self, prompt: str) -> Dict[str, Any]:
|
322 |
if not self.llm_ctx:
|
323 |
raise RuntimeError("Model not loaded. Call load_model() first.")
|
@@ -343,18 +329,10 @@ class InsuranceLLM:
|
|
343 |
text_chunk = chunk["choices"][0]["text"]
|
344 |
response["text"] += text_chunk
|
345 |
response["tokens"] += 1
|
346 |
-
|
347 |
-
# Append to complete response
|
348 |
complete_response += text_chunk
|
349 |
-
|
350 |
-
# Use simple print for streaming output
|
351 |
print(text_chunk, end="", flush=True)
|
352 |
-
|
353 |
-
# Print final newline
|
354 |
print()
|
355 |
-
|
356 |
return response
|
357 |
-
|
358 |
except RuntimeError as e:
|
359 |
if "llama_decode returned -3" in str(e):
|
360 |
self.logger.error("Memory allocation failed. Try reducing context window or batch size")
|
@@ -385,21 +363,14 @@ class InsuranceLLM:
|
|
385 |
question = parts[1].strip()
|
386 |
|
387 |
prompt = self.get_prompt(question, context)
|
388 |
-
|
389 |
-
# Reset timer before generation
|
390 |
self.metrics.reset_timer()
|
391 |
-
|
392 |
-
# Generate response
|
393 |
response = self.generate_response(prompt)
|
394 |
-
|
395 |
# Update metrics after generation
|
396 |
self.metrics.update(response["tokens"])
|
397 |
-
|
398 |
# Print metrics
|
399 |
console.print(f"[dim]Average tokens/sec: {response['tokens']/(self.metrics.last_response_time if self.metrics.last_response_time!=0 else 1):.2f} ||[/dim]",
|
400 |
f"[dim]Tokens generated: {response['tokens']} ||[/dim]",
|
401 |
f"[dim]Response time: {self.metrics.last_response_time:.2f}s[/dim]", end="\n\n\n")
|
402 |
-
|
403 |
except KeyboardInterrupt:
|
404 |
console.print("\n[yellow]Input interrupted. Type '/bye', 'exit', or 'quit' to quit.[/yellow]")
|
405 |
continue
|
@@ -407,7 +378,6 @@ class InsuranceLLM:
|
|
407 |
self.logger.error(f"Error processing input: {str(e)}")
|
408 |
console.print(f"\n[red]Error: {str(e)}[/red]")
|
409 |
continue
|
410 |
-
|
411 |
except Exception as e:
|
412 |
self.logger.error(f"Fatal error in inference loop: {str(e)}")
|
413 |
console.print(f"\n[red]Fatal error: {str(e)}[/red]")
|
|
|
153 |
# Add memory usage information
|
154 |
if not hasattr(record, 'memory'):
|
155 |
record.memory = psutil.Process().memory_info().rss / (1024 * 1024)
|
|
|
156 |
log_fmt = self.FORMATS.get(record.levelno)
|
157 |
formatter = logging.Formatter(log_fmt, datefmt='%Y-%m-%d %H:%M:%S')
|
|
|
158 |
# Add performance metrics if available
|
159 |
if hasattr(record, 'duration'):
|
160 |
record.message = f"{record.message}\nDuration: {record.duration:.2f}s"
|
|
|
167 |
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
168 |
log_path = (Path(log_dir) / f"l_{timestamp}")
|
169 |
log_path.mkdir(exist_ok=True)
|
|
|
|
|
170 |
logger = logging.getLogger("InsuranceLLM")
|
171 |
# Clear any existing handlers
|
172 |
logger.handlers.clear()
|
173 |
logger.setLevel(logging.DEBUG)
|
|
|
174 |
# Create handlers with level-specific files
|
175 |
handlers = {
|
176 |
'debug': (logging.FileHandler(log_path / f"debug_{timestamp}.log"), logging.DEBUG),
|
|
|
184 |
enable_link_path=True
|
185 |
), logging.INFO)
|
186 |
}
|
|
|
|
|
187 |
formatter = CustomFormatter()
|
188 |
for (handler, level) in handlers.values():
|
189 |
handler.setLevel(level)
|
190 |
handler.setFormatter(formatter)
|
191 |
logger.addHandler(handler)
|
|
|
|
|
192 |
logger.info(f"Starting new session {timestamp}")
|
193 |
logger.info(f"Log directory: {log_dir}")
|
194 |
return logger
|
|
|
204 |
self.tokens = 0
|
205 |
self.response_times = []
|
206 |
self.last_reset = self.start_time
|
|
|
207 |
def reset_timer(self):
|
208 |
"""Reset the timer for individual response measurements"""
|
209 |
self.last_reset = time.time()
|
|
|
210 |
def update(self, tokens: int):
|
211 |
self.tokens += tokens
|
212 |
response_time = time.time() - self.last_reset
|
213 |
self.response_times.append(response_time)
|
|
|
214 |
@property
|
215 |
def elapsed_time(self) -> float:
|
216 |
return time.time() - self.start_time
|
|
|
217 |
@property
|
218 |
def last_response_time(self) -> float:
|
219 |
return self.response_times[-1] if self.response_times else 0
|
|
|
304 |
"Assistant:"
|
305 |
)
|
306 |
|
|
|
307 |
def generate_response(self, prompt: str) -> Dict[str, Any]:
|
308 |
if not self.llm_ctx:
|
309 |
raise RuntimeError("Model not loaded. Call load_model() first.")
|
|
|
329 |
text_chunk = chunk["choices"][0]["text"]
|
330 |
response["text"] += text_chunk
|
331 |
response["tokens"] += 1
|
|
|
|
|
332 |
complete_response += text_chunk
|
|
|
|
|
333 |
print(text_chunk, end="", flush=True)
|
|
|
|
|
334 |
print()
|
|
|
335 |
return response
|
|
|
336 |
except RuntimeError as e:
|
337 |
if "llama_decode returned -3" in str(e):
|
338 |
self.logger.error("Memory allocation failed. Try reducing context window or batch size")
|
|
|
363 |
question = parts[1].strip()
|
364 |
|
365 |
prompt = self.get_prompt(question, context)
|
|
|
|
|
366 |
self.metrics.reset_timer()
|
|
|
|
|
367 |
response = self.generate_response(prompt)
|
|
|
368 |
# Update metrics after generation
|
369 |
self.metrics.update(response["tokens"])
|
|
|
370 |
# Print metrics
|
371 |
console.print(f"[dim]Average tokens/sec: {response['tokens']/(self.metrics.last_response_time if self.metrics.last_response_time!=0 else 1):.2f} ||[/dim]",
|
372 |
f"[dim]Tokens generated: {response['tokens']} ||[/dim]",
|
373 |
f"[dim]Response time: {self.metrics.last_response_time:.2f}s[/dim]", end="\n\n\n")
|
|
|
374 |
except KeyboardInterrupt:
|
375 |
console.print("\n[yellow]Input interrupted. Type '/bye', 'exit', or 'quit' to quit.[/yellow]")
|
376 |
continue
|
|
|
378 |
self.logger.error(f"Error processing input: {str(e)}")
|
379 |
console.print(f"\n[red]Error: {str(e)}[/red]")
|
380 |
continue
|
|
|
381 |
except Exception as e:
|
382 |
self.logger.error(f"Fatal error in inference loop: {str(e)}")
|
383 |
console.print(f"\n[red]Fatal error: {str(e)}[/red]")
|