Ali2206 committed
Commit 67f566e · verified · 1 Parent(s): 650fb34

Update app.py

Files changed (1):
  app.py +19 -55
app.py CHANGED
@@ -47,7 +47,8 @@ MEDICAL_KEYWORDS = {
 }
 TOKENIZER = "cl100k_base"
 MAX_MODEL_LEN = 2048 # Matches your model's actual limit
-TARGET_CHUNK_TOKENS = 1500 # Leaves room for prompt and response
+TARGET_CHUNK_TOKENS = 1200 # Reduced to ensure room for prompt and response
+PROMPT_RESERVE = 300 # Tokens reserved for prompt structure
 MEDICAL_SECTION_HEADER = "=== MEDICAL SECTION ==="
 
 def sanitize_utf8(text: str) -> str:
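
Note on the budget math: these constants partition the 2,048-token context window into content, prompt scaffolding, and response. A minimal sketch of how they interact, assuming count_tokens wraps the cl100k_base encoding named above (the helper name appears in later hunks; the arithmetic here is illustrative):

import tiktoken

MAX_MODEL_LEN = 2048
TARGET_CHUNK_TOKENS = 1200
PROMPT_RESERVE = 300

_enc = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    return len(_enc.encode(text))

# Whatever the chunker emits plus the prompt scaffold must leave
# room for the model's reply: 2048 - 1200 - 300 = 548 tokens.
response_budget = MAX_MODEL_LEN - TARGET_CHUNK_TOKENS - PROMPT_RESERVE
assert response_budget >= 300, "reply budget should cover max_new_tokens"
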
@@ -115,7 +116,6 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
                 "status": "complete"
             })
         elif file_type == "csv":
-            # Read CSV in chunks to handle large files
             chunks = []
             for chunk in pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
                                      skip_blank_lines=False, on_bad_lines="skip", chunksize=1000):
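
For context, pd.read_csv with chunksize does not return a DataFrame but a lazy TextFileReader, so only 1,000 rows are held in memory at a time. A self-contained sketch of the pattern (the file path and row handling are illustrative):

import pandas as pd

rows = []
for chunk in pd.read_csv("records.csv", encoding_errors="replace", header=None,
                         dtype=str, skip_blank_lines=False,
                         on_bad_lines="skip", chunksize=1000):
    # Each chunk is a DataFrame of up to 1000 rows; flatten to strings.
    rows.extend(chunk.fillna("").astype(str).values.tolist())
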
@@ -146,32 +146,13 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
     except Exception as e:
         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 
-def log_system_usage(tag=""):
-    """Log system resource usage."""
-    try:
-        cpu = psutil.cpu_percent(interval=1)
-        mem = psutil.virtual_memory()
-        print(f"[{tag}] CPU: {cpu}% | RAM: {mem.used // (1024**2)}MB / {mem.total // (1024**2)}MB")
-        result = subprocess.run(
-            ["nvidia-smi", "--query-gpu=memory.used,memory.total,utilization.gpu", "--format=csv,nounits,noheader"],
-            capture_output=True, text=True
-        )
-        if result.returncode == 0:
-            used, total, util = result.stdout.strip().split(", ")
-            print(f"[{tag}] GPU: {used}MB / {total}MB | Utilization: {util}%")
-    except Exception as e:
-        print(f"[{tag}] GPU/CPU monitor failed: {e}")
-
 def clean_response(text: str) -> str:
     """Clean and format the model response."""
     text = sanitize_utf8(text)
-    # Remove tool calls and JSON artifacts
     text = re.sub(r"\[TOOL_CALLS\].*", "", text, flags=re.DOTALL)
     text = re.sub(r"\['get_[^\]]+\']\n?", "", text)
     text = re.sub(r"\{'meta':\s*\{.*?\}\s*,\s*'results':\s*\[.*?\]\}\n?", "", text, flags=re.DOTALL)
-    # Remove repetitive phrases
     text = re.sub(r"To analyze the medical records for clinical oversights.*?begin by reviewing.*?\n", "", text, flags=re.DOTALL)
-    # Collapse excessive newlines
     text = re.sub(r"\n{3,}", "\n\n", text).strip()
     return text
 
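A quick illustration of what clean_response removes; the sample input is fabricated to exercise the tool-call and newline regexes (assumes app.py's clean_response is in scope):

raw = "Findings here.\n\n\n\nKey risks noted.\n[TOOL_CALLS] ['get_drug_info']"
print(clean_response(raw))
# -> "Findings here.\n\nKey risks noted."
# The [TOOL_CALLS] tail is stripped and runs of 3+ newlines collapse to two.
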
@@ -183,7 +164,6 @@ def format_final_report(analysis_results: List[str], filename: str) -> str:
     report.append(f"File: {filename}")
     report.append("=" * 80)
 
-    # Extract sections from all chunks
     sections = {
         "CRITICAL FINDINGS": [],
         "MISSED DIAGNOSES": [],
@@ -194,7 +174,6 @@ def format_final_report(analysis_results: List[str], filename: str) -> str:
 
     for result in analysis_results:
         for section in sections:
-            # Find section content using regex
             section_match = re.search(
                 rf"{re.escape(section)}:?\s*\n([^*]+?)(?=\n\*|\n\n|$)",
                 result,
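
The section regex captures everything after a header up to the next bullet, blank line, or end of string. A standalone example (the flags argument is cut off in this hunk, so re.IGNORECASE is an assumption):

import re

result = "CRITICAL FINDINGS:\nUnexplained weight loss with anemia.\n\nMEDICATION ISSUES: none"
section = "CRITICAL FINDINGS"
section_match = re.search(
    rf"{re.escape(section)}:?\s*\n([^*]+?)(?=\n\*|\n\n|$)",
    result,
    re.IGNORECASE,  # assumed flag; the diff truncates the call here
)
if section_match:
    print(section_match.group(1).strip())  # -> Unexplained weight loss with anemia.
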
@@ -205,13 +184,11 @@ def format_final_report(analysis_results: List[str], filename: str) -> str:
                 if content and content not in sections[section]:
                     sections[section].append(content)
 
-    # Build the final report - prioritize critical findings
     if sections["CRITICAL FINDINGS"]:
         report.append("\n🚨 **CRITICAL FINDINGS** 🚨")
         for content in sections["CRITICAL FINDINGS"]:
             report.append(f"\n{content}")
 
-    # Add other sections
     for section, contents in sections.items():
         if section != "CRITICAL FINDINGS" and contents:
             report.append(f"\n**{section.upper()}**")
@@ -236,7 +213,6 @@ def split_content_by_tokens(content: str, max_tokens: int = TARGET_CHUNK_TOKENS)
     for para in paragraphs:
         para_tokens = count_tokens(para)
         if para_tokens > max_tokens:
-            # Handle very long paragraphs by splitting sentences
             sentences = re.split(r'(?<=[.!?])\s+', para)
             for sent in sentences:
                 sent_tokens = count_tokens(sent)
@@ -286,37 +262,28 @@ def init_agent():
     return agent
 
 def analyze_complete_document(content: str, filename: str, agent: TxAgent) -> str:
-    """Analyze complete document with proper chunking and token management"""
+    """Analyze complete document with strict token management"""
    chunks = split_content_by_tokens(content)
    analysis_results = []
 
    for i, chunk in enumerate(chunks):
        try:
-            # Create minimal prompt to save tokens
-            prompt = f"""
-Analyze this medical record section for:
-1. Critical findings (urgent)
-2. Missed diagnoses (with evidence)
-3. Medication issues
-4. Assessment gaps
-5. Follow-up needs
-
-Content:
-{chunk}
-
-Concise findings only:
-"""
-            # Verify we're within token limits
-            prompt_tokens = count_tokens(prompt)
-            chunk_tokens = count_tokens(chunk)
+            # Ultra-minimal prompt to maximize content space
+            base_prompt = "Analyze for:\n1. Critical\n2. Missed DX\n3. Med issues\n4. Gaps\n5. Follow-up\n\nContent:\n"
+
+            # Calculate available space for content
+            prompt_tokens = count_tokens(base_prompt)
+            max_content_tokens = MAX_MODEL_LEN - prompt_tokens - 100 # Response buffer
 
-            if prompt_tokens + chunk_tokens > MAX_MODEL_LEN - 512: # Leave room for response
-                # Find a natural truncation point
+            # Ensure chunk fits
+            chunk_tokens = count_tokens(chunk)
+            if chunk_tokens > max_content_tokens:
+                # Find last paragraph that fits
                 adjusted_chunk = ""
                 tokens_used = 0
-                max_content_tokens = MAX_MODEL_LEN - prompt_tokens - 512
+                paragraphs = re.split(r"\n\s*\n", chunk)
 
-                for para in re.split(r"\n\s*\n", chunk):
+                for para in paragraphs:
                     para_tokens = count_tokens(para)
                     if tokens_used + para_tokens <= max_content_tokens:
                         adjusted_chunk += "\n\n" + para
@@ -325,7 +292,7 @@ Concise findings only:
                         break
 
                 if not adjusted_chunk:
-                    # If even one paragraph is too long, split sentences
+                    # If even one paragraph is too big, split sentences
                     sentences = re.split(r'(?<=[.!?])\s+', chunk)
                     for sent in sentences:
                         sent_tokens = count_tokens(sent)
@@ -337,12 +304,14 @@ Concise findings only:
 
                 chunk = adjusted_chunk.strip()
 
+            prompt = base_prompt + chunk
+
             response = ""
             for output in agent.run_gradio_chat(
                 message=prompt,
                 history=[],
                 temperature=0.1,
-                max_new_tokens=512, # Keep responses concise
+                max_new_tokens=300, # Keep responses very concise
                 max_token=MAX_MODEL_LEN,
                 call_agent=False,
                 conversation=[],
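
The truncation strategy in these hunks is: keep whole paragraphs while they fit the budget, and only if not even one paragraph fits, fall back to sentence-level splitting. A self-contained sketch under that reading (count_tokens is passed in to keep the sketch independent of the app's tokenizer setup):

import re

def trim_to_budget(chunk: str, max_content_tokens: int, count_tokens) -> str:
    adjusted, used = "", 0
    # First pass: accumulate whole paragraphs until the budget is spent.
    for para in re.split(r"\n\s*\n", chunk):
        t = count_tokens(para)
        if used + t <= max_content_tokens:
            adjusted += "\n\n" + para
            used += t
        else:
            break
    # Fallback: a single oversized paragraph, so accumulate sentences instead.
    if not adjusted:
        for sent in re.split(r"(?<=[.!?])\s+", chunk):
            t = count_tokens(sent)
            if used + t <= max_content_tokens:
                adjusted += " " + sent
                used += t
            else:
                break
    return adjusted.strip()
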
@@ -407,7 +376,6 @@ def create_ui(agent):
 
         yield "", None, "⏳ Processing documents (this may take several minutes for large files)..."
 
-        # Process all files completely
         file_contents = []
         filenames = []
         total_tokens = 0
@@ -445,14 +413,12 @@
         yield "", None, f"🔍 Analyzing content ({total_tokens//1000}k tokens)..."
 
         try:
-            # Process the complete document
             full_report = analyze_complete_document(
                 combined_content,
                 combined_filename,
                 agent
             )
 
-            # Save report to file
             file_hash_value = hashlib.md5(combined_content.encode()).hexdigest()
             report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt")
             with open(report_path, "w", encoding="utf-8") as f:
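
Naming the report after an MD5 of the combined content means re-running identical input overwrites one report rather than accumulating copies. A standalone illustration (the directory and content stand in for the app's own values):

import hashlib
import os

combined_content = "example records"  # stands in for the merged documents
report_dir = "reports"                # the app defines its own report_dir
file_hash_value = hashlib.md5(combined_content.encode()).hexdigest()
report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt")
print(report_path)  # reports/<32-hex-digest>_report.txt
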
@@ -465,7 +431,6 @@
             print(error_msg)
             yield "", None, error_msg
 
-    # UI event handlers
     send_btn.click(
         fn=analyze,
         inputs=[file_upload, msg_input],
@@ -483,7 +448,6 @@
 
 if __name__ == "__main__":
     print("🚀 Launching app...")
-    # Install tiktoken if not available
     try:
         import tiktoken
     except ImportError:
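
The __main__ guard imports tiktoken and, per the removed comment, installs it when missing; the diff cuts off after except ImportError:, so the fallback body below is an assumption based on that common pattern:

import subprocess
import sys

try:
    import tiktoken
except ImportError:
    # Assumed fallback: install at runtime, then retry the import.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "tiktoken"])
    import tiktoken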