Ali2206 committed
Commit 28928c8 · verified · 1 Parent(s): 67dd49b

Update app.py

Files changed (1): app.py (+69, -95)
app.py CHANGED
@@ -52,8 +52,8 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()
 
-def chunk_hash(chunk: str, prompt: str) -> str:
-    return hashlib.md5((chunk + prompt).encode("utf-8")).hexdigest()
+def batch_hash(chunks: List[str], prompt: str) -> str:
+    return hashlib.md5(("".join(chunks) + prompt).encode("utf-8")).hexdigest()
 
 def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
     """Extract text from a range of PDF pages."""
@@ -147,32 +147,21 @@ def log_system_usage(tag=""):
         logger.error(f"[{tag}] GPU/CPU monitor failed: {e}")
 
 def clean_response(text: str) -> str:
-    """Clean TxAgent response to group findings under tool-derived headings."""
+    """Clean TxAgent response to group findings by section without tool names."""
     text = sanitize_utf8(text)
-    text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
+    # Remove tool tags, None, and reasoning
+    text = re.sub(r"\[TOOL:[^\]]+\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
     text = re.sub(r"\n{3,}", "\n\n", text)
     text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text)
-
-    tool_to_heading = {
-        "get_abuse_info_by_drug_name": "Drugs",
-        "get_dependence_info_by_drug_name": "Drugs",
-        "get_abuse_types_and_related_adverse_reactions_and_controlled_substance_status_by_drug_name": "Drugs",
-        "get_info_for_patients_by_drug_name": "Drugs",
-    }
-
+
     sections = {}
     current_section = None
-    current_tool = None
     lines = text.splitlines()
     for line in lines:
         line = line.strip()
         if not line:
             continue
-        tool_match = re.match(r"\[TOOL:\s*(\w+)\]", line)
-        if tool_match:
-            current_tool = tool_match.group(1)
-            continue
-        section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up|Drugs)", line)
+        section_match = re.match(r"###\s*(Drugs|Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
         if section_match:
             current_section = section_match.group(1)
             if current_section not in sections:
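
The tightened cleanup pattern now strips only explicit [TOOL: ...] tags instead of every bracketed span, which the old r"\[.*?\]" alternation did. A quick standalone check (illustrative, not part of the commit):

    import re

    s = "[TOOL: get_abuse_info_by_drug_name] - Opioid misuse [see note]"
    print(re.sub(r"\[TOOL:[^\]]+\]", "", s))
    # -> " - Opioid misuse [see note]"  (non-tool brackets survive)
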
@@ -180,13 +169,7 @@ def clean_response(text: str) -> str:
             continue
         finding_match = re.match(r"-\s*.+", line)
         if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
-            if current_tool and current_tool in tool_to_heading:
-                heading = tool_to_heading[current_tool]
-                if heading not in sections:
-                    sections[heading] = []
-                sections[heading].append(line)
-            else:
-                sections[current_section].append(line)
+            sections[current_section].append(line)
 
     cleaned = []
     for heading, findings in sections.items():
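
With the tool-routing branch gone, findings are grouped purely by the five whitelisted headings; any other heading is ignored, and "No issues identified" bullets are dropped. A standalone sketch of the heading match (illustrative, not part of the commit):

    import re

    SECTION_RE = r"###\s*(Drugs|Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)"
    for line in ["### Drugs", "### Labs", "- finding"]:
        m = re.match(SECTION_RE, line)
        print(line, "->", m.group(1) if m else None)
    # ### Drugs -> Drugs, ### Labs -> None, - finding -> None
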
@@ -212,97 +195,99 @@ def init_agent():
         tool_files_dict={"new_tool": target_tool_path},
         force_finish=True,
         enable_checker=True,
-        step_rag_num=2,
+        step_rag_num=1,  # Reduced for speed
         seed=100,
         additional_default_tools=[],
+        num_engine_threads=1,  # Limit VLLM threads for stability
     )
     agent.init_model()
     log_system_usage("After Load")
     logger.info("Agent Ready")
     return agent
 
-def process_chunk(agent, chunk: str, chunk_idx: int, total_chunks: int, cache_path: str, prompt_template: str) -> tuple:
-    """Process a single chunk with error handling and caching."""
-    if not chunk.strip():
-        logger.warning(f"Chunk {chunk_idx} is empty, skipping...")
-        return chunk_idx, f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
-
-    chunk_id = chunk_hash(chunk, prompt_template)
-    chunk_cache_path = os.path.join(cache_path, f"chunk_{chunk_id}.txt")
-
-    if os.path.exists(chunk_cache_path):
-        with open(chunk_cache_path, "r", encoding="utf-8") as f:
-            logger.info(f"Cache hit for chunk {chunk_idx}")
-            return chunk_idx, f.read()
-
-    prompt = prompt_template.format(chunk_idx, total_chunks, chunk=chunk[:1000])  # Truncate to avoid token limits
-    chunk_response = ""
+def process_batch(agent, chunks: List[str], cache_path: str, prompt_template: str) -> str:
+    """Process a batch of chunks in a single prompt."""
+    if not any(chunk.strip() for chunk in chunks):
+        logger.warning("All chunks are empty, skipping analysis...")
+        return "No oversights identified in the provided records."
+
+    batch_id = batch_hash(chunks, prompt_template)
+    batch_cache_path = os.path.join(cache_path, f"batch_{batch_id}.txt")
+
+    if os.path.exists(batch_cache_path):
+        with open(batch_cache_path, "r", encoding="utf-8") as f:
+            logger.info("Cache hit for batch")
+            return f.read()
+
+    # Combine chunks into one prompt
+    chunk_texts = [f"Chunk {i+1}:\n{chunk[:500]}" for i, chunk in enumerate(chunks) if chunk.strip()]
+    combined_text = "\n\n".join(chunk_texts)
+    prompt = prompt_template.format(chunks=combined_text)
+    response = ""
+
     try:
-        for chunk_output in agent.run_gradio_chat(
+        for output in agent.run_gradio_chat(
             message=prompt,
             history=[],
             temperature=0.2,
-            max_new_tokens=512,
-            max_token=2048,
+            max_new_tokens=256,  # Reduced for speed
+            max_token=1024,  # Reduced for speed
             call_agent=False,
             conversation=[],
         ):
-            if chunk_output is None:
+            if output is None:
                 continue
-            if isinstance(chunk_output, list):
-                for m in chunk_output:
+            if isinstance(output, list):
+                for m in output:
                     if hasattr(m, 'content') and m.content:
                         cleaned = clean_response(m.content)
                         if cleaned and re.search(r"###\s*\w+", cleaned):
-                            chunk_response += cleaned + "\n\n"
-            elif isinstance(chunk_output, str) and chunk_output.strip():
-                cleaned = clean_response(chunk_output)
+                            response += cleaned + "\n\n"
+            elif isinstance(output, str) and output.strip():
+                cleaned = clean_response(output)
                 if cleaned and re.search(r"###\s*\w+", cleaned):
-                    chunk_response += cleaned + "\n\n"
+                    response += cleaned + "\n\n"
     except Exception as e:
-        logger.error(f"Error processing chunk {chunk_idx}: {e}")
-        return chunk_idx, f"--- Analysis for Chunk {chunk_idx} ---\nError occurred: {str(e)}\n\n"
-
-    if chunk_response:
-        with open(chunk_cache_path, "w", encoding="utf-8") as f:
-            f.write(chunk_response)
-        return chunk_idx, f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
-    return chunk_idx, f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
+        logger.error(f"Error processing batch: {e}")
+        return f"Error occurred: {str(e)}"
+
+    if response:
+        with open(batch_cache_path, "w", encoding="utf-8") as f:
+            f.write(response)
+        return response
+    return "No oversights identified in the provided records."
 
 def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
         chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
         file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
-        max_chunks_input = gr.Slider(minimum=1, maximum=50, value=5, step=1, label="Max Chunks to Analyze")
         msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
         send_btn = gr.Button("Analyze", variant="primary")
         download_output = gr.File(label="Download Full Report")
 
         prompt_template = """
-You are a medical analysis assistant. Analyze the following patient record excerpt for clinical oversights and provide a concise, evidence-based summary in markdown format. Group findings under appropriate headings based on the tool used (e.g., drug-related findings under 'Drugs'). For each finding, include:
+You are a medical analysis assistant. Analyze the following patient record excerpts for clinical oversights and provide a concise, evidence-based summary in markdown format. Group findings under the following headings: 'Drugs', 'Missed Diagnoses', 'Medication Conflicts', 'Incomplete Assessments', 'Urgent Follow-up'. For each finding, include:
 - Clinical context (why the issue was missed or relevant details from the record).
 - Potential risks if unaddressed (e.g., disease progression, adverse events).
 - Actionable recommendations (e.g., tests, referrals, medication adjustments).
-Output ONLY the markdown-formatted findings, with bullet points under each heading. Precede each finding with a tool tag (e.g., [TOOL: get_abuse_info_by_drug_name]) to indicate the tool used. Do NOT include reasoning, tool calls, or intermediate steps. If no issues are found for a tool or category, state "No issues identified" for that section. Ensure the output is specific to the provided text and avoids generic responses.
+Output ONLY the markdown-formatted findings, with bullet points under each heading. Do NOT include tool references, reasoning, or intermediate steps. If no issues are found for a section, omit that section. Ensure the output is specific to the provided text and avoids generic responses.
 
 Example Output:
 ### Drugs
-[TOOL: get_abuse_info_by_drug_name]
-- [Finding placeholder for drug-related issue]
+- Opioid use disorder not addressed. Missed due to lack of screening. Risks: overdose. Recommend: addiction specialist referral.
 ### Missed Diagnoses
-- [Finding placeholder for missed diagnosis]
+- Elevated BP noted without diagnosis. Missed due to inconsistent visits. Risks: stroke. Recommend: BP monitoring, antihypertensives.
 ### Incomplete Assessments
-- [Finding placeholder for incomplete assessment]
+- Chest pain not evaluated. Time constraints likely cause. Risks: cardiac issues. Recommend: ECG, stress test.
 ### Urgent Follow-up
-- [Finding placeholder for urgent follow-up]
+- Abnormal creatinine not addressed. Delayed lab review. Risks: renal failure. Recommend: nephrology referral.
 
-Patient Record Excerpt (Chunk {0} of {1}):
-{chunk}
+Patient Record Excerpts:
+{chunks}
 """
 
-        def analyze(message: str, history: List[dict], files: List, max_chunks: int):
+        def analyze(message: str, history: List[dict], files: List):
             history.append({"role": "user", "content": message})
             history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
             yield history, None
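
How process_batch assembles the single prompt, sketched standalone (illustrative, not part of the commit): empty chunks are filtered out, but labels keep the original indices, so numbering can skip.

    chunks = ["text a", "", "text b"]
    chunk_texts = [f"Chunk {i+1}:\n{chunk[:500]}" for i, chunk in enumerate(chunks) if chunk.strip()]
    print("\n\n".join(chunk_texts))
    # Chunk 1:
    # text a
    #
    # Chunk 3:
    # text b
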
@@ -330,39 +315,28 @@ Patient Record Excerpt (Chunk {0} of {1}):
             history.append({"role": "assistant", "content": "✅ Text extraction complete."})
             yield history, None
 
-            chunk_size = 1000  # Reduced for speed
+            chunk_size = 500  # Fixed for speed
+            max_chunks = 5  # Fixed for speed
             chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
-            chunks = chunks[:max_chunks]  # Limit to max_chunks
-            total_chunks = len(chunks)
-            combined_response = ""
-
+            chunks = chunks[:max_chunks]  # Limit to 5 chunks
             if not chunks:
                 history.append({"role": "assistant", "content": "No content to analyze."})
                 yield history, None
                 return
 
             try:
-                # Sequential processing to avoid VLLM error
-                for chunk_idx, chunk in enumerate(chunks, 1):
-                    animation = ["🔍", "📊", "🧠", "🔎"][(int(time.time() * 2) % 4)]
-                    history.append({"role": "assistant", "content": f"Analyzing chunk {chunk_idx}/{total_chunks}... {animation}"})
-                    yield history, None
-
-                    _, chunk_response = process_chunk(agent, chunk, chunk_idx, total_chunks, file_cache_dir, prompt_template)
-                    combined_response += chunk_response
-
-                    history[-1] = {"role": "assistant", "content": combined_response.strip()}
-                    yield history, None
+                animation = ["🔍", "📊", "🧠", "🔎"][(int(time.time() * 2) % 4)]
+                history.append({"role": "assistant", "content": f"Analyzing chunks 1-5... {animation}"})
+                yield history, None
 
-                if combined_response.strip() and not all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
-                    history[-1]["content"] = combined_response.strip()
-                else:
-                    history.append({"role": "assistant", "content": "No oversights identified in the provided records."})
+                response = process_batch(agent, chunks, file_cache_dir, prompt_template)
+                history[-1] = {"role": "assistant", "content": response.strip()}
+                yield history, None
 
                 report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
-                if report_path:
+                if report_path and response.strip() and "No oversights identified" not in response and "Error occurred" not in response:
                     with open(report_path, "w", encoding="utf-8") as f:
-                        f.write(combined_response)
+                        f.write(response)
                 yield history, report_path if report_path and os.path.exists(report_path) else None
 
             except Exception as e:
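
The report file is now written only when the batch produced real findings. The new gate, restated as a standalone predicate (hypothetical helper, not in app.py):

    def should_write_report(response: str) -> bool:
        return (bool(response.strip())
                and "No oversights identified" not in response
                and "Error occurred" not in response)

    assert should_write_report("### Drugs\n- finding")
    assert not should_write_report("Error occurred: timeout")
    assert not should_write_report("No oversights identified in the provided records.")
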
@@ -370,8 +344,8 @@ Patient Record Excerpt (Chunk {0} of {1}):
                 history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
                 yield history, None
 
-        send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload, max_chunks_input], outputs=[chatbot, download_output])
-        msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload, max_chunks_input], outputs=[chatbot, download_output])
+        send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
+        msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
         return demo
 
 if __name__ == "__main__":
 