Ali2206 committed on
Commit
a58b5f7
·
verified ·
1 Parent(s): 51aebc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -97
app.py CHANGED
@@ -47,6 +47,9 @@ def file_hash(path: str) -> str:
47
  with open(path, "rb") as f:
48
  return hashlib.md5(f.read()).hexdigest()
49
 
 
 
 
50
  def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
51
  """Extract text from a range of PDF pages."""
52
  try:
@@ -68,17 +71,14 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
68
  if total_pages == 0:
69
  return ""
70
 
71
- # Use 6 processes (adjust based on CPU cores)
72
  num_processes = min(6, multiprocessing.cpu_count())
73
  pages_per_process = max(1, total_pages // num_processes)
74
 
75
- # Create page ranges for parallel processing
76
  ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
77
  for i in range(num_processes)]
78
  if ranges[-1][1] != total_pages:
79
  ranges[-1] = (ranges[-1][0], total_pages)
80
 
81
- # Process page ranges in parallel
82
  with multiprocessing.Pool(processes=num_processes) as pool:
83
  extract_func = partial(extract_page_range, file_path)
84
  results = []
@@ -141,22 +141,17 @@ def log_system_usage(tag=""):
141
  def clean_response(text: str) -> str:
142
  """Clean TxAgent response to group findings under tool-derived headings."""
143
  text = sanitize_utf8(text)
144
- # Remove tool call artifacts, None, and reasoning
145
  text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
146
- # Remove extra whitespace and non-markdown content
147
  text = re.sub(r"\n{3,}", "\n\n", text)
148
- text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text) # Keep markdown-relevant characters
149
 
150
- # Define tool-to-heading mapping
151
  tool_to_heading = {
152
  "get_abuse_info_by_drug_name": "Drugs",
153
  "get_dependence_info_by_drug_name": "Drugs",
154
  "get_abuse_types_and_related_adverse_reactions_and_controlled_substance_status_by_drug_name": "Drugs",
155
  "get_info_for_patients_by_drug_name": "Drugs",
156
- # Add other tools from new_tool.json if applicable
157
  }
158
 
159
- # Parse sections and findings
160
  sections = {}
161
  current_section = None
162
  current_tool = None
@@ -165,22 +160,18 @@ def clean_response(text: str) -> str:
165
  line = line.strip()
166
  if not line:
167
  continue
168
- # Detect tool tag
169
  tool_match = re.match(r"\[TOOL:\s*(\w+)\]", line)
170
  if tool_match:
171
  current_tool = tool_match.group(1)
172
  continue
173
- # Detect section heading
174
- section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", line)
175
  if section_match:
176
  current_section = section_match.group(1)
177
  if current_section not in sections:
178
  sections[current_section] = []
179
  continue
180
- # Detect finding
181
  finding_match = re.match(r"-\s*.+", line)
182
  if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
183
- # Assign to tool-derived heading if tool is specified
184
  if current_tool and current_tool in tool_to_heading:
185
  heading = tool_to_heading[current_tool]
186
  if heading not in sections:
@@ -189,15 +180,14 @@ def clean_response(text: str) -> str:
189
  else:
190
  sections[current_section].append(line)
191
 
192
- # Combine non-empty sections
193
  cleaned = []
194
  for heading, findings in sections.items():
195
- if findings: # Only include sections with findings
196
  cleaned.append(f"### {heading}\n" + "\n".join(findings))
197
 
198
  text = "\n\n".join(cleaned).strip()
199
  if not text:
200
- text = "" # Return empty string if no valid findings
201
  return text
202
 
203
  def init_agent():
@@ -214,7 +204,7 @@ def init_agent():
214
  tool_files_dict={"new_tool": target_tool_path},
215
  force_finish=True,
216
  enable_checker=True,
217
- step_rag_num=4,
218
  seed=100,
219
  additional_default_tools=[],
220
  )
@@ -223,16 +213,77 @@ def init_agent():
223
  print("✅ Agent Ready")
224
  return agent
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  def create_ui(agent):
227
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
228
  gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
229
  chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
230
  file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
 
231
  msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
232
  send_btn = gr.Button("Analyze", variant="primary")
233
  download_output = gr.File(label="Download Full Report")
234
 
235
- def analyze(message: str, history: List[dict], files: List):
236
  history.append({"role": "user", "content": message})
237
  history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
238
  yield history, None
@@ -240,7 +291,6 @@ def create_ui(agent):
240
  extracted = ""
241
  file_hash_value = ""
242
  if files:
243
- # Progress callback for extraction
244
  total_pages = 0
245
  processed_pages = 0
246
  def update_extraction_progress(current, total):
@@ -257,94 +307,46 @@ def create_ui(agent):
257
  extracted = "\n".join(results)
258
  file_hash_value = file_hash(files[0].name) if files else ""
259
 
260
- history.pop() # Remove extraction message
261
  history.append({"role": "assistant", "content": "✅ Text extraction complete."})
262
  yield history, None
263
 
264
- # Split extracted text into chunks of ~6,000 characters
265
- chunk_size = 6000
266
  chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
 
 
267
  combined_response = ""
268
 
269
- prompt_template = """
270
- You are a medical analysis assistant. Analyze the following patient record excerpt for clinical oversights and provide a concise, evidence-based summary in markdown format. Group findings under appropriate headings based on the tool used (e.g., drug-related findings under 'Drugs'). For each finding, include:
271
- - Clinical context (why the issue was missed or relevant details from the record).
272
- - Potential risks if unaddressed (e.g., disease progression, adverse events).
273
- - Actionable recommendations (e.g., tests, referrals, medication adjustments).
274
- Output ONLY the markdown-formatted findings, with bullet points under each heading. Precede each finding with a tool tag (e.g., [TOOL: get_abuse_info_by_drug_name]) to indicate the tool used. Do NOT include reasoning, tool calls, or intermediate steps. If no issues are found for a tool or category, state "No issues identified" for that section. Ensure the output is specific to the provided text and avoids generic responses.
275
-
276
- Example Output:
277
- ### Drugs
278
- [TOOL: get_abuse_info_by_drug_name]
279
- - Opioid use disorder not addressed. Missed due to lack of screening. Risks: overdose. Recommend: addiction specialist referral.
280
- ### Missed Diagnoses
281
- - Elevated BP noted without diagnosis. Missed due to inconsistent visits. Risks: stroke. Recommend: BP monitoring, antihypertensives.
282
- ### Incomplete Assessments
283
- - Chest pain not evaluated. Time constraints likely cause. Risks: cardiac issues. Recommend: ECG, stress test.
284
- ### Urgent Follow-up
285
- - Abnormal creatinine not addressed. Delayed lab review. Risks: renal failure. Recommend: nephrology referral.
286
-
287
- Patient Record Excerpt (Chunk {0} of {1}):
288
- {chunk}
289
- """
290
 
291
  try:
292
- # Process each chunk and stream results in real-time
293
- for chunk_idx, chunk in enumerate(chunks, 1):
294
- # Update UI with chunk progress
295
- animation = ["🔍", "📊", "🧠", "🔎"][(int(time.time() * 2) % 4)]
296
- history.append({"role": "assistant", "content": f"Analyzing records... {animation} Chunk {chunk_idx}/{len(chunks)}"})
297
- yield history, None
298
-
299
- prompt = prompt_template.format(chunk_idx, len(chunks), chunk=chunk[:4000]) # Truncate to avoid token limits
300
- chunk_response = ""
301
- for chunk_output in agent.run_gradio_chat(
302
- message=prompt,
303
- history=[],
304
- temperature=0.2,
305
- max_new_tokens=1024,
306
- max_token=4096,
307
- call_agent=False,
308
- conversation=[],
309
- ):
310
- if chunk_output is None:
311
- continue
312
- if isinstance(chunk_output, list):
313
- for m in chunk_output:
314
- if hasattr(m, 'content') and m.content:
315
- cleaned = clean_response(m.content)
316
- if cleaned and re.search(r"###\s*\w+", cleaned):
317
- chunk_response += cleaned + "\n\n"
318
- # Update UI with partial response
319
- if history[-1]["content"].startswith("Analyzing"):
320
- history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
321
- else:
322
- history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
323
- yield history, None
324
- elif isinstance(chunk_output, str) and chunk_output.strip():
325
- cleaned = clean_response(chunk_output)
326
- if cleaned and re.search(r"###\s*\w+", cleaned):
327
- chunk_response += cleaned + "\n\n"
328
- # Update UI with partial response
329
- if history[-1]["content"].startswith("Analyzing"):
330
- history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
331
- else:
332
- history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
333
- yield history, None
334
-
335
- # Append completed chunk response to combined response
336
- if chunk_response:
337
- combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
338
- else:
339
- combined_response += f"--- Analysis for Chunk {chunk_idx} ---\nNo oversights identified for this chunk.\n\n"
340
-
341
- # Finalize UI with complete response
342
  if combined_response.strip() and not all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
343
  history[-1]["content"] = combined_response.strip()
344
  else:
345
  history.append({"role": "assistant", "content": "No oversights identified in the provided records."})
346
 
347
- # Generate report file
348
  report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
349
  if report_path:
350
  with open(report_path, "w", encoding="utf-8") as f:
@@ -356,8 +358,8 @@ Patient Record Excerpt (Chunk {0} of {1}):
356
  history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
357
  yield history, None
358
 
359
- send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
360
- msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
361
  return demo
362
 
363
  if __name__ == "__main__":
 
47
  with open(path, "rb") as f:
48
  return hashlib.md5(f.read()).hexdigest()
49
 
50
+ def chunk_hash(chunk: str) -> str:
51
+ return hashlib.md5(chunk.encode("utf-8")).hexdigest()
52
+
53
  def extract_page_range(file_path: str, start_page: int, end_page: int) -> str:
54
  """Extract text from a range of PDF pages."""
55
  try:
 
71
  if total_pages == 0:
72
  return ""
73
 
 
74
  num_processes = min(6, multiprocessing.cpu_count())
75
  pages_per_process = max(1, total_pages // num_processes)
76
 
 
77
  ranges = [(i * pages_per_process, min((i + 1) * pages_per_process, total_pages))
78
  for i in range(num_processes)]
79
  if ranges[-1][1] != total_pages:
80
  ranges[-1] = (ranges[-1][0], total_pages)
81
 
 
82
  with multiprocessing.Pool(processes=num_processes) as pool:
83
  extract_func = partial(extract_page_range, file_path)
84
  results = []
 
141
  def clean_response(text: str) -> str:
142
  """Clean TxAgent response to group findings under tool-derived headings."""
143
  text = sanitize_utf8(text)
 
144
  text = re.sub(r"\[.*?\]|\bNone\b|To analyze the patient record excerpt.*?medications\.|Since the previous attempts.*?\.|I need to.*?medications\.|Retrieving tools.*?\.", "", text, flags=re.DOTALL)
 
145
  text = re.sub(r"\n{3,}", "\n\n", text)
146
+ text = re.sub(r"[^\n#\-\*\w\s\.\,\:\(\)]+", "", text)
147
 
 
148
  tool_to_heading = {
149
  "get_abuse_info_by_drug_name": "Drugs",
150
  "get_dependence_info_by_drug_name": "Drugs",
151
  "get_abuse_types_and_related_adverse_reactions_and_controlled_substance_status_by_drug_name": "Drugs",
152
  "get_info_for_patients_by_drug_name": "Drugs",
 
153
  }
154
 
 
155
  sections = {}
156
  current_section = None
157
  current_tool = None
 
160
  line = line.strip()
161
  if not line:
162
  continue
 
163
  tool_match = re.match(r"\[TOOL:\s*(\w+)\]", line)
164
  if tool_match:
165
  current_tool = tool_match.group(1)
166
  continue
167
+ section_match = re.match(r"###\s*(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up|Drugs)", line)
 
168
  if section_match:
169
  current_section = section_match.group(1)
170
  if current_section not in sections:
171
  sections[current_section] = []
172
  continue
 
173
  finding_match = re.match(r"-\s*.+", line)
174
  if finding_match and current_section and not re.match(r"-\s*No issues identified", line):
 
175
  if current_tool and current_tool in tool_to_heading:
176
  heading = tool_to_heading[current_tool]
177
  if heading not in sections:
 
180
  else:
181
  sections[current_section].append(line)
182
 
 
183
  cleaned = []
184
  for heading, findings in sections.items():
185
+ if findings:
186
  cleaned.append(f"### {heading}\n" + "\n".join(findings))
187
 
188
  text = "\n\n".join(cleaned).strip()
189
  if not text:
190
+ text = ""
191
  return text
192
 
193
  def init_agent():
 
204
  tool_files_dict={"new_tool": target_tool_path},
205
  force_finish=True,
206
  enable_checker=True,
207
+ step_rag_num=2, # Reduced for speed
208
  seed=100,
209
  additional_default_tools=[],
210
  )
 
213
  print("✅ Agent Ready")
214
  return agent
215
 
216
+ def process_chunk(agent, chunk: str, chunk_idx: int, total_chunks: int, cache_path: str) -> str:
217
+ """Process a single chunk and cache the result."""
218
+ chunk_id = chunk_hash(chunk)
219
+ chunk_cache_path = os.path.join(file_cache_dir, f"chunk_{chunk_id}.txt")
220
+
221
+ if os.path.exists(chunk_cache_path):
222
+ with open(chunk_cache_path, "r", encoding="utf-8") as f:
223
+ return f.read()
224
+
225
+ prompt_template = """
226
+ You are a medical analysis assistant. Analyze the following patient record excerpt for clinical oversights and provide a concise, evidence-based summary in markdown format. Group findings under appropriate headings based on the tool used (e.g., drug-related findings under 'Drugs'). For each finding, include:
227
+ - Clinical context (why the issue was missed or relevant details from the record).
228
+ - Potential risks if unaddressed (e.g., disease progression, adverse events).
229
+ - Actionable recommendations (e.g., tests, referrals, medication adjustments).
230
+ Output ONLY the markdown-formatted findings, with bullet points under each heading. Precede each finding with a tool tag (e.g., [TOOL: get_abuse_info_by_drug_name]) to indicate the tool used. Do NOT include reasoning, tool calls, or intermediate steps. If no issues are found for a tool or category, state "No issues identified" for that section. Ensure the output is specific to the provided text and avoids generic responses.
231
+
232
+ Example Output:
233
+ ### Drugs
234
+ [TOOL: get_abuse_info_by_drug_name]
235
+ - [Finding placeholder for drug-related issue]
236
+ ### Missed Diagnoses
237
+ - [Finding placeholder for missed diagnosis]
238
+ ### Incomplete Assessments
239
+ - [Finding placeholder for incomplete assessment]
240
+ ### Urgent Follow-up
241
+ - [Finding placeholder for urgent follow-up]
242
+
243
+ Patient Record Excerpt (Chunk {0} of {1}):
244
+ {chunk}
245
+ """
246
+ prompt = prompt_template.format(chunk_idx, total_chunks, chunk=chunk[:2000]) # Truncate to avoid token limits
247
+ chunk_response = ""
248
+
249
+ for chunk_output in agent.run_gradio_chat(
250
+ message=prompt,
251
+ history=[],
252
+ temperature=0.2,
253
+ max_new_tokens=512, # Reduced for speed
254
+ max_token=2048, # Reduced for speed
255
+ call_agent=False,
256
+ conversation=[],
257
+ ):
258
+ if chunk_output is None:
259
+ continue
260
+ if isinstance(chunk_output, list):
261
+ for m in chunk_output:
262
+ if hasattr(m, 'content') and m.content:
263
+ cleaned = clean_response(m.content)
264
+ if cleaned and re.search(r"###\s*\w+", cleaned):
265
+ chunk_response += cleaned + "\n\n"
266
+ elif isinstance(chunk_output, str) and chunk_output.strip():
267
+ cleaned = clean_response(chunk_output)
268
+ if cleaned and re.search(r"###\s*\w+", cleaned):
269
+ chunk_response += cleaned + "\n\n"
270
+
271
+ if chunk_response:
272
+ with open(chunk_cache_path, "w", encoding="utf-8") as f:
273
+ f.write(chunk_response)
274
+ return chunk_response
275
+
276
  def create_ui(agent):
277
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
278
  gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
279
  chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
280
  file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
281
+ max_chunks_input = gr.Slider(minimum=1, maximum=50, value=10, step=1, label="Max Chunks to Analyze")
282
  msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
283
  send_btn = gr.Button("Analyze", variant="primary")
284
  download_output = gr.File(label="Download Full Report")
285
 
286
+ def analyze(message: str, history: List[dict], files: List, max_chunks: int):
287
  history.append({"role": "user", "content": message})
288
  history.append({"role": "assistant", "content": "⏳ Extracting text from files..."})
289
  yield history, None
 
291
  extracted = ""
292
  file_hash_value = ""
293
  if files:
 
294
  total_pages = 0
295
  processed_pages = 0
296
  def update_extraction_progress(current, total):
 
307
  extracted = "\n".join(results)
308
  file_hash_value = file_hash(files[0].name) if files else ""
309
 
310
+ history.pop()
311
  history.append({"role": "assistant", "content": "✅ Text extraction complete."})
312
  yield history, None
313
 
314
+ chunk_size = 2000 # Reduced for speed
 
315
  chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
316
+ chunks = chunks[:max_chunks] # Limit to max_chunks
317
+ total_chunks = len(chunks)
318
  combined_response = ""
319
 
320
+ if not chunks:
321
+ history.append({"role": "assistant", "content": "No content to analyze."})
322
+ yield history, None
323
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  try:
326
+ with ThreadPoolExecutor(max_workers=4) as executor: # Parallel processing
327
+ futures = []
328
+ for chunk_idx, chunk in enumerate(chunks, 1):
329
+ futures.append(executor.submit(process_chunk, agent, chunk, chunk_idx, total_chunks, file_cache_dir))
330
+
331
+ for idx, future in enumerate(as_completed(futures)):
332
+ chunk_response = future.result()
333
+ animation = ["🔍", "📊", "🧠", "🔎"][(int(time.time() * 2) % 4)]
334
+ history.append({"role": "assistant", "content": f"Analyzing chunks... {animation} {idx + 1}/{total_chunks}"})
335
+ yield history, None
336
+
337
+ if chunk_response:
338
+ combined_response += f"--- Analysis for Chunk {idx + 1} ---\n{chunk_response}\n"
339
+ else:
340
+ combined_response += f"--- Analysis for Chunk {idx + 1} ---\nNo oversights identified for this chunk.\n\n"
341
+
342
+ history[-1] = {"role": "assistant", "content": combined_response.strip()}
343
+ yield history, None
344
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  if combined_response.strip() and not all("No oversights identified" in chunk for chunk in combined_response.split("--- Analysis for Chunk")):
346
  history[-1]["content"] = combined_response.strip()
347
  else:
348
  history.append({"role": "assistant", "content": "No oversights identified in the provided records."})
349
 
 
350
  report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
351
  if report_path:
352
  with open(report_path, "w", encoding="utf-8") as f:
 
358
  history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
359
  yield history, None
360
 
361
+ send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload, max_chunks_input], outputs=[chatbot, download_output])
362
+ msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload, max_chunks_input], outputs=[chatbot, download_output])
363
  return demo
364
 
365
  if __name__ == "__main__":