Ali2206 committed on
Commit
6af3907
·
verified ·
1 Parent(s): f858e79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -105
app.py CHANGED
@@ -1,4 +1,5 @@
1
-
 
2
  import pandas as pd
3
  import pdfplumber
4
  import json
@@ -8,10 +9,10 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
8
  import hashlib
9
  import shutil
10
  import time
11
- from functools import lru_cache
12
- from threading import Thread
13
  import re
14
  import tempfile
 
15
 
16
  # Environment setup
17
  current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -59,8 +60,11 @@ def extract_priority_pages(file_path: str, max_pages: int = 20) -> str:
59
  try:
60
  text_chunks = []
61
  with pdfplumber.open(file_path) as pdf:
 
62
  for i, page in enumerate(pdf.pages[:3]):
63
- text_chunks.append(f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}")
 
 
64
  for i, page in enumerate(pdf.pages[3:max_pages], start=4):
65
  page_text = page.extract_text() or ""
66
  if any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
@@ -74,18 +78,18 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
74
  h = file_hash(file_path)
75
  cache_path = os.path.join(file_cache_dir, f"{h}.json")
76
  if os.path.exists(cache_path):
77
- return open(cache_path, "r", encoding="utf-8").read()
 
78
 
79
  if file_type == "pdf":
80
  text = extract_priority_pages(file_path)
81
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
82
  Thread(target=full_pdf_processing, args=(file_path, h)).start()
83
-
84
  elif file_type == "csv":
85
- df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
 
86
  content = df.fillna("").astype(str).values.tolist()
87
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
88
-
89
  elif file_type in ["xls", "xlsx"]:
90
  try:
91
  df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
@@ -93,39 +97,40 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
93
  df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
94
  content = df.fillna("").astype(str).values.tolist()
95
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
96
-
97
  else:
98
- return json.dumps({"error": f"Unsupported file type: {file_type}"})
99
-
100
  with open(cache_path, "w", encoding="utf-8") as f:
101
  f.write(result)
102
  return result
103
-
104
  except Exception as e:
105
  return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
106
 
107
- def full_pdf_processing(file_path: str, file_hash: str):
108
  try:
109
- cache_path = os.path.join(file_cache_dir, f"{file_hash}_full.json")
110
  if os.path.exists(cache_path):
111
  return
112
  with pdfplumber.open(file_path) as pdf:
113
- full_text = "\n".join([f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}" for i, page in enumerate(pdf.pages)])
 
114
  result = json.dumps({"filename": os.path.basename(file_path), "content": full_text, "status": "complete"})
115
  with open(cache_path, "w", encoding="utf-8") as f:
116
  f.write(result)
117
- with open(os.path.join(report_dir, f"{file_hash}_report.txt"), "w", encoding="utf-8") as out:
118
  out.write(full_text)
119
  except Exception as e:
120
  print(f"Background processing failed: {str(e)}")
121
 
 
 
 
 
122
  def init_agent():
123
  default_tool_path = os.path.abspath("data/new_tool.json")
124
  target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
125
  if not os.path.exists(target_tool_path):
126
  shutil.copy(default_tool_path, target_tool_path)
127
-
128
- agent = TxAgent(
129
  model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
130
  rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
131
  tool_files_dict={"new_tool": target_tool_path},
@@ -135,49 +140,68 @@ def init_agent():
135
  seed=100,
136
  additional_default_tools=[],
137
  )
138
- agent.init_model()
139
- return agent
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- def create_ui(agent: TxAgent):
142
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
143
  gr.Markdown("""
144
  <h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>
145
  <h3 style='text-align: center;'>Identify potential oversights in patient care</h3>
146
  """)
147
-
148
  chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
149
- file_upload = gr.File(label="Upload Medical Records", file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
 
 
150
  msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
151
  send_btn = gr.Button("Analyze", variant="primary")
152
- conversation_state = gr.State([])
153
  download_output = gr.File(label="Download Full Report")
154
 
155
- def analyze_potential_oversights(message: str, history: list, conversation: list, files: list):
156
- start_time = time.time()
157
- try:
158
- # Add initial user and temporary assistant messages to update UI immediately
159
- history = history + [
160
- {"role": "user", "content": message},
161
- {"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."}
162
- ]
 
 
 
 
163
  yield history, None
164
-
165
- extracted_data = ""
166
- file_hash_value = ""
167
- if files and isinstance(files, list):
168
- with ThreadPoolExecutor(max_workers=4) as executor:
169
- futures = [
170
- executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower())
171
- for f in files if hasattr(f, 'name')
172
- ]
173
- extracted_data = "\n".join([sanitize_utf8(f.result()) for f in as_completed(futures)])
174
- file_hash_value = file_hash(files[0].name) if hasattr(files[0], 'name') else ""
175
-
176
- # Truncate extracted data to reduce overall token count (tune the character limit as needed)
177
- max_extracted_chars = 12000
178
- truncated_data = extracted_data[:max_extracted_chars]
179
-
180
- analysis_prompt = f"""Review these medical records and identify EXACTLY what might have been missed:
 
 
 
 
181
  1. List potential missed diagnoses
182
  2. Flag any medication conflicts
183
  3. Note incomplete assessments
@@ -188,68 +212,65 @@ Medical Records:
188
 
189
  ### Potential Oversights:
190
  """
191
- response = ""
192
- try:
193
- # Stream the agent responses; skip any None chunks
194
- for chunk in agent.run_gradio_chat(
195
- message=analysis_prompt,
196
- history=[],
197
- temperature=0.2,
198
- max_new_tokens=1024,
199
- max_token=4096,
200
- call_agent=False,
201
- conversation=conversation
202
- ):
203
- if chunk is None:
204
- continue
205
- if isinstance(chunk, str):
206
- response += chunk
207
- elif isinstance(chunk, list):
208
- response += "".join([c.content for c in chunk if hasattr(c, 'content')])
209
- # Yield partial response updates
210
- cleaned = response.replace("[TOOL_CALLS]", "").strip()
211
- yield history[:-1] + [{"role": "assistant", "content": cleaned}], None
212
- except Exception as agent_error:
213
- history.append({"role": "assistant", "content": f"❌ Analysis failed during processing: {str(agent_error)}"})
214
- yield history, None
215
- return
216
-
217
- final_output = response.replace("[TOOL_CALLS]", "").strip()
218
- if not final_output:
219
- final_output = "No clear oversights identified. Recommend comprehensive review."
220
-
221
- report_path = None
222
- if file_hash_value:
223
- possible_report = os.path.join(report_dir, f"{file_hash_value}_report.txt")
224
- if os.path.exists(possible_report):
225
- report_path = possible_report
226
-
227
- history = history[:-1] + [{"role": "assistant", "content": final_output}]
228
- yield history, report_path
229
 
230
- except Exception as e:
231
- history.append({"role": "assistant", "content": f"❌ Analysis failed: {str(e)}"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  yield history, None
233
-
234
- inputs = [msg_input, chatbot, conversation_state, file_upload]
235
- outputs = [chatbot, download_output]
236
- send_btn.click(analyze_potential_oversights, inputs=inputs, outputs=outputs)
237
- msg_input.submit(analyze_potential_oversights, inputs=inputs, outputs=outputs)
238
-
239
- gr.Examples([
240
- ["What might have been missed in this patient's treatment?"],
241
- ["Are there any medication conflicts in these records?"],
242
- ["What abnormal results require follow-up?"]
243
- ], inputs=msg_input)
244
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  return demo
246
 
247
  if __name__ == "__main__":
248
- print("Initializing medical analysis agent...")
249
- agent = init_agent()
250
-
251
  print("Launching interface...")
252
- demo = create_ui(agent)
253
  demo.queue(api_open=False).launch(
254
  server_name="0.0.0.0",
255
  server_port=7860,
 
1
+ import sys
2
+ import os
3
  import pandas as pd
4
  import pdfplumber
5
  import json
 
9
  import hashlib
10
  import shutil
11
  import time
12
+ from threading import Thread, Lock
 
13
  import re
14
  import tempfile
15
+ import threading
16
 
17
  # Environment setup
18
  current_dir = os.path.dirname(os.path.abspath(__file__))
 
60
  try:
61
  text_chunks = []
62
  with pdfplumber.open(file_path) as pdf:
63
+ # Process first three pages
64
  for i, page in enumerate(pdf.pages[:3]):
65
+ text = page.extract_text() or ""
66
+ text_chunks.append(f"=== Page {i+1} ===\n{text.strip()}")
67
+ # Check for keywords on later pages and add if found
68
  for i, page in enumerate(pdf.pages[3:max_pages], start=4):
69
  page_text = page.extract_text() or ""
70
  if any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
 
78
  h = file_hash(file_path)
79
  cache_path = os.path.join(file_cache_dir, f"{h}.json")
80
  if os.path.exists(cache_path):
81
+ with open(cache_path, "r", encoding="utf-8") as f:
82
+ return f.read()
83
 
84
  if file_type == "pdf":
85
  text = extract_priority_pages(file_path)
86
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
87
  Thread(target=full_pdf_processing, args=(file_path, h)).start()
 
88
  elif file_type == "csv":
89
+ df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
90
+ skip_blank_lines=False, on_bad_lines="skip")
91
  content = df.fillna("").astype(str).values.tolist()
92
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
 
93
  elif file_type in ["xls", "xlsx"]:
94
  try:
95
  df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
 
97
  df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
98
  content = df.fillna("").astype(str).values.tolist()
99
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
 
100
  else:
101
+ result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
102
  with open(cache_path, "w", encoding="utf-8") as f:
103
  f.write(result)
104
  return result
 
105
  except Exception as e:
106
  return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
107
 
108
def full_pdf_processing(file_path: str, file_hash_value: str):
    """Extract every page of a PDF in the background and cache the result.

    Produces two artifacts keyed by the file's content hash:
      - ``<hash>_full.json`` in ``file_cache_dir``: ``{"filename", "content",
        "status": "complete"}``
      - ``<hash>_report.txt`` in ``report_dir``: the raw extracted text,
        offered to the user as a downloadable report.

    Returns early if the full-text cache already exists. Exceptions are
    printed rather than raised because this runs on a fire-and-forget
    worker thread started by convert_file_to_json.
    """
    try:
        cache_path = os.path.join(file_cache_dir, f"{file_hash_value}_full.json")
        if os.path.exists(cache_path):
            return  # another worker already finished this file
        with pdfplumber.open(file_path) as pdf:
            full_text = "\n".join(
                f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}"
                for i, page in enumerate(pdf.pages)
            )
        result = json.dumps({"filename": os.path.basename(file_path),
                             "content": full_text,
                             "status": "complete"})
        # Publish the cache atomically (temp file in the same directory, then
        # rename): a concurrent reader that checks os.path.exists() must never
        # see a truncated JSON document.
        fd, tmp_path = tempfile.mkstemp(dir=file_cache_dir, suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                f.write(result)
            os.replace(tmp_path, cache_path)
        except Exception:
            # Best-effort cleanup of the orphaned temp file, then re-raise
            # into the outer handler.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
        with open(os.path.join(report_dir, f"{file_hash_value}_report.txt"), "w", encoding="utf-8") as out:
            out.write(full_text)
    except Exception as e:
        print(f"Background processing failed: {str(e)}")
123
 
124
# Shared TxAgent instance; stays None until load_agent_in_background
# finishes initializing the model on the startup thread.
agent = None
# Serializes agent initialization so concurrent callers cannot build
# the model twice.
agent_lock = Lock()
+
128
  def init_agent():
129
  default_tool_path = os.path.abspath("data/new_tool.json")
130
  target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
131
  if not os.path.exists(target_tool_path):
132
  shutil.copy(default_tool_path, target_tool_path)
133
+ new_agent = TxAgent(
 
134
  model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
135
  rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
136
  tool_files_dict={"new_tool": target_tool_path},
 
140
  seed=100,
141
  additional_default_tools=[],
142
  )
143
+ new_agent.init_model()
144
+ return new_agent
145
+
146
def load_agent_in_background():
    """Initialize the module-level ``agent`` exactly once.

    Safe to call from any thread: the lock guarantees a single
    initialization, and repeat calls return immediately.
    """
    global agent
    with agent_lock:
        if agent is not None:
            return  # already initialized by an earlier caller
        print("Initializing agent in background...")
        agent = init_agent()
        print("Agent initialization complete.")

# Kick off model loading immediately so the Gradio UI can come up
# while the weights are still being loaded.
threading.Thread(target=load_agent_in_background, daemon=True).start()
156
 
157
+ def create_ui():
158
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
159
  gr.Markdown("""
160
  <h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>
161
  <h3 style='text-align: center;'>Identify potential oversights in patient care</h3>
162
  """)
 
163
  chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
164
+ file_upload = gr.File(label="Upload Medical Records",
165
+ file_types=[".pdf", ".csv", ".xls", ".xlsx"],
166
+ file_count="multiple")
167
  msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
168
  send_btn = gr.Button("Analyze", variant="primary")
 
169
  download_output = gr.File(label="Download Full Report")
170
 
171
+ def analyze_potential_oversights(message: str, history: list, files: list):
172
+ global agent
173
+ # Append user and interim assistant message
174
+ history = history + [
175
+ {"role": "user", "content": message},
176
+ {"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."}
177
+ ]
178
+ yield history, None
179
+
180
+ if agent is None:
181
+ history.append({"role": "assistant",
182
+ "content": "🕒 The model is still loading. Please wait a moment and try again."})
183
  yield history, None
184
+ return
185
+
186
+ extracted_data = ""
187
+ file_hash_value = ""
188
+ if files and isinstance(files, list):
189
+ with ThreadPoolExecutor(max_workers=4) as executor:
190
+ futures = [
191
+ executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower())
192
+ for f in files if hasattr(f, 'name')
193
+ ]
194
+ results = []
195
+ for future in as_completed(futures):
196
+ results.append(sanitize_utf8(future.result()))
197
+ extracted_data = "\n".join(results)
198
+ file_hash_value = file_hash(files[0].name) if hasattr(files[0], 'name') else ""
199
+
200
+ # Truncate the extracted data to avoid token overflows
201
+ max_extracted_chars = 12000
202
+ truncated_data = extracted_data[:max_extracted_chars]
203
+
204
+ analysis_prompt = f"""Review these medical records and identify EXACTLY what might have been missed:
205
  1. List potential missed diagnoses
206
  2. Flag any medication conflicts
207
  3. Note incomplete assessments
 
212
 
213
  ### Potential Oversights:
214
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
+ response = ""
217
+ try:
218
+ # Stream agent responses and update the last message in the conversation with each chunk.
219
+ for chunk in agent.run_gradio_chat(
220
+ message=analysis_prompt,
221
+ history=[],
222
+ temperature=0.2,
223
+ max_new_tokens=1024,
224
+ max_token=4096,
225
+ call_agent=False,
226
+ conversation=[]
227
+ ):
228
+ if chunk is None:
229
+ continue
230
+ if isinstance(chunk, str):
231
+ response += chunk
232
+ elif isinstance(chunk, list):
233
+ response += "".join([c.content for c in chunk if hasattr(c, 'content')])
234
+ cleaned = response.replace("[TOOL_CALLS]", "").strip()
235
+ # Update the assistant message (last item in history) with the latest accumulated answer
236
+ history[-1] = {"role": "assistant", "content": cleaned}
237
+ yield history, None
238
+ except Exception as agent_error:
239
+ history[-1] = {"role": "assistant",
240
+ "content": f"❌ Analysis failed during processing: {str(agent_error)}"}
241
  yield history, None
242
+ return
243
+
244
+ final_output = response.replace("[TOOL_CALLS]", "").strip()
245
+ if not final_output:
246
+ final_output = "No clear oversights identified. Recommend comprehensive review."
247
+
248
+ # Update the assistant's message with the final output
249
+ history[-1] = {"role": "assistant", "content": final_output}
250
+
251
+ report_path = None
252
+ if file_hash_value:
253
+ possible_report = os.path.join(report_dir, f"{file_hash_value}_report.txt")
254
+ if os.path.exists(possible_report):
255
+ report_path = possible_report
256
+
257
+ yield history, report_path
258
+
259
+ send_btn.click(analyze_potential_oversights,
260
+ inputs=[msg_input, gr.State([]), file_upload],
261
+ outputs=[chatbot, download_output])
262
+ msg_input.submit(analyze_potential_oversights,
263
+ inputs=[msg_input, gr.State([]), file_upload],
264
+ outputs=[chatbot, download_output])
265
+ gr.Examples([["What might have been missed in this patient's treatment?"],
266
+ ["Are there any medication conflicts in these records?"],
267
+ ["What abnormal results require follow-up?"]],
268
+ inputs=msg_input)
269
  return demo
270
 
271
  if __name__ == "__main__":
 
 
 
272
  print("Launching interface...")
273
+ demo = create_ui()
274
  demo.queue(api_open=False).launch(
275
  server_name="0.0.0.0",
276
  server_port=7860,