Ali2206 committed
Commit 722c891 · verified · 1 Parent(s): fddf521

Update app.py

Files changed (1):
  1. app.py +20 -41
app.py CHANGED
@@ -52,51 +52,41 @@ def file_hash(path: str) -> str:
         return hashlib.md5(f.read()).hexdigest()
 
 def extract_priority_pages(file_path: str, max_pages: int = 20) -> str:
-    """Fast extraction of first pages and medically relevant sections"""
     try:
         text_chunks = []
         with pdfplumber.open(file_path) as pdf:
-            # Always process first 3 pages
             for i, page in enumerate(pdf.pages[:3]):
                 text_chunks.append(f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}")
-
-            # Scan subsequent pages for medical keywords
             for i, page in enumerate(pdf.pages[3:max_pages], start=4):
                 page_text = page.extract_text() or ""
                 if any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
                     text_chunks.append(f"=== Page {i} ===\n{page_text.strip()}")
-
         return "\n\n".join(text_chunks)
     except Exception as e:
         return f"PDF processing error: {str(e)}"
 
 def convert_file_to_json(file_path: str, file_type: str) -> str:
-    """Optimized file conversion with medical focus"""
     try:
         h = file_hash(file_path)
         cache_path = os.path.join(file_cache_dir, f"{h}.json")
-
+
         if os.path.exists(cache_path):
             return open(cache_path, "r", encoding="utf-8").read()
 
         if file_type == "pdf":
-            # Fast initial processing
             text = extract_priority_pages(file_path)
             result = json.dumps({
                 "filename": os.path.basename(file_path),
                 "content": text,
                 "status": "initial"
             })
-
-            # Start background full processing
             Thread(target=full_pdf_processing, args=(file_path, h)).start()
-
+
         elif file_type == "csv":
-            df = pd.read_csv(file_path, encoding_errors="replace", header=None,
-                             dtype=str, skip_blank_lines=False, on_bad_lines="skip")
+            df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
             content = df.fillna("").astype(str).values.tolist()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-
+
         elif file_type in ["xls", "xlsx"]:
             try:
                 df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
@@ -104,44 +94,41 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
             content = df.fillna("").astype(str).values.tolist()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-
+
         else:
             return json.dumps({"error": f"Unsupported file type: {file_type}"})
 
         with open(cache_path, "w", encoding="utf-8") as f:
             f.write(result)
         return result
-
+
     except Exception as e:
         return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
 
 def full_pdf_processing(file_path: str, file_hash: str):
-    """Background full PDF processing"""
     try:
         cache_path = os.path.join(file_cache_dir, f"{file_hash}_full.json")
         if os.path.exists(cache_path):
             return
 
         with pdfplumber.open(file_path) as pdf:
-            full_text = "\n".join([f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}"
-                                   for i, page in enumerate(pdf.pages)])
-
+            full_text = "\n".join([f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}" for i, page in enumerate(pdf.pages)])
+
         result = json.dumps({
             "filename": os.path.basename(file_path),
             "content": full_text,
             "status": "complete"
         })
-
+
         with open(cache_path, "w", encoding="utf-8") as f:
             f.write(result)
     except Exception as e:
         print(f"Background processing failed: {str(e)}")
 
 def init_agent():
-    """Initialize TxAgent with medical analysis focus"""
     default_tool_path = os.path.abspath("data/new_tool.json")
     target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
-
+
     if not os.path.exists(target_tool_path):
         shutil.copy(default_tool_path, target_tool_path)
 
@@ -153,8 +140,7 @@ def init_agent():
         enable_checker=True,
         step_rag_num=8,
         seed=100,
-        additional_default_tools=[],
-
+        additional_default_tools=[]
     )
     agent.init_model()
     return agent
@@ -164,7 +150,7 @@ def create_ui(agent: TxAgent):
     gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
     gr.Markdown("<h3 style='text-align: center;'>Identify potential oversights in patient care</h3>")
 
-    chatbot = gr.Chatbot(label="Analysis", height=600)
+    chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
     file_upload = gr.File(
         label="Upload Medical Records",
         file_types=[".pdf", ".csv", ".xls", ".xlsx"],
@@ -179,16 +165,13 @@ def create_ui(agent: TxAgent):
             try:
                 history.append((message, "Analyzing records for potential oversights..."))
                 yield history
-
-                # Process files
+
                 extracted_data = ""
                 if files:
                     with ThreadPoolExecutor(max_workers=4) as executor:
-                        futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower())
-                                   for f in files if hasattr(f, 'name')]
+                        futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower()) for f in files if hasattr(f, 'name')]
                         extracted_data = "\n".join([sanitize_utf8(f.result()) for f in as_completed(futures)])
 
-                # Medical oversight analysis prompt
                 analysis_prompt = """Review these medical records and identify EXACTLY what might have been missed:
 1. List potential missed diagnoses
 2. Flag any medication conflicts
@@ -203,14 +186,13 @@ Provide ONLY the potential oversights in this format:
 ### Potential Oversights:
 1. [Missed diagnosis] - [Evidence from records]
 2. [Medication issue] - [Supporting data]
-3. [Assessment gap] - [Relevant findings]""".format(records=extracted_data[:15000])  # Limit input size
+3. [Assessment gap] - [Relevant findings]""".format(records=extracted_data[:15000])
 
-                # Generate analysis
                 response = []
                 for chunk in agent.run_gradio_chat(
                     message=analysis_prompt,
                     history=[],
-                    temperature=0.2,  # More deterministic
+                    temperature=0.2,
                     max_new_tokens=1024,
                     max_token=4096,
                     call_agent=False,
@@ -220,17 +202,15 @@ Provide ONLY the potential oversights in this format:
                         response.append(chunk)
                     elif isinstance(chunk, list):
                         response.extend([c.content for c in chunk if hasattr(c, 'content')])
-
-                    if len(response) % 3 == 0:  # Update every 3 chunks
+
+                    if len(response) % 3 == 0:
                         history[-1] = (message, "".join(response).strip())
                         yield history
 
-                # Finalize output
                 final_output = "".join(response).strip()
                 if not final_output:
                     final_output = "No clear oversights identified. Recommend comprehensive review."
 
-                # Format as bullet points if not already
                 if not final_output.startswith(("1.", "-", "*", "#")):
                     final_output = "• " + final_output.replace("\n", "\n• ")
 
@@ -242,7 +222,6 @@ Provide ONLY the potential oversights in this format:
                 history.append((message, f"❌ Analysis failed: {str(e)}"))
                 yield history
 
-    # UI event handlers
     inputs = [msg_input, chatbot, conversation_state, file_upload]
     send_btn.click(analyze_potential_oversights, inputs=inputs, outputs=chatbot)
     msg_input.submit(analyze_potential_oversights, inputs=inputs, outputs=chatbot)
@@ -258,10 +237,10 @@ Provide ONLY the potential oversights in this format:
 if __name__ == "__main__":
     print("Initializing medical analysis agent...")
     agent = init_agent()
-
+
     print("Launching interface...")
     demo = create_ui(agent)
-    demo.queue(concurrency_count=2).launch(
+    demo.queue().launch(
         server_name="0.0.0.0",
         server_port=7860,
         show_error=True,
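Note on the PDF path: convert_file_to_json returns a fast "initial" extraction immediately while full_pdf_processing fills a separate {hash}_full.json cache from a background thread, and nothing in this diff reads that cache back. A minimal sketch of a reader, assuming the file_cache_dir, file_hash, and convert_file_to_json defined above (load_best_pdf_json itself is hypothetical, not part of app.py):

    # Hypothetical helper: prefer the complete background extraction once
    # full_pdf_processing has written it; otherwise fall back to the fast
    # priority-page result.
    def load_best_pdf_json(file_path: str) -> str:
        h = file_hash(file_path)
        full_path = os.path.join(file_cache_dir, f"{h}_full.json")
        if os.path.exists(full_path):
            with open(full_path, "r", encoding="utf-8") as f:
                return f.read()
        return convert_file_to_json(file_path, "pdf")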
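Note on the chatbot change: with type="messages", gr.Chatbot expects history entries as role/content dicts, but analyze_potential_oversights still appends (message, reply) tuples. A sketch of the dict form those updates would take, assuming Gradio's messages format:

    # Sketch: under type="messages", history holds dicts rather than tuples.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": "Analyzing records for potential oversights..."})
    yield history

    # Streaming updates then rewrite the last assistant message in place:
    history[-1] = {"role": "assistant", "content": "".join(response).strip()}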
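Note on the launch change: Gradio 4.x removed the concurrency_count argument from Blocks.queue(), so demo.queue(concurrency_count=2) raises a TypeError there; dropping the argument restores the default queue. If the old cap of two concurrent runs is still wanted, a sketch assuming Gradio 4.x:

    # Sketch, assuming Gradio 4.x: cap concurrency via default_concurrency_limit
    # on queue(), or per event with concurrency_limit on .click()/.submit().
    demo.queue(default_concurrency_limit=2).launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )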