Ali2206 committed on
Commit
65a2e99
·
verified ·
1 Parent(s): 38b7c69

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -27
app.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
4
  import pdfplumber
5
  import json
6
  import gradio as gr
7
- from typing import List, Optional
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import hashlib
10
  import shutil
@@ -14,25 +14,21 @@ from threading import Thread
14
  import re
15
  import tempfile
16
 
17
- # Environment setup
18
  current_dir = os.path.dirname(os.path.abspath(__file__))
19
  src_path = os.path.abspath(os.path.join(current_dir, "src"))
20
  sys.path.insert(0, src_path)
21
 
22
  # Cache directories
23
  base_dir = "/data"
24
- os.makedirs(base_dir, exist_ok=True)
25
  model_cache_dir = os.path.join(base_dir, "txagent_models")
26
  tool_cache_dir = os.path.join(base_dir, "tool_cache")
27
  file_cache_dir = os.path.join(base_dir, "cache")
28
- report_dir = "/data/reports"
29
  vllm_cache_dir = os.path.join(base_dir, "vllm_cache")
30
 
31
- os.makedirs(model_cache_dir, exist_ok=True)
32
- os.makedirs(tool_cache_dir, exist_ok=True)
33
- os.makedirs(file_cache_dir, exist_ok=True)
34
- os.makedirs(report_dir, exist_ok=True)
35
- os.makedirs(vllm_cache_dir, exist_ok=True)
36
 
37
  os.environ.update({
38
  "TRANSFORMERS_CACHE": model_cache_dir,
@@ -64,7 +60,7 @@ def extract_priority_pages(file_path: str, max_pages: int = 20) -> str:
64
  text_chunks.append(f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}")
65
  for i, page in enumerate(pdf.pages[3:max_pages], start=4):
66
  page_text = page.extract_text() or ""
67
- if any(re.search(rf'\\b{kw}\\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
68
  text_chunks.append(f"=== Page {i} ===\n{page_text.strip()}")
69
  return "\n\n".join(text_chunks)
70
  except Exception as e:
@@ -81,12 +77,10 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
81
  text = extract_priority_pages(file_path)
82
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
83
  Thread(target=full_pdf_processing, args=(file_path, h)).start()
84
-
85
  elif file_type == "csv":
86
  df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
87
  content = df.fillna("").astype(str).values.tolist()
88
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
89
-
90
  elif file_type in ["xls", "xlsx"]:
91
  try:
92
  df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
@@ -94,7 +88,6 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
94
  df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
95
  content = df.fillna("").astype(str).values.tolist()
96
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
97
-
98
  else:
99
  return json.dumps({"error": f"Unsupported file type: {file_type}"})
100
 
@@ -154,20 +147,21 @@ def create_ui(agent: TxAgent):
154
  download_output = gr.File(label="Download Full Report")
155
 
156
  def analyze_potential_oversights(message: str, history: list, conversation: list, files: list):
157
- start_time = time.time()
158
  try:
159
- history = history + [{"role": "user", "content": message}, {"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."}]
 
160
  yield history, None
161
 
162
  extracted_data = ""
163
  file_hash_value = ""
 
164
  if files and isinstance(files, list):
165
  with ThreadPoolExecutor(max_workers=4) as executor:
166
  futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower()) for f in files if hasattr(f, 'name')]
167
  extracted_data = "\n".join([sanitize_utf8(f.result()) for f in as_completed(futures)])
168
  file_hash_value = file_hash(files[0].name) if files else ""
169
 
170
- analysis_prompt = f"""Review these medical records and identify EXACTLY what might have been missed:
171
  1. List potential missed diagnoses
172
  2. Flag any medication conflicts
173
  3. Note incomplete assessments
@@ -177,9 +171,9 @@ Medical Records:\n{extracted_data[:15000]}
177
 
178
  ### Potential Oversights:\n"""
179
 
180
- response = ""
181
  for chunk in agent.run_gradio_chat(
182
- message=analysis_prompt,
183
  history=[],
184
  temperature=0.2,
185
  max_new_tokens=1024,
@@ -188,16 +182,13 @@ Medical Records:\n{extracted_data[:15000]}
188
  conversation=conversation
189
  ):
190
  if isinstance(chunk, str):
191
- response += chunk
192
  elif isinstance(chunk, list):
193
- response += "".join([c.content for c in chunk if hasattr(c, 'content')])
194
-
195
- cleaned = response.replace("[TOOL_CALLS]", "").strip()
196
- yield history[:-1] + [{"role": "assistant", "content": cleaned}], None
197
 
198
- final_output = response.replace("[TOOL_CALLS]", "").strip()
199
- if not final_output:
200
- final_output = "No clear oversights identified. Recommend comprehensive review."
201
 
202
  report_path = None
203
  if file_hash_value:
@@ -205,7 +196,7 @@ Medical Records:\n{extracted_data[:15000]}
205
  if os.path.exists(possible_report):
206
  report_path = possible_report
207
 
208
- history = history[:-1] + [{"role": "assistant", "content": final_output}]
209
  yield history, report_path
210
 
211
  except Exception as e:
 
4
  import pdfplumber
5
  import json
6
  import gradio as gr
7
+ from typing import List
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import hashlib
10
  import shutil
 
14
  import re
15
  import tempfile
16
 
17
+ # Setup paths
18
  current_dir = os.path.dirname(os.path.abspath(__file__))
19
  src_path = os.path.abspath(os.path.join(current_dir, "src"))
20
  sys.path.insert(0, src_path)
21
 
22
  # Cache directories
23
  base_dir = "/data"
 
24
  model_cache_dir = os.path.join(base_dir, "txagent_models")
25
  tool_cache_dir = os.path.join(base_dir, "tool_cache")
26
  file_cache_dir = os.path.join(base_dir, "cache")
27
+ report_dir = os.path.join(base_dir, "reports")
28
  vllm_cache_dir = os.path.join(base_dir, "vllm_cache")
29
 
30
+ for d in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir, vllm_cache_dir]:
31
+ os.makedirs(d, exist_ok=True)
 
 
 
32
 
33
  os.environ.update({
34
  "TRANSFORMERS_CACHE": model_cache_dir,
 
60
  text_chunks.append(f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}")
61
  for i, page in enumerate(pdf.pages[3:max_pages], start=4):
62
  page_text = page.extract_text() or ""
63
+ if any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
64
  text_chunks.append(f"=== Page {i} ===\n{page_text.strip()}")
65
  return "\n\n".join(text_chunks)
66
  except Exception as e:
 
77
  text = extract_priority_pages(file_path)
78
  result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
79
  Thread(target=full_pdf_processing, args=(file_path, h)).start()
 
80
  elif file_type == "csv":
81
  df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
82
  content = df.fillna("").astype(str).values.tolist()
83
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
 
84
  elif file_type in ["xls", "xlsx"]:
85
  try:
86
  df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
 
88
  df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
89
  content = df.fillna("").astype(str).values.tolist()
90
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
 
91
  else:
92
  return json.dumps({"error": f"Unsupported file type: {file_type}"})
93
 
 
147
  download_output = gr.File(label="Download Full Report")
148
 
149
  def analyze_potential_oversights(message: str, history: list, conversation: list, files: list):
 
150
  try:
151
+ history.append({"role": "user", "content": message})
152
+ history.append({"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."})
153
  yield history, None
154
 
155
  extracted_data = ""
156
  file_hash_value = ""
157
+
158
  if files and isinstance(files, list):
159
  with ThreadPoolExecutor(max_workers=4) as executor:
160
  futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower()) for f in files if hasattr(f, 'name')]
161
  extracted_data = "\n".join([sanitize_utf8(f.result()) for f in as_completed(futures)])
162
  file_hash_value = file_hash(files[0].name) if files else ""
163
 
164
+ prompt = f"""Review these medical records and identify EXACTLY what might have been missed:
165
  1. List potential missed diagnoses
166
  2. Flag any medication conflicts
167
  3. Note incomplete assessments
 
171
 
172
  ### Potential Oversights:\n"""
173
 
174
+ final_output = ""
175
  for chunk in agent.run_gradio_chat(
176
+ message=prompt,
177
  history=[],
178
  temperature=0.2,
179
  max_new_tokens=1024,
 
182
  conversation=conversation
183
  ):
184
  if isinstance(chunk, str):
185
+ final_output += chunk
186
  elif isinstance(chunk, list):
187
+ final_output += "".join([c.content for c in chunk if hasattr(c, 'content')])
 
 
 
188
 
189
+ cleaned = final_output.replace("[TOOL_CALLS]", "").strip()
190
+ if not cleaned:
191
+ cleaned = "No clear oversights identified. Recommend comprehensive review."
192
 
193
  report_path = None
194
  if file_hash_value:
 
196
  if os.path.exists(possible_report):
197
  report_path = possible_report
198
 
199
+ history[-1] = {"role": "assistant", "content": cleaned}
200
  yield history, report_path
201
 
202
  except Exception as e: