Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
-from typing import List
+from typing import List
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
@@ -14,25 +14,21 @@ from threading import Thread
 import re
 import tempfile
 
-#
+# Setup paths
 current_dir = os.path.dirname(os.path.abspath(__file__))
 src_path = os.path.abspath(os.path.join(current_dir, "src"))
 sys.path.insert(0, src_path)
 
 # Cache directories
 base_dir = "/data"
-os.makedirs(base_dir, exist_ok=True)
 model_cache_dir = os.path.join(base_dir, "txagent_models")
 tool_cache_dir = os.path.join(base_dir, "tool_cache")
 file_cache_dir = os.path.join(base_dir, "cache")
-report_dir = "
+report_dir = os.path.join(base_dir, "reports")
 vllm_cache_dir = os.path.join(base_dir, "vllm_cache")
 
-
-os.makedirs(
-os.makedirs(file_cache_dir, exist_ok=True)
-os.makedirs(report_dir, exist_ok=True)
-os.makedirs(vllm_cache_dir, exist_ok=True)
+for d in [model_cache_dir, tool_cache_dir, file_cache_dir, report_dir, vllm_cache_dir]:
+    os.makedirs(d, exist_ok=True)
 
 os.environ.update({
     "TRANSFORMERS_CACHE": model_cache_dir,
@@ -64,7 +60,7 @@ def extract_priority_pages(file_path: str, max_pages: int = 20) -> str:
                 text_chunks.append(f"=== Page {i+1} ===\n{(page.extract_text() or '').strip()}")
             for i, page in enumerate(pdf.pages[3:max_pages], start=4):
                 page_text = page.extract_text() or ""
-                if any(re.search(rf'
+                if any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
                     text_chunks.append(f"=== Page {i} ===\n{page_text.strip()}")
         return "\n\n".join(text_chunks)
     except Exception as e:
@@ -81,12 +77,10 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
             text = extract_priority_pages(file_path)
             result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
             Thread(target=full_pdf_processing, args=(file_path, h)).start()
-
         elif file_type == "csv":
             df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str, skip_blank_lines=False, on_bad_lines="skip")
            content = df.fillna("").astype(str).values.tolist()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-
         elif file_type in ["xls", "xlsx"]:
             try:
                 df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
@@ -94,7 +88,6 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
             content = df.fillna("").astype(str).values.tolist()
             result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
-
         else:
             return json.dumps({"error": f"Unsupported file type: {file_type}"})
 
@@ -154,20 +147,21 @@ def create_ui(agent: TxAgent):
         download_output = gr.File(label="Download Full Report")
 
         def analyze_potential_oversights(message: str, history: list, conversation: list, files: list):
-            start_time = time.time()
             try:
-                history
+                history.append({"role": "user", "content": message})
+                history.append({"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."})
                 yield history, None
 
                 extracted_data = ""
                 file_hash_value = ""
+
                 if files and isinstance(files, list):
                     with ThreadPoolExecutor(max_workers=4) as executor:
                         futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower()) for f in files if hasattr(f, 'name')]
                         extracted_data = "\n".join([sanitize_utf8(f.result()) for f in as_completed(futures)])
                         file_hash_value = file_hash(files[0].name) if files else ""
 
-
+                prompt = f"""Review these medical records and identify EXACTLY what might have been missed:
 1. List potential missed diagnoses
 2. Flag any medication conflicts
 3. Note incomplete assessments
@@ -177,9 +171,9 @@ Medical Records:\n{extracted_data[:15000]}
 
 ### Potential Oversights:\n"""
 
-
+                final_output = ""
                 for chunk in agent.run_gradio_chat(
-                    message=
+                    message=prompt,
                     history=[],
                     temperature=0.2,
                     max_new_tokens=1024,
@@ -188,16 +182,13 @@ Medical Records:\n{extracted_data[:15000]}
                     conversation=conversation
                 ):
                     if isinstance(chunk, str):
-
+                        final_output += chunk
                     elif isinstance(chunk, list):
-
-
-                        cleaned = response.replace("[TOOL_CALLS]", "").strip()
-                        yield history[:-1] + [{"role": "assistant", "content": cleaned}], None
+                        final_output += "".join([c.content for c in chunk if hasattr(c, 'content')])
 
-
-                if not
-
+                cleaned = final_output.replace("[TOOL_CALLS]", "").strip()
+                if not cleaned:
+                    cleaned = "No clear oversights identified. Recommend comprehensive review."
 
                 report_path = None
                 if file_hash_value:
@@ -205,7 +196,7 @@ Medical Records:\n{extracted_data[:15000]}
                 if os.path.exists(possible_report):
                     report_path = possible_report
 
-                history
+                history[-1] = {"role": "assistant", "content": cleaned}
                 yield history, report_path
 
             except Exception as e: