import sys
import os
import pandas as pd
import pdfplumber
import gradio as gr
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
import shutil
import re
import logging
import torch
import gc
from diskcache import Cache
from transformers import AutoTokenizer
from functools import lru_cache
from difflib import SequenceMatcher
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Constants
MAX_TOKENS = 1800
BATCH_SIZE = 1
MAX_WORKERS = 2
CHUNK_SIZE = 5
MODEL_MAX_TOKENS = 131072
MAX_TEXT_LENGTH = 500000
MAX_ROWS_TO_PROCESS = 1000 # Limit for Excel/CSV rows
# Persistent directory setup
persistent_dir = "/data/hf_cache"
os.makedirs(persistent_dir, exist_ok=True)
model_cache_dir = os.path.join(persistent_dir, "txagent_models")
tool_cache_dir = os.path.join(persistent_dir, "tool_cache")
file_cache_dir = os.path.join(persistent_dir, "cache")
report_dir = os.path.join(persistent_dir, "reports")
for d in (model_cache_dir, tool_cache_dir, file_cache_dir, report_dir):
    os.makedirs(d, exist_ok=True)
os.environ.update({
"HF_HOME": model_cache_dir,
"TOKENIZERS_PARALLELISM": "false",
})
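# Make the bundled src/ directory importable before loading the local TxAgent package below.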
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.abspath(os.path.join(current_dir, "src"))
sys.path.insert(0, src_path)
from txagent.txagent import TxAgent
# Initialize cache
cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
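# Load the TxAgent tokenizer once and reuse it across requests.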
@lru_cache(maxsize=1)
def get_tokenizer():
return AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
def sanitize_utf8(text: str) -> str:
return text.encode("utf-8", "ignore").decode("utf-8")
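# MD5 hash of a file's contents, streamed in 4 KB chunks so large uploads are not read into memory at once.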
def file_hash(path: str) -> str:
hash_md5 = hashlib.md5()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
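# Extract the text of a single PDF page and split it into chunks of at most
# max_tokens tokens so each chunk fits comfortably in a model prompt.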
def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
try:
text = page.extract_text() or ""
text = sanitize_utf8(text)
if len(text) > MAX_TEXT_LENGTH // 10:
text = text[:MAX_TEXT_LENGTH // 10]
tokens = tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) > max_tokens:
            # Split into fixed-size windows of max_tokens tokens and decode each window back to text.
            return [
                tokenizer.decode(tokens[i:i + max_tokens])
                for i in range(0, len(tokens), max_tokens)
            ]
return [text]
except Exception as e:
logger.warning(f"Error extracting page {page.page_number}: {str(e)}")
return []
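# Extract text from the first 50 pages of a PDF, labelling each chunk with its page number.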
def extract_all_pages(file_path: str) -> List[str]:
try:
tokenizer = get_tokenizer()
with pdfplumber.open(file_path) as pdf:
total_pages = len(pdf.pages)
if total_pages == 0:
return ["PDF appears to be empty"]
results = []
for i in range(0, min(total_pages, 50)): # Limit to first 50 pages
try:
page = pdf.pages[i]
chunks = extract_pdf_page(page, tokenizer)
for chunk in chunks:
results.append(f"=== Page {i+1} ===\n{chunk}")
except Exception as e:
logger.warning(f"Error processing page {i+1}: {str(e)}")
continue
return results if results else ["Could not extract text from PDF"]
except Exception as e:
logger.error(f"PDF processing error: {e}")
return [f"PDF processing error: {str(e)}"]
def excel_to_json(file_path: str) -> List[Dict]:
engines = ['openpyxl', 'xlrd']
for engine in engines:
try:
with pd.ExcelFile(file_path, engine=engine) as excel_file:
sheets = excel_file.sheet_names
if not sheets:
return [{"error": "No sheets found"}]
results = []
for sheet_name in sheets[:3]: # Limit to first 3 sheets
try:
df = pd.read_excel(
excel_file,
sheet_name=sheet_name,
header=None,
dtype=str,
na_filter=False,
nrows=MAX_ROWS_TO_PROCESS # Limit rows
)
if not df.empty:
rows = df.head(MAX_ROWS_TO_PROCESS).values.tolist()
results.append({
"filename": os.path.basename(file_path),
"sheet": sheet_name,
"rows": rows,
"type": "excel"
})
except Exception as e:
logger.warning(f"Error processing sheet {sheet_name}: {str(e)}")
continue
return results if results else [{"error": "No readable data found"}]
except Exception as e:
logger.warning(f"Excel engine {engine} failed: {str(e)}")
continue
return [{"error": "Could not process Excel file with any engine"}]
def csv_to_json(file_path: str) -> List[Dict]:
try:
df = pd.read_csv(
file_path,
header=None,
dtype=str,
encoding_errors='replace',
on_bad_lines='skip',
nrows=MAX_ROWS_TO_PROCESS # Limit rows
)
if df.empty:
return [{"error": "CSV file is empty"}]
return [{
"filename": os.path.basename(file_path),
"rows": df.values.tolist(),
"type": "csv"
}]
except Exception as e:
logger.error(f"CSV processing error: {e}")
return [{"error": f"CSV processing error: {str(e)}"}]
def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
    try:
        # Key the cache by file content hash + type so repeat uploads skip re-parsing.
        cache_key = f"{file_hash(file_path)}:{file_type}"
        cached = cache.get(cache_key)
        if cached is not None:
            return cached
        logger.info(f"Processing {file_type} file: {os.path.basename(file_path)}")
        if file_type == "pdf":
            result = [{
                "filename": os.path.basename(file_path),
                "content": chunk,
                "type": "pdf"
            } for chunk in extract_all_pages(file_path)]
        elif file_type in ["xls", "xlsx"]:
            result = excel_to_json(file_path)
        elif file_type == "csv":
            result = csv_to_json(file_path)
        else:
            result = [{"error": f"Unsupported file type: {file_type}"}]
        cache.set(cache_key, result)
        return result
    except Exception as e:
        logger.error(f"Error processing file: {e}")
        return [{"error": f"Error processing file: {str(e)}"}]
def clean_response(text: str) -> str:
if not text:
return ""
patterns = [
(re.compile(r"\[.*?\]|\bNone\b", re.IGNORECASE), ""),
(re.compile(r"\s+"), " "),
]
for pattern, repl in patterns:
text = pattern.sub(repl, text)
return text.strip()
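# Build the TxAgent pipeline once; lru_cache keeps a single instance for the process lifetime.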
@lru_cache(maxsize=1)
def init_agent():
logger.info("Initializing model...")
agent = TxAgent(
model_name="mims-harvard/TxAgent-T1-Llama-3.1-8B",
rag_model_name="mims-harvard/ToolRAG-T1-GTE-Qwen2-1.5B",
tool_files_dict={"new_tool": os.path.join(tool_cache_dir, "new_tool.json")},
force_finish=True,
enable_checker=False,
step_rag_num=4,
seed=100,
)
agent.init_model()
logger.info("Agent Ready")
return agent
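# Gradio UI: a chat panel for analysis output plus a single-file upload,
# with the per-request work done in the nested analyze() callback.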
def create_ui(agent):
PROMPT_TEMPLATE = """
Analyze this patient record excerpt for missed diagnoses (limit response to 500 tokens):
{chunk}
"""
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
with gr.Row():
with gr.Column(scale=3):
chatbot = gr.Chatbot(label="Analysis", height=500, type="messages")
msg_input = gr.Textbox(placeholder="Ask about potential oversights...")
send_btn = gr.Button("Analyze", variant="primary")
file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="single")
with gr.Column(scale=1):
final_summary = gr.Markdown("## Summary")
status = gr.Textbox(label="Status", interactive=False)
def analyze(message: str, history: List[Dict], file_obj) -> tuple:
try:
if not file_obj:
return history, "Please upload a file first", "No file uploaded"
                # gr.File may deliver a filepath string or a tempfile-like object depending on the Gradio version.
                file_path = file_obj if isinstance(file_obj, str) else file_obj.name
file_type = os.path.splitext(file_path)[-1].lower().replace(".", "")
history.append({"role": "user", "content": message})
# Process file
processed = process_file_cached(file_path, file_type)
if "error" in processed[0]:
history.append({"role": "assistant", "content": processed[0]["error"]})
return history, processed[0]["error"], "File processing failed"
# Prepare chunks
chunks = []
for item in processed:
if "content" in item:
chunks.append(item["content"])
elif "rows" in item:
rows_text = "\n".join([", ".join(map(str, row)) for row in item["rows"][:100]])
chunks.append(f"=== {item.get('sheet', 'Data')} ===\n{rows_text}")
if not chunks:
history.append({"role": "assistant", "content": "No processable content found."})
return history, "No processable content found", "Content extraction failed"
# Analyze each chunk
responses = []
for i, chunk in enumerate(chunks[:5]):
try:
prompt = PROMPT_TEMPLATE.format(chunk=chunk[:5000])
response = agent.run_quick_summary(prompt, 0.2, 256, 500)
cleaned = clean_response(response)
if cleaned:
responses.append(f"Analysis {i+1}:\n{cleaned}")
except Exception as e:
logger.warning(f"Error analyzing chunk {i+1}: {str(e)}")
continue
if not responses:
history.append({"role": "assistant", "content": "No valid analysis generated."})
return history, "No valid analysis generated", "Analysis failed"
summary = "\n\n".join(responses)
history.append({"role": "assistant", "content": summary})
return history, "Analysis completed", "Success"
except Exception as e:
logger.error(f"Analysis error: {e}")
history.append({"role": "assistant", "content": f"Error: {str(e)}"})
return history, f"Error: {str(e)}", "Failed"
finally:
torch.cuda.empty_cache()
gc.collect()
send_btn.click(
analyze,
inputs=[msg_input, chatbot, file_upload],
outputs=[chatbot, final_summary, status]
)
msg_input.submit(
analyze,
inputs=[msg_input, chatbot, file_upload],
outputs=[chatbot, final_summary, status]
)
return demo
if __name__ == "__main__":
try:
agent = init_agent()
demo = create_ui(agent)
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False
)
except Exception as e:
logger.error(f"Fatal error: {e}")
raise