Ali2206 committed on
Commit
44280bd
·
verified ·
1 Parent(s): 02a4d5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -138
app.py CHANGED
@@ -4,15 +4,16 @@ import pandas as pd
4
  import pdfplumber
5
  import json
6
  import gradio as gr
7
- from typing import List
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import hashlib
10
  import shutil
11
  import re
12
  import psutil
13
  import subprocess
 
14
 
15
- # Persistent directory
16
  persistent_dir = "/data/hf_cache"
17
  os.makedirs(persistent_dir, exist_ok=True)
18
 
@@ -37,46 +38,78 @@ sys.path.insert(0, src_path)
37
 
38
  from txagent.txagent import TxAgent
39
 
40
- MEDICAL_KEYWORDS = {'diagnosis', 'assessment', 'plan', 'results', 'medications',
41
- 'allergies', 'summary', 'impression', 'findings', 'recommendations'}
 
 
 
 
 
 
42
 
43
  def sanitize_utf8(text: str) -> str:
 
44
  return text.encode("utf-8", "ignore").decode("utf-8")
45
 
46
  def file_hash(path: str) -> str:
 
47
  with open(path, "rb") as f:
48
  return hashlib.md5(f.read()).hexdigest()
49
 
50
- def extract_priority_pages(file_path: str) -> str:
 
 
 
 
51
  try:
52
  text_chunks = []
 
53
  with pdfplumber.open(file_path) as pdf:
 
 
54
  for i, page in enumerate(pdf.pages):
55
  page_text = page.extract_text() or ""
56
- if i < 3 or any(re.search(rf'\b{kw}\b', page_text.lower()) for kw in MEDICAL_KEYWORDS):
 
 
 
 
 
57
  text_chunks.append(f"=== Page {i+1} ===\n{page_text.strip()}")
58
- return "\n\n".join(text_chunks)
 
59
  except Exception as e:
60
- return f"PDF processing error: {str(e)}"
61
 
62
  def convert_file_to_json(file_path: str, file_type: str) -> str:
 
63
  try:
64
  h = file_hash(file_path)
65
  cache_path = os.path.join(file_cache_dir, f"{h}.json")
 
66
  if os.path.exists(cache_path):
67
  with open(cache_path, "r", encoding="utf-8") as f:
68
  return f.read()
69
 
70
  if file_type == "pdf":
71
- text = extract_priority_pages(file_path)
72
- result = json.dumps({"filename": os.path.basename(file_path), "content": text, "status": "initial"})
 
 
 
 
 
73
  elif file_type == "csv":
74
- df = pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
75
- skip_blank_lines=False, on_bad_lines="skip")
76
- content = df.fillna("").astype(str).values.tolist()
 
 
 
77
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
78
  elif file_type in ["xls", "xlsx"]:
79
  try:
 
80
  df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
81
  except Exception:
82
  df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
@@ -84,6 +117,7 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
84
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
85
  else:
86
  result = json.dumps({"error": f"Unsupported file type: {file_type}"})
 
87
  with open(cache_path, "w", encoding="utf-8") as f:
88
  f.write(result)
89
  return result
@@ -91,6 +125,7 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
91
  return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
92
 
93
  def log_system_usage(tag=""):
 
94
  try:
95
  cpu = psutil.cpu_percent(interval=1)
96
  mem = psutil.virtual_memory()
@@ -106,21 +141,74 @@ def log_system_usage(tag=""):
106
  print(f"[{tag}] GPU/CPU monitor failed: {e}")
107
 
108
  def clean_response(text: str) -> str:
 
109
  text = sanitize_utf8(text)
110
- # Remove tool calls, JSON data, and repetitive phrases
111
  text = re.sub(r"\[TOOL_CALLS\].*", "", text, flags=re.DOTALL)
112
- text = re.sub(r"\['get_[^\]]+\']\n?", "", text) # Remove tool names
113
- text = re.sub(r"\{'meta':\s*\{.*?\}\s*,\s*'results':\s*\[.*?\]\}\n?", "", text, flags=re.DOTALL) # Remove JSON
 
114
  text = re.sub(r"To analyze the medical records for clinical oversights.*?begin by reviewing.*?\n", "", text, flags=re.DOTALL)
 
115
  text = re.sub(r"\n{3,}", "\n\n", text).strip()
116
- # Only keep text under analysis headings or relevant content
117
- if not re.search(r"(Missed Diagnoses|Medication Conflicts|Incomplete Assessments|Urgent Follow-up)", text):
118
- return ""
119
  return text
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  def init_agent():
 
122
  print("🔁 Initializing model...")
123
  log_system_usage("Before Load")
 
124
  default_tool_path = os.path.abspath("data/new_tool.json")
125
  target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
126
  if not os.path.exists(target_tool_path):
@@ -141,135 +229,191 @@ def init_agent():
141
  print("✅ Agent Ready")
142
  return agent
143
 
144
- def create_ui(agent):
145
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
146
- gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
147
- chatbot = gr.Chatbot(label="Analysis", height=600, type="messages")
148
- file_upload = gr.File(file_types=[".pdf", ".csv", ".xls", ".xlsx"], file_count="multiple")
149
- msg_input = gr.Textbox(placeholder="Ask about potential oversights...", show_label=False)
150
- send_btn = gr.Button("Analyze", variant="primary")
151
- download_output = gr.File(label="Download Full Report")
152
-
153
- def analyze(message: str, history: List[dict], files: List):
154
- history.append({"role": "user", "content": message})
155
- history.append({"role": "assistant", "content": "⏳ Analyzing records for potential oversights..."})
156
- yield history, None
157
-
158
- extracted = ""
159
- file_hash_value = ""
160
- if files:
161
- with ThreadPoolExecutor(max_workers=6) as executor:
162
- futures = [executor.submit(convert_file_to_json, f.name, f.name.split(".")[-1].lower()) for f in files]
163
- results = [sanitize_utf8(f.result()) for f in as_completed(futures)]
164
- extracted = "\n".join(results)
165
- file_hash_value = file_hash(files[0].name) if files else ""
166
-
167
- # Split extracted text into chunks of ~4,000 characters
168
- chunk_size = 4000
169
- chunks = [extracted[i:i + chunk_size] for i in range(0, len(extracted), chunk_size)]
170
- combined_response = ""
171
-
172
- prompt_template = f"""
173
- Analyze the medical records for clinical oversights. Provide a concise, evidence-based summary under these headings:
174
-
175
- 1. **Missed Diagnoses**:
176
- - Identify inconsistencies in history, symptoms, or tests.
177
- - Consider psychiatric, neurological, infectious, autoimmune, genetic conditions, family history, trauma, and developmental factors.
178
-
179
- 2. **Medication Conflicts**:
180
- - Check for contraindications, interactions, or unjustified off-label use.
181
- - Assess if medications worsen diagnoses or cause adverse effects.
182
-
183
- 3. **Incomplete Assessments**:
184
- - Note missing or superficial cognitive, psychiatric, social, or family assessments.
185
- - Highlight gaps in medical history, substance use, or lab/imaging documentation.
186
-
187
- 4. **Urgent Follow-up**:
188
- - Flag abnormal lab results, imaging, behaviors, or legal history needing immediate reassessment or referral.
189
-
190
- Medical Records (Chunk {0} of {1}):
191
- {{chunk}}
192
-
193
- Begin analysis:
194
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  try:
197
- if history and history[-1]["content"].startswith("⏳"):
198
- history.pop()
199
-
200
- # Process each chunk and stream cleaned results
201
- for chunk_idx, chunk in enumerate(chunks, 1):
202
- # Update UI with progress
203
- history.append({"role": "assistant", "content": f"🔄 Processing Chunk {chunk_idx} of {len(chunks)}..."})
204
- yield history, None
205
-
206
- prompt = prompt_template.format(chunk_idx, len(chunks), chunk=chunk)
207
- chunk_response = ""
208
- for chunk_output in agent.run_gradio_chat(
209
- message=prompt,
210
- history=[],
211
- temperature=0.2,
212
- max_new_tokens=1024,
213
- max_token=4096,
214
- call_agent=False,
215
- conversation=[],
216
- ):
217
- if chunk_output is None:
218
- continue
219
- if isinstance(chunk_output, list):
220
- for m in chunk_output:
221
- if hasattr(m, 'content') and m.content:
222
- cleaned = clean_response(m.content)
223
- if cleaned:
224
- chunk_response += cleaned + "\n"
225
- # Stream partial response to UI
226
- if history[-1]["content"].startswith("🔄"):
227
- history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
228
- else:
229
- history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
230
- yield history, None
231
- elif isinstance(chunk_output, str) and chunk_output.strip():
232
- cleaned = clean_response(chunk_output)
233
- if cleaned:
234
- chunk_response += cleaned + "\n"
235
- # Stream partial response to UI
236
- if history[-1]["content"].startswith("🔄"):
237
- history[-1] = {"role": "assistant", "content": f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"}
238
- else:
239
- history[-1]["content"] = f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response.strip()}"
240
- yield history, None
241
-
242
- # Append completed chunk response to combined response
243
- if chunk_response:
244
- combined_response += f"--- Analysis for Chunk {chunk_idx} ---\n{chunk_response}\n"
245
-
246
- # Finalize UI with complete response
247
- if combined_response:
248
- history[-1]["content"] = combined_response.strip()
249
- else:
250
- history.append({"role": "assistant", "content": "No oversights identified."})
251
-
252
- # Generate report file with cleaned response
253
- report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
254
- if report_path:
255
- with open(report_path, "w", encoding="utf-8") as f:
256
- f.write(combined_response)
257
- yield history, report_path if report_path and os.path.exists(report_path) else None
258
-
259
  except Exception as e:
260
- print("🚨 ERROR:", e)
261
- history.append({"role": "assistant", "content": f"❌ Error occurred: {str(e)}"})
262
- yield history, None
263
-
264
- send_btn.click(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
265
- msg_input.submit(analyze, inputs=[msg_input, gr.State([]), file_upload], outputs=[chatbot, download_output])
 
 
 
 
 
 
 
 
 
 
 
 
266
  return demo
267
 
268
  if __name__ == "__main__":
269
  print("🚀 Launching app...")
270
  agent = init_agent()
271
  demo = create_ui(agent)
272
- demo.queue(api_open=False).launch(
 
 
 
273
  server_name="0.0.0.0",
274
  server_port=7860,
275
  show_error=True,
 
4
  import pdfplumber
5
  import json
6
  import gradio as gr
7
+ from typing import List, Tuple, Optional
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
  import hashlib
10
  import shutil
11
  import re
12
  import psutil
13
  import subprocess
14
+ from datetime import datetime
15
 
16
+ # Persistent directory setup
17
  persistent_dir = "/data/hf_cache"
18
  os.makedirs(persistent_dir, exist_ok=True)
19
 
 
38
 
39
  from txagent.txagent import TxAgent
40
 
41
# Constants
# Lower-case section keywords. extract_all_pages() labels a PDF page as a
# "MEDICAL SECTION" when its lower-cased text contains any of these as a
# whole word.
MEDICAL_KEYWORDS = {
    'diagnosis', 'assessment', 'plan', 'results', 'medications',
    'allergies', 'summary', 'impression', 'findings', 'recommendations',
    'conclusion', 'history', 'examination', 'progress', 'discharge',
}
# Upper bound, in characters, on the document text sent to the model per prompt.
CHUNK_SIZE = 10000  # Increased chunk size for better context
# Passed as `max_token` to agent.run_gradio_chat().
MAX_TOKENS = 12000  # Maximum tokens for analysis
49
 
50
  def sanitize_utf8(text: str) -> str:
51
+ """Ensure text is UTF-8 clean."""
52
  return text.encode("utf-8", "ignore").decode("utf-8")
53
 
54
  def file_hash(path: str) -> str:
55
+ """Generate MD5 hash of file content."""
56
  with open(path, "rb") as f:
57
  return hashlib.md5(f.read()).hexdigest()
58
 
59
def extract_all_pages(file_path: str) -> Tuple[str, int]:
    """Extract the text of every page of a PDF, labelling keyword-bearing pages.

    Each page becomes one block of the form ``<header>\\n<page text>``. The
    header is ``=== MEDICAL SECTION (Page N) ===`` when the lower-cased page
    text contains any MEDICAL_KEYWORDS entry as a whole word, and
    ``=== Page N ===`` otherwise. Blocks are joined with a blank line.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        ``(extracted_text, total_pages)``. On any failure nothing is raised;
        the result is ``("PDF processing error: <message>", 0)``.
    """
    try:
        # One alternation replaces a separate regex search per keyword per page.
        keyword_pattern = re.compile(
            r"\b(?:" + "|".join(re.escape(kw) for kw in sorted(MEDICAL_KEYWORDS)) + r")\b"
        )
        text_chunks = []
        with pdfplumber.open(file_path) as pdf:
            total_pages = len(pdf.pages)
            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() may return None; treat that as an empty page.
                page_text = page.extract_text() or ""
                if keyword_pattern.search(page_text.lower()):
                    header = f"=== MEDICAL SECTION (Page {page_number}) ==="
                else:
                    header = f"=== Page {page_number} ==="
                text_chunks.append(f"{header}\n{page_text.strip()}")
        return "\n\n".join(text_chunks), total_pages
    except Exception as e:
        return f"PDF processing error: {str(e)}", 0
83
 
84
  def convert_file_to_json(file_path: str, file_type: str) -> str:
85
+ """Convert file to JSON format with caching, processing all content."""
86
  try:
87
  h = file_hash(file_path)
88
  cache_path = os.path.join(file_cache_dir, f"{h}.json")
89
+
90
  if os.path.exists(cache_path):
91
  with open(cache_path, "r", encoding="utf-8") as f:
92
  return f.read()
93
 
94
  if file_type == "pdf":
95
+ text, total_pages = extract_all_pages(file_path)
96
+ result = json.dumps({
97
+ "filename": os.path.basename(file_path),
98
+ "content": text,
99
+ "total_pages": total_pages,
100
+ "status": "complete"
101
+ })
102
  elif file_type == "csv":
103
+ # Read CSV in chunks to handle large files
104
+ chunks = []
105
+ for chunk in pd.read_csv(file_path, encoding_errors="replace", header=None, dtype=str,
106
+ skip_blank_lines=False, on_bad_lines="skip", chunksize=1000):
107
+ chunks.append(chunk.fillna("").astype(str).values.tolist())
108
+ content = [item for sublist in chunks for item in sublist]
109
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
110
  elif file_type in ["xls", "xlsx"]:
111
  try:
112
+ # Read Excel in chunks if possible
113
  df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
114
  except Exception:
115
  df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
 
117
  result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
118
  else:
119
  result = json.dumps({"error": f"Unsupported file type: {file_type}"})
120
+
121
  with open(cache_path, "w", encoding="utf-8") as f:
122
  f.write(result)
123
  return result
 
125
  return json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
126
 
127
  def log_system_usage(tag=""):
128
+ """Log system resource usage."""
129
  try:
130
  cpu = psutil.cpu_percent(interval=1)
131
  mem = psutil.virtual_memory()
 
141
  print(f"[{tag}] GPU/CPU monitor failed: {e}")
142
 
143
  def clean_response(text: str) -> str:
144
+ """Clean and format the model response."""
145
  text = sanitize_utf8(text)
146
+ # Remove tool calls and JSON artifacts
147
  text = re.sub(r"\[TOOL_CALLS\].*", "", text, flags=re.DOTALL)
148
+ text = re.sub(r"\['get_[^\]]+\']\n?", "", text)
149
+ text = re.sub(r"\{'meta':\s*\{.*?\}\s*,\s*'results':\s*\[.*?\]\}\n?", "", text, flags=re.DOTALL)
150
+ # Remove repetitive phrases
151
  text = re.sub(r"To analyze the medical records for clinical oversights.*?begin by reviewing.*?\n", "", text, flags=re.DOTALL)
152
+ # Collapse excessive newlines
153
  text = re.sub(r"\n{3,}", "\n\n", text).strip()
 
 
 
154
  return text
155
 
156
def format_final_report(analysis_results: List[str], filename: str) -> str:
    """Combine per-chunk analysis texts into one formatted report.

    Each analysis text is scanned for the five report sections. A section is a
    heading line followed by the lines beneath it, up to the next blank line,
    the next line starting with ``**`` or ``#``, or the end of the text.
    Headings are matched case-insensitively and may be wrapped in markdown
    bold, numbered, or followed by a colon. The wording used in the analysis
    prompt ("Medication conflicts", "Critical findings needing immediate
    attention") is accepted as an alias for the corresponding section.

    An analysis text in which no section can be recognised is not discarded:
    it is reproduced verbatim under "UNSTRUCTURED FINDINGS". The report states
    that no oversights were identified only when there is nothing to show.

    Args:
        analysis_results: Model output for each analysed chunk; empty entries
            are ignored.
        filename: Name shown in the report header.

    Returns:
        The report as a single newline-joined string.
    """
    # Report section -> regex fragment matching its heading text.
    heading_patterns = {
        "CRITICAL FINDINGS": r"CRITICAL[ \t]+FINDINGS(?:[ \t]+NEEDING[ \t]+IMMEDIATE[ \t]+ATTENTION)?",
        "MISSED DIAGNOSES": r"MISSED[ \t]+DIAGNOSES",
        "MEDICATION ISSUES": r"MEDICATION[ \t]+(?:ISSUES|CONFLICTS)",
        "ASSESSMENT GAPS": r"ASSESSMENT[ \t]+GAPS",
        "FOLLOW-UP RECOMMENDATIONS": r"FOLLOW[- \t]UP[ \t]+RECOMMENDATIONS",
    }
    # heading, optional closing bold / colon, newline, then a body whose first
    # line is not itself a heading. The body stops at a blank line, at a line
    # opening with "**" or "#", or at the end of the text.
    matchers = {
        name: re.compile(
            rf"{heading}\**:?\**\s*\n[ \t]*(?!\*\*|#)(\S.*?)(?=\n[ \t]*(?:\*\*|#|\n)|\Z)",
            re.IGNORECASE | re.DOTALL,
        )
        for name, heading in heading_patterns.items()
    }

    sections = {name: [] for name in heading_patterns}
    unstructured = []

    for result in analysis_results:
        if not result or not result.strip():
            continue
        recognised = False
        for name, matcher in matchers.items():
            for match in matcher.finditer(result):
                content = match.group(1).strip()
                if not content:
                    continue
                recognised = True
                # The same finding can be reported for several chunks.
                if content not in sections[name]:
                    sections[name].append(content)
        if not recognised:
            # Keep output we could not parse rather than silently dropping it.
            raw = result.strip()
            if raw not in unstructured:
                unstructured.append(raw)

    report = [
        "COMPREHENSIVE CLINICAL OVERSIGHT ANALYSIS",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"File: {filename}",
        "=" * 80,
    ]

    # "CRITICAL FINDINGS" is the first key, so it is always emitted first.
    for name, contents in sections.items():
        if not contents:
            continue
        if name == "CRITICAL FINDINGS":
            report.append("\n🚨 **CRITICAL FINDINGS** 🚨")
        else:
            report.append(f"\n**{name}**")
        report.extend(f"\n{content}" for content in contents)

    if unstructured:
        report.append("\n**UNSTRUCTURED FINDINGS**")
        report.extend(f"\n{content}" for content in unstructured)

    if not unstructured and not any(sections.values()):
        report.append("\nNo significant clinical oversights identified.")

    report.append("\n" + "=" * 80)
    report.append("END OF REPORT")

    return "\n".join(report)
206
+
207
  def init_agent():
208
+ """Initialize the TxAgent with proper configuration."""
209
  print("🔁 Initializing model...")
210
  log_system_usage("Before Load")
211
+
212
  default_tool_path = os.path.abspath("data/new_tool.json")
213
  target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
214
  if not os.path.exists(target_tool_path):
 
229
  print("✅ Agent Ready")
230
  return agent
231
 
232
def analyze_large_document(content: str, filename: str, agent: TxAgent) -> str:
    """Analyze a large document chunk by chunk and return the final report.

    The text is cut at the page headers written by extract_all_pages()
    (``=== Page N ===`` and ``=== MEDICAL SECTION (Page N) ===``), keeping each
    header together with the text that follows it. Sections longer than
    CHUNK_SIZE characters are cut further, so text without page headers
    (spreadsheet rows, an over-long page) is analysed completely instead of
    being truncated by process_chunk(). Consecutive pieces are then packed
    into chunks of at most CHUNK_SIZE characters, including separators.

    Args:
        content: Full extracted text of the uploaded document(s).
        filename: Display name passed to the prompt and the report header.
        agent: Agent instance handed through to process_chunk().

    Returns:
        The report built by format_final_report() from all chunk analyses.
    """
    marker = re.compile(
        r"^=== (?:MEDICAL SECTION \(Page \d+\)|Page \d+) ===$",
        re.MULTILINE,
    )
    # Cut *before* each header so a header never drifts away from its text.
    starts = [match.start() for match in marker.finditer(content)]
    if not starts or starts[0] != 0:
        starts.insert(0, 0)
    bounds = starts + [len(content)]

    pieces = []
    for begin, end in zip(bounds, bounds[1:]):
        section = content[begin:end].strip()
        # range() over an empty section yields nothing, so blanks are skipped.
        for offset in range(0, len(section), CHUNK_SIZE):
            pieces.append(section[offset:offset + CHUNK_SIZE])

    separator = "\n\n"
    analysis_results = []
    current_chunk = ""

    for piece in pieces:
        if not current_chunk:
            current_chunk = piece
        elif len(current_chunk) + len(separator) + len(piece) > CHUNK_SIZE:
            # Adding this piece would overflow the chunk: analyse what we have.
            analysis_results.append(process_chunk(current_chunk, filename, agent))
            current_chunk = piece
        else:
            current_chunk += separator + piece

    # Process the last chunk
    if current_chunk:
        analysis_results.append(process_chunk(current_chunk, filename, agent))

    return format_final_report(analysis_results, filename)
254
+
255
def process_chunk(chunk: str, filename: str, agent: TxAgent) -> str:
    """Ask the agent to analyse one chunk of the document and return its text.

    The prompt names the exact headings that format_final_report() looks for,
    so the model output can be split into report sections afterwards.

    Args:
        chunk: Document text to analyse; only the first CHUNK_SIZE characters
            are placed in the prompt.
        filename: Display name included in the prompt.
        agent: Object exposing ``run_gradio_chat``; its outputs are iterated.

    Returns:
        Every non-empty cleaned output fragment, each followed by a newline.
        An empty string when the agent produced nothing usable.
    """
    prompt = (
        "\n"
        "Analyze this section of medical records for clinical oversights. Focus on:\n"
        "1. Critical findings needing immediate attention\n"
        "2. Potential missed diagnoses\n"
        "3. Medication conflicts\n"
        "4. Assessment gaps\n"
        "5. Follow-up recommendations\n"
        "\n"
        f"File: {filename}\n"
        "Content:\n"
        f"{chunk[:CHUNK_SIZE]}\n"
        "\n"
        "Report findings under exactly these headings, each on its own line and "
        "followed by a colon:\n"
        "CRITICAL FINDINGS\n"
        "MISSED DIAGNOSES\n"
        "MEDICATION ISSUES\n"
        "ASSESSMENT GAPS\n"
        "FOLLOW-UP RECOMMENDATIONS\n"
        "Under each heading list concise findings as '-' bullet points, with no "
        "blank line between the heading and its bullets. Omit headings that "
        "have no findings.\n"
        "Focus on factual evidence from the content.\n"
    )

    fragments = []
    # NOTE(review): every yielded output is appended. If run_gradio_chat yields
    # cumulative text rather than deltas this duplicates content - confirm
    # against the TxAgent implementation.
    for output in agent.run_gradio_chat(
        message=prompt,
        history=[],
        temperature=0.1,  # Lower temperature for more factual responses
        max_new_tokens=1024,
        max_token=MAX_TOKENS,
        call_agent=False,
        conversation=[],
    ):
        if output is None:
            continue

        if isinstance(output, list):
            # A list of message-like objects: use each non-empty `.content`.
            for m in output:
                if hasattr(m, 'content') and m.content:
                    cleaned = clean_response(m.content)
                    if cleaned:
                        fragments.append(cleaned)
        elif isinstance(output, str) and output.strip():
            cleaned = clean_response(output)
            if cleaned:
                fragments.append(cleaned)

    return "".join(fragment + "\n" for fragment in fragments)
298
 
299
def create_ui(agent):
    """Create the Gradio interface.

    Builds a two-column layout: uploads, an optional focus box, the
    Analyze/Clear buttons and a status line on the left; the report text and a
    download link on the right. The download component starts hidden and is
    revealed once a report file has been written.

    Args:
        agent: Agent instance passed to analyze_large_document() per request.

    Returns:
        The configured ``gr.Blocks`` application.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Clinical Oversight Assistant") as demo:
        gr.Markdown(
            "\n"
            "<h1 style='text-align: center;'>🩺 Comprehensive Clinical Oversight Assistant</h1>\n"
            "<p style='text-align: center;'>Analyze complete medical records for potential oversights</p>\n"
        )

        with gr.Row():
            with gr.Column(scale=3):
                file_upload = gr.File(
                    file_types=[".pdf", ".csv", ".xls", ".xlsx"],
                    file_count="multiple",
                    label="Upload Medical Records"
                )
                msg_input = gr.Textbox(
                    placeholder="Optional: Add specific focus areas or questions...",
                    label="Analysis Focus"
                )
                with gr.Row():
                    send_btn = gr.Button("Analyze Full Document", variant="primary")
                    clear_btn = gr.Button("Clear")
                status = gr.Textbox(label="Status", interactive=False)

            with gr.Column(scale=7):
                report_output = gr.Textbox(
                    label="Clinical Oversight Report",
                    lines=20,
                    max_lines=50,
                    interactive=False
                )
                download_output = gr.File(
                    label="Download Full Report",
                    visible=False
                )

        def analyze(files: List, message: str):
            """Process the uploads and stream (report, download, status) updates.

            NOTE(review): `message` (the "Analysis Focus" box) is accepted but
            not forwarded to the analysis - confirm whether it should be.
            """
            if not files:
                yield "", None, "⚠️ Please upload at least one file to analyze."
                return

            yield "", None, "⏳ Processing documents..."

            try:
                # Uploads may arrive as objects with `.name` or as plain paths.
                paths = [getattr(f, "name", f) for f in files]
                filenames = [os.path.basename(p) for p in paths]

                with ThreadPoolExecutor(max_workers=4) as executor:
                    futures = [
                        executor.submit(
                            convert_file_to_json,
                            p,
                            os.path.splitext(p)[1].lstrip(".").lower()
                        )
                        for p in paths
                    ]
                    # Collect in submission order so the combined text (and
                    # the report hash derived from it) is deterministic.
                    converted = [sanitize_utf8(future.result()) for future in futures]

                texts = []
                failures = []
                for name, raw in zip(filenames, converted):
                    parsed = json.loads(raw)
                    if "content" in parsed:
                        texts.append(parsed.get("content", ""))
                    elif "rows" in parsed:
                        texts.append(str(parsed["rows"]))
                    else:
                        # Conversion failed: report it instead of dropping the file.
                        failures.append(f"{name}: {parsed.get('error', 'no extractable content')}")

                combined_filename = " + ".join(filenames)
                combined_content = "\n".join(texts)

                if not combined_content.strip():
                    # Never produce a "no oversights" report from no input.
                    detail = "; ".join(failures) if failures else "the files contain no text"
                    yield "", None, f"⚠️ No readable content could be extracted ({detail})."
                    return

                yield "", None, "🔍 Analyzing content..."

                # Process the complete document
                full_report = analyze_large_document(
                    combined_content,
                    combined_filename,
                    agent
                )

                # Save report to file
                file_hash_value = hashlib.md5(combined_content.encode()).hexdigest()
                report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt")
                with open(report_path, "w", encoding="utf-8") as f:
                    f.write(full_report)

                final_status = "✅ Analysis complete!"
                if failures:
                    final_status += " ⚠️ Skipped: " + "; ".join(failures)

                # The component is created hidden; reveal it with the file.
                yield full_report, gr.update(value=report_path, visible=True), final_status

            except Exception as e:
                error_msg = f" Error during analysis: {str(e)}"
                print(error_msg)
                yield "", None, error_msg

        # UI event handlers
        send_btn.click(
            fn=analyze,
            inputs=[file_upload, msg_input],
            outputs=[report_output, download_output, status],
            api_name="analyze"
        )

        clear_btn.click(
            fn=lambda: ("", gr.update(value=None, visible=False), ""),
            inputs=None,
            outputs=[report_output, download_output, status]
        )

    return demo
408
 
409
  if __name__ == "__main__":
410
  print("🚀 Launching app...")
411
  agent = init_agent()
412
  demo = create_ui(agent)
413
+ demo.queue(
414
+ api_open=False,
415
+ max_size=20
416
+ ).launch(
417
  server_name="0.0.0.0",
418
  server_port=7860,
419
  show_error=True,