Update app.py
app.py
CHANGED
@@ -111,6 +111,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("PDF has 0 pages - may be corrupted or empty")
                 return []

             results = []
@@ -128,7 +129,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
             for chunk in page_chunks:
                 chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
                 if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
-                    logger.warning(
+                    logger.warning("Total tokens exceed model limit. Stopping.")
                     return results
                 results.append(chunk)
                 total_tokens += chunk_tokens
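Note: the guard above stops appending chunks once the running token count would exceed MODEL_MAX_TOKENS. A minimal, self-contained sketch of the same pattern, with a pluggable counter standing in for the app's Hugging Face tokenizer (the helper name and budget value here are illustrative, not part of the diff):

# Sketch: keep appending chunks until a token budget is exhausted (illustrative
# stand-in for the tokenizer-based guard in extract_all_pages; names are assumed).
from typing import Callable, List

MODEL_MAX_TOKENS = 4096  # assumed budget, mirroring the app-level constant

def take_within_budget(chunks: List[str],
                       count_tokens: Callable[[str], int],
                       budget: int = MODEL_MAX_TOKENS) -> List[str]:
    kept, total = [], 0
    for chunk in chunks:
        n = count_tokens(chunk)
        if total + n > budget:
            # Same behaviour as the diff: stop rather than truncate mid-chunk.
            break
        kept.append(chunk)
        total += n
    return kept

if __name__ == "__main__":
    # A crude whitespace count stands in for tokenizer.encode(..., add_special_tokens=False).
    print(take_within_budget(["a b c", "d e", "f g h i"], lambda s: len(s.split()), budget=5))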
@@ -139,60 +140,60 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         del pdf
         gc.collect()

+        if not results:
+            logger.error("No content extracted from PDF - may be scanned or encrypted")
+            return ["PDF appears to be empty or unreadable"]
+
         return results
     except Exception as e:
         logger.error(f"PDF processing error: {e}")
         return [f"PDF processing error: {str(e)}"]

 def excel_to_json(file_path: str) -> List[Dict]:
+    """Enhanced Excel processing with multiple engine support"""
+    engines = ['openpyxl', 'xlrd', 'odf']
+    last_error = None
+
+    for engine in engines:
         try:
-            with pd.ExcelFile(file_path, engine='openpyxl') as excel_file:
+            with pd.ExcelFile(file_path, engine=engine) as excel_file:
                 sheets = excel_file.sheet_names
+                if not sheets:
+                    return [{"error": "No sheets found in Excel file"}]
+
                 results = []
                 for sheet_name in sheets:
-                    df = pd.read_excel(
-                        excel_file,
-                        sheet_name=sheet_name,
-                        header=None,
-                        dtype=str,
-                        na_filter=False
-                    )
-                    if not df.empty:
-                        results.append({
-                            "filename": f"{os.path.basename(file_path)} - {sheet_name}",
-                            "rows": df.values.tolist(),
-                            "type": "excel"
-                        })
-            return results if results else [{"error": "No data found in any sheet"}]
-        except Exception as openpyxl_error:
-            # Fallback to xlrd
-            try:
-                with pd.ExcelFile(file_path, engine='xlrd') as excel_file:
-                    sheets = excel_file.sheet_names
-                    results = []
-                    for sheet_name in sheets:
+                    try:
                         df = pd.read_excel(
                             excel_file,
                             sheet_name=sheet_name,
                             header=None,
                             dtype=str,
-                            na_filter=False
+                            na_filter=False,
+                            engine=engine
                         )
                         if not df.empty:
+                            # Convert all cells to string and clean
+                            df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else "")
                             results.append({
                                 "filename": f"{os.path.basename(file_path)} - {sheet_name}",
                                 "rows": df.values.tolist(),
-                                "type": "excel"
+                                "type": "excel",
+                                "sheet": sheet_name,
+                                "dimensions": f"{len(df)} rows x {len(df.columns)} cols"
                             })
+                    except Exception as sheet_error:
+                        logger.warning(f"Error processing sheet {sheet_name}: {sheet_error}")
+                        continue
+
+            if results:
+                logger.info(f"Successfully processed Excel file with {engine} engine")
+                return results
+        except Exception as engine_error:
+            last_error = engine_error
+            continue
+
+    return [{"error": f"Failed to process Excel file with all engines. Last error: {str(last_error)}"}]

 def csv_to_json(file_path: str) -> List[Dict]:
     try:
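Note: the rewritten excel_to_json tries pandas engines in order (openpyxl, xlrd, odf) instead of the old openpyxl-then-xlrd fallback. A reduced, standalone sketch of that fallback pattern, assuming pandas plus the optional engine packages are installed (this only mirrors the loop structure, not the full function):

# Sketch of the engine-fallback pattern introduced above, reduced to its core.
import pandas as pd

def first_working_engine(path, engines=("openpyxl", "xlrd", "odf")):
    last_error = None
    for engine in engines:
        try:
            with pd.ExcelFile(path, engine=engine) as xls:
                return engine, xls.sheet_names
        except Exception as exc:  # missing engine, wrong format, corrupt file, ...
            last_error = exc
    raise RuntimeError(f"No engine could open {path}: {last_error}")

# Usage (placeholder path):
# engine, names = first_working_engine("example.xlsx")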
@@ -209,10 +210,14 @@ def csv_to_json(file_path: str) -> List[Dict]:
             chunks.append(chunk)

         df = pd.concat(chunks) if chunks else pd.DataFrame()
+        if df.empty:
+            return [{"error": "CSV file is empty or could not be read"}]
+
         return [{
             "filename": os.path.basename(file_path),
             "rows": df.values.tolist(),
-            "type": "csv"
+            "type": "csv",
+            "dimensions": f"{len(df)} rows x {len(df.columns)} cols"
         }]
     except Exception as e:
         logger.error(f"CSV processing error: {e}")
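Note: csv_to_json assembles the DataFrame from chunks before the new emptiness check. For context, a minimal sketch of that chunked-read pattern; the chunk size, path, and read_csv arguments below are illustrative, since the actual call is outside this hunk:

# Illustrative chunked CSV read; mirrors the chunks/concat pattern visible above.
import pandas as pd

def read_csv_chunked(path: str, chunksize: int = 10_000) -> pd.DataFrame:
    chunks = []
    for chunk in pd.read_csv(path, header=None, dtype=str, chunksize=chunksize):
        chunks.append(chunk)
    return pd.concat(chunks) if chunks else pd.DataFrame()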
@@ -220,23 +225,44 @@ def csv_to_json(file_path: str) -> List[Dict]:

 @lru_cache(maxsize=100)
 def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
+    """Enhanced file processing with detailed logging"""
     try:
+        logger.info(f"Processing file: {file_path} (type: {file_type})")
+
         if file_type == "pdf":
             chunks = extract_all_pages(file_path)
+            if not chunks or (len(chunks) == 1 and "error" in chunks[0]):
+                return [{"error": chunks[0] if chunks else "PDF appears to be empty"}]
             return [{
                 "filename": os.path.basename(file_path),
                 "content": chunk,
                 "status": "initial",
-                "type": "pdf"
+                "type": "pdf",
+                "page": i+1
+            } for i, chunk in enumerate(chunks)]
+
         elif file_type in ["xls", "xlsx"]:
+            result = excel_to_json(file_path)
+            if "error" in result[0]:
+                logger.error(f"Excel processing failed: {result[0]['error']}")
+            else:
+                logger.info(f"Excel processing successful - found {len(result)} sheets")
+            return result
+
         elif file_type == "csv":
+            result = csv_to_json(file_path)
+            if "error" in result[0]:
+                logger.error(f"CSV processing failed: {result[0]['error']}")
+            else:
+                logger.info(f"CSV processing successful - found {len(result[0]['rows'])} rows")
+            return result
+
         else:
+            logger.warning(f"Unsupported file type: {file_type}")
             return [{"error": f"Unsupported file type: {file_type}"}]
+
     except Exception as e:
-        logger.error(f"Error processing
+        logger.error(f"Error processing {file_path}: {str(e)}", exc_info=True)
         return [{"error": f"Error processing file: {str(e)}"}]

 def clean_response(text: str) -> str:
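Note: process_file_cached keeps the @lru_cache(maxsize=100) decorator, so results are memoized per (file_path, file_type) argument pair for the life of the process. A minimal sketch of that behaviour; the body below is a stand-in for the app's parsing, not its actual code:

# Illustrative: lru_cache memoizes by exact argument values, so a repeated call
# with the same (path, type) pair skips the expensive body.
from functools import lru_cache

calls = {"count": 0}

@lru_cache(maxsize=100)
def parse(path: str, file_type: str) -> str:
    calls["count"] += 1          # stand-in for real parsing work
    return f"parsed {path} as {file_type}"

parse("report.pdf", "pdf")
parse("report.pdf", "pdf")       # served from the cache
assert calls["count"] == 1
print(parse.cache_info())        # hits=1 misses=1 maxsize=100 currsize=1

Worth noting: lru_cache keys on the path string, not the file contents, so a file that changes on disk at the same path would still hit the cache.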
@@ -317,113 +343,136 @@ Patient Record Excerpt (Chunk {0} of {1}):
     progress_bar = gr.Progress()

     def analyze(message: str, history: List[List[str]], files: List, progress=gr.Progress()):
-        """
-        yield history, None, ""
+        """Enhanced analysis with detailed file processing feedback"""
+        try:
+            if history is None:
+                history = []
+
+            history.append([message, None])
+            yield history, None, ""

+            if not files:
+                history[-1][1] = "❌ Please upload a file to analyze"
+                yield history, None, "No files uploaded"
+                return
+
+            extracted = []
+            file_hash_value = ""
+
             for f in files:
                 file_type = f.name.split(".")[-1].lower()
+                logger.info(f"Processing file: {f.name} (type: {file_type})")

+                cache_key = f"{file_hash(f.name)}_{file_type}"
                 if cache_key in cache:
                     cached_data = cache[cache_key]
                     if isinstance(cached_data, list) and len(cached_data) > 0:
                         extracted.extend(cached_data)
                         history[-1][1] = f"✅ Using cached data for {os.path.basename(f.name)}"
                         yield history, None, ""
-                        cache[cache_key] = result
-                        extracted.extend(result)
-                        history[-1][1] = f"✅ Processed {os.path.basename(f.name)}"
-                        yield history, None, ""
-                    else:
-                        error_msg = result[0]["error"] if result else "Unknown error"
-                        history[-1][1] = f"❌ Failed to process {os.path.basename(f.name)}: {error_msg}"
-                        yield history, None, error_msg
-                        return
-                except Exception as e:
-                    logger.error(f"File processing error: {e}")
-                    history[-1][1] = f"❌ Error processing {os.path.basename(f.name)}: {str(e)}"
-                    yield history, None, str(e)
+                        continue
+
+                try:
+                    result = process_file_cached(f.name, file_type)
+                    if "error" in result[0]:
+                        history[-1][1] = f"❌ Error processing {os.path.basename(f.name)}: {result[0]['error']}"
+                        yield history, None, result[0]['error']
                         return
+
+                    cache[cache_key] = result
+                    extracted.extend(result)
+                    history[-1][1] = f"✅ Processed {os.path.basename(f.name)}"
+                    yield history, None, ""
+                except Exception as e:
+                    logger.error(f"File processing error: {e}", exc_info=True)
+                    history[-1][1] = f"❌ Critical error processing {os.path.basename(f.name)}"
+                    yield history, None, str(e)
+                    return

             file_hash_value = file_hash(files[0].name) if files else ""
+
+            # Debug extracted content
+            logger.info(f"Extracted content summary:")
+            for item in extracted:
+                if "content" in item:
+                    logger.info(f"- {item['filename']}: {len(item['content'])} chars")
+                elif "rows" in item:
+                    logger.info(f"- {item['filename']}: {len(item['rows'])} rows")
+
+            if not extracted:
+                history[-1][1] = "❌ No valid content extracted from files"
+                yield history, None, "No valid content extracted"
+                return
+
+            chunks = []
+            for item in extracted:
+                if "content" in item:
+                    chunks.append(item["content"])
+                elif "rows" in item:
+                    # Convert Excel/CSV rows to text
+                    rows_text = "\n".join([", ".join(map(str, row)) for row in item["rows"]])
+                    chunks.append(f"=== {item['filename']} ===\n{rows_text}")
+
+            if not chunks:
+                history[-1][1] = "❌ No processable content found in files"
+                yield history, None, "No processable content found"
+                return
+
+            combined_response = ""
+            report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
+
+            try:
+                for batch_idx in range(0, len(chunks), BATCH_SIZE):
+                    batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
+
+                    progress(batch_idx / len(chunks),
+                             desc=f"Processing batch {(batch_idx // BATCH_SIZE) + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")

+                    with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
+                        futures = {
+                            executor.submit(
+                                agent.run_quick_summary,
+                                chunk, 0.2, 256, 1024
+                            ): idx
+                            for idx, chunk in enumerate(batch_chunks)
+                        }
+
+                        for future in as_completed(futures):
+                            chunk_idx = futures[future]
+                            try:
+                                response = clean_response(future.result())
+                                if response:
+                                    combined_response += f"\n--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{response}\n"
+                                    history[-1][1] = combined_response.strip()
+                                    yield history, None, ""
+                            except Exception as e:
+                                logger.error(f"Chunk processing error: {e}")
+                                history[-1][1] = f"Error processing chunk: {str(e)}"
                                 yield history, None, ""
+                            finally:
+                                del future
+                                torch.cuda.empty_cache()
+                                gc.collect()
+
+                summary = "Analysis complete. " + ("Download full report below." if report_path and os.path.exists(report_path) else "")
+                history.append(["Analysis completed", None])
+                history[-1][1] = summary
+                yield history, report_path, summary
+
+            except Exception as e:
+                logger.error(f"Analysis error: {e}")
+                history.append(["Analysis failed", None])
+                history[-1][1] = f"❌ Error occurred: {str(e)}"
+                yield history, None, f"Error occurred: {str(e)}"
+            finally:
+                torch.cuda.empty_cache()
+                gc.collect()

         except Exception as e:
-            logger.error(f"
-            history.append(["
-            history[-1][1] = f"❌
-            yield history, None, f"
-        finally:
-            torch.cuda.empty_cache()
-            gc.collect()
+            logger.error(f"Unexpected error in analysis: {e}")
+            history.append(["System error", None])
+            history[-1][1] = f"❌ System error occurred: {str(e)}"
+            yield history, None, f"System error: {str(e)}"

     send_btn.click(
         analyze,
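Note: the batching loop above submits each chunk to a ThreadPoolExecutor and streams results back as they finish via as_completed. A reduced, self-contained sketch of that pattern; the worker function and batch size are stand-ins for agent.run_quick_summary and the app's BATCH_SIZE:

# Illustrative batch/submit/as_completed pattern; the worker is a stand-in.
from concurrent.futures import ThreadPoolExecutor, as_completed

BATCH_SIZE = 2  # assumed small value for the sketch

def summarize(chunk: str) -> str:          # stand-in for agent.run_quick_summary
    return chunk.upper()

chunks = ["alpha", "bravo", "charlie", "delta"]
for batch_idx in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[batch_idx:batch_idx + BATCH_SIZE]
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        futures = {executor.submit(summarize, c): i for i, c in enumerate(batch)}
        for future in as_completed(futures):
            i = futures[future]
            print(f"chunk {batch_idx + i + 1}: {future.result()}")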
|