Ali2206 committed on
Commit 76162fc · verified · 1 Parent(s): b1ea34e

Update app.py

Files changed (1):
  1. app.py +179 -130

app.py CHANGED
@@ -111,6 +111,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("PDF has 0 pages - may be corrupted or empty")
                 return []
 
             results = []
@@ -128,7 +129,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
             for chunk in page_chunks:
                 chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
                 if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
-                    logger.warning(f"Total tokens exceed model limit. Stopping.")
+                    logger.warning("Total tokens exceed model limit. Stopping.")
                     return results
                 results.append(chunk)
                 total_tokens += chunk_tokens
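
The two hunks above change only logging, but they sit inside a token-budgeted packing loop: chunks are accepted until the running token count would pass MODEL_MAX_TOKENS. A minimal sketch of that pattern, with a Hugging Face tokenizer and an 8192-token budget standing in for the app's own tokenizer and limit:

# Sketch only: the tokenizer, the budget, and greedy_pack are stand-ins, not the app's code.
from typing import List
from transformers import AutoTokenizer

MODEL_MAX_TOKENS = 8192                                # assumed budget
tokenizer = AutoTokenizer.from_pretrained("gpt2")      # placeholder tokenizer

def greedy_pack(page_chunks: List[str]) -> List[str]:
    """Accept chunks until the next one would push the total past the budget."""
    results, total_tokens = [], 0
    for chunk in page_chunks:
        chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
        if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
            break  # the app logs a warning and returns what already fits
        results.append(chunk)
        total_tokens += chunk_tokens
    return results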
@@ -139,60 +140,60 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
             del pdf
             gc.collect()
 
+        if not results:
+            logger.error("No content extracted from PDF - may be scanned or encrypted")
+            return ["PDF appears to be empty or unreadable"]
+
         return results
     except Exception as e:
         logger.error(f"PDF processing error: {e}")
         return [f"PDF processing error: {str(e)}"]
 
 def excel_to_json(file_path: str) -> List[Dict]:
-    try:
-        # Try with openpyxl first
+    """Enhanced Excel processing with multiple engine support"""
+    engines = ['openpyxl', 'xlrd', 'odf']
+    last_error = None
+
+    for engine in engines:
         try:
-            with pd.ExcelFile(file_path, engine='openpyxl') as excel_file:
+            with pd.ExcelFile(file_path, engine=engine) as excel_file:
                 sheets = excel_file.sheet_names
+                if not sheets:
+                    return [{"error": "No sheets found in Excel file"}]
+
                 results = []
                 for sheet_name in sheets:
-                    df = pd.read_excel(
-                        excel_file,
-                        sheet_name=sheet_name,
-                        header=None,
-                        dtype=str,
-                        na_filter=False
-                    )
-                    if not df.empty:
-                        results.append({
-                            "filename": f"{os.path.basename(file_path)} - {sheet_name}",
-                            "rows": df.values.tolist(),
-                            "type": "excel"
-                        })
-                return results if results else [{"error": "No data found in any sheet"}]
-        except Exception as openpyxl_error:
-            # Fallback to xlrd
-            try:
-                with pd.ExcelFile(file_path, engine='xlrd') as excel_file:
-                    sheets = excel_file.sheet_names
-                    results = []
-                    for sheet_name in sheets:
+                    try:
                         df = pd.read_excel(
                             excel_file,
                             sheet_name=sheet_name,
                             header=None,
                             dtype=str,
-                            na_filter=False
+                            na_filter=False,
+                            engine=engine
                         )
                         if not df.empty:
+                            # Convert all cells to string and clean
+                            df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else "")
                             results.append({
                                 "filename": f"{os.path.basename(file_path)} - {sheet_name}",
                                 "rows": df.values.tolist(),
-                                "type": "excel"
+                                "type": "excel",
+                                "sheet": sheet_name,
+                                "dimensions": f"{len(df)} rows x {len(df.columns)} cols"
                             })
-                        return results if results else [{"error": "No data found in any sheet"}]
-            except Exception as xlrd_error:
-                logger.error(f"Excel processing failed: {xlrd_error}")
-                return [{"error": f"Excel processing failed: {str(xlrd_error)}"}]
-    except Exception as e:
-        logger.error(f"Excel file opening error: {e}")
-        return [{"error": f"Excel file opening error: {str(e)}"}]
+                    except Exception as sheet_error:
+                        logger.warning(f"Error processing sheet {sheet_name}: {sheet_error}")
+                        continue
+
+                if results:
+                    logger.info(f"Successfully processed Excel file with {engine} engine")
+                    return results
+        except Exception as engine_error:
+            last_error = engine_error
+            continue
+
+    return [{"error": f"Failed to process Excel file with all engines. Last error: {str(last_error)}"}]
 
 def csv_to_json(file_path: str) -> List[Dict]:
     try:
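
The rewritten excel_to_json replaces a hard-coded openpyxl-then-xlrd fallback with a loop over candidate engines plus per-sheet error isolation. A self-contained sketch of the same engine-fallback idea (the 'sample.xlsx' path and the two-engine list are placeholders; each engine is a separate install):

# Sketch: try pandas Excel engines in order and keep the first that can open the file.
import pandas as pd

def read_first_sheet(path: str) -> pd.DataFrame:
    last_error = None
    for engine in ("openpyxl", "xlrd"):          # placeholder list; the diff also tries 'odf'
        try:
            with pd.ExcelFile(path, engine=engine) as xls:
                # header=None keeps every row; dtype=str avoids numeric coercion
                return pd.read_excel(xls, sheet_name=xls.sheet_names[0],
                                     header=None, dtype=str, na_filter=False)
        except Exception as err:                 # wrong format or engine not installed
            last_error = err
    raise RuntimeError(f"all engines failed: {last_error}")

# df = read_first_sheet("sample.xlsx")

One small caveat on the new per-sheet cleanup: DataFrame.applymap is deprecated in recent pandas releases in favour of DataFrame.map, so the cell-stripping lambda may emit a deprecation warning on newer installs.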
@@ -209,10 +210,14 @@ def csv_to_json(file_path: str) -> List[Dict]:
             chunks.append(chunk)
 
         df = pd.concat(chunks) if chunks else pd.DataFrame()
+        if df.empty:
+            return [{"error": "CSV file is empty or could not be read"}]
+
         return [{
             "filename": os.path.basename(file_path),
             "rows": df.values.tolist(),
-            "type": "csv"
+            "type": "csv",
+            "dimensions": f"{len(df)} rows x {len(df.columns)} cols"
         }]
     except Exception as e:
         logger.error(f"CSV processing error: {e}")
@@ -220,23 +225,44 @@ def csv_to_json(file_path: str) -> List[Dict]:
 
 @lru_cache(maxsize=100)
 def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
+    """Enhanced file processing with detailed logging"""
    try:
+        logger.info(f"Processing file: {file_path} (type: {file_type})")
+
         if file_type == "pdf":
             chunks = extract_all_pages(file_path)
+            if not chunks or (len(chunks) == 1 and "error" in chunks[0]):
+                return [{"error": chunks[0] if chunks else "PDF appears to be empty"}]
             return [{
                 "filename": os.path.basename(file_path),
                 "content": chunk,
                 "status": "initial",
-                "type": "pdf"
-            } for chunk in chunks]
+                "type": "pdf",
+                "page": i+1
+            } for i, chunk in enumerate(chunks)]
+
         elif file_type in ["xls", "xlsx"]:
-            return excel_to_json(file_path)
+            result = excel_to_json(file_path)
+            if "error" in result[0]:
+                logger.error(f"Excel processing failed: {result[0]['error']}")
+            else:
+                logger.info(f"Excel processing successful - found {len(result)} sheets")
+            return result
+
         elif file_type == "csv":
-            return csv_to_json(file_path)
+            result = csv_to_json(file_path)
+            if "error" in result[0]:
+                logger.error(f"CSV processing failed: {result[0]['error']}")
+            else:
+                logger.info(f"CSV processing successful - found {len(result[0]['rows'])} rows")
+            return result
+
         else:
+            logger.warning(f"Unsupported file type: {file_type}")
             return [{"error": f"Unsupported file type: {file_type}"}]
+
     except Exception as e:
-        logger.error(f"Error processing file: {e}")
+        logger.error(f"Error processing {file_path}: {str(e)}", exc_info=True)
         return [{"error": f"Error processing file: {str(e)}"}]
 
 def clean_response(text: str) -> str:
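
process_file_cached is memoized with functools.lru_cache, so repeated calls with the same (file_path, file_type) pair skip re-parsing; the analyze() hunk below additionally keeps its own cache keyed on file_hash(f.name), which ties cached results to file contents rather than to the path. A minimal sketch of the lru_cache behaviour (process_file and "report.pdf" are stand-ins, not the app's names):

# Sketch: identical argument pairs are served from the cache without re-running the body.
from functools import lru_cache

@lru_cache(maxsize=100)
def process_file(file_path: str, file_type: str) -> tuple:
    print(f"parsing {file_path}")          # runs only on a cache miss
    return (file_path, file_type)          # placeholder for parsed rows/chunks

process_file("report.pdf", "pdf")          # miss: parses
process_file("report.pdf", "pdf")          # hit: no print
print(process_file.cache_info())           # CacheInfo(hits=1, misses=1, ...)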
@@ -317,113 +343,136 @@ Patient Record Excerpt (Chunk {0} of {1}):
     progress_bar = gr.Progress()
 
     def analyze(message: str, history: List[List[str]], files: List, progress=gr.Progress()):
-        """Optimized analysis pipeline with correct message formatting"""
-        # Initialize with empty history if None
-        if history is None:
-            history = []
-
-        # Append user message
-        history.append([message, None])
-        yield history, None, ""
+        """Enhanced analysis with detailed file processing feedback"""
+        try:
+            if history is None:
+                history = []
+
+            history.append([message, None])
+            yield history, None, ""
 
-        extracted = []
-        file_hash_value = ""
-
-        if files:
+            if not files:
+                history[-1][1] = "❌ Please upload a file to analyze"
+                yield history, None, "No files uploaded"
+                return
+
+            extracted = []
+            file_hash_value = ""
+
             for f in files:
                 file_type = f.name.split(".")[-1].lower()
-                cache_key = f"{file_hash(f.name)}_{file_type}"
+                logger.info(f"Processing file: {f.name} (type: {file_type})")
 
+                cache_key = f"{file_hash(f.name)}_{file_type}"
                 if cache_key in cache:
                     cached_data = cache[cache_key]
                     if isinstance(cached_data, list) and len(cached_data) > 0:
                         extracted.extend(cached_data)
                         history[-1][1] = f"✅ Using cached data for {os.path.basename(f.name)}"
                         yield history, None, ""
-                    else:
-                        history[-1][1] = f"❌ Cached data empty for {os.path.basename(f.name)}. Reprocessing..."
-                        yield history, None, ""
-                else:
-                    try:
-                        result = process_file_cached(f.name, file_type)
-                        if result and not (len(result) == 1 and "error" in result[0]):
-                            cache[cache_key] = result
-                            extracted.extend(result)
-                            history[-1][1] = f"✅ Processed {os.path.basename(f.name)}"
-                            yield history, None, ""
-                        else:
-                            error_msg = result[0]["error"] if result else "Unknown error"
-                            history[-1][1] = f"❌ Failed to process {os.path.basename(f.name)}: {error_msg}"
-                            yield history, None, error_msg
-                            return
-                    except Exception as e:
-                        logger.error(f"File processing error: {e}")
-                        history[-1][1] = f"❌ Error processing {os.path.basename(f.name)}: {str(e)}"
-                        yield history, None, str(e)
+                        continue
+
+                try:
+                    result = process_file_cached(f.name, file_type)
+                    if "error" in result[0]:
+                        history[-1][1] = f"❌ Error processing {os.path.basename(f.name)}: {result[0]['error']}"
+                        yield history, None, result[0]['error']
                         return
+
+                    cache[cache_key] = result
+                    extracted.extend(result)
+                    history[-1][1] = f"✅ Processed {os.path.basename(f.name)}"
+                    yield history, None, ""
+                except Exception as e:
+                    logger.error(f"File processing error: {e}", exc_info=True)
+                    history[-1][1] = f"❌ Critical error processing {os.path.basename(f.name)}"
+                    yield history, None, str(e)
+                    return
 
             file_hash_value = file_hash(files[0].name) if files else ""
-
-        if not extracted:
-            history[-1][1] = "❌ No valid content extracted. Please upload a supported file."
-            yield history, None, "No valid content extracted."
-            return
-
-        chunks = [item["content"] for item in extracted if "content" in item]
-        if not chunks:
-            history[-1][1] = "❌ No processable content found in the file."
-            yield history, None, "No processable content found."
-            return
-
-        combined_response = ""
-        report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
-
-        try:
-            for batch_idx in range(0, len(chunks), BATCH_SIZE):
-                batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
-
-                progress(batch_idx / len(chunks),
-                         desc=f"Processing batch {(batch_idx // BATCH_SIZE) + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")
-
-                with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
-                    futures = {
-                        executor.submit(
-                            agent.run_quick_summary,
-                            chunk, 0.2, 256, 1024
-                        ): idx
-                        for idx, chunk in enumerate(batch_chunks)
-                    }
+
+            # Debug extracted content
+            logger.info(f"Extracted content summary:")
+            for item in extracted:
+                if "content" in item:
+                    logger.info(f"- {item['filename']}: {len(item['content'])} chars")
+                elif "rows" in item:
+                    logger.info(f"- {item['filename']}: {len(item['rows'])} rows")
+
+            if not extracted:
+                history[-1][1] = "❌ No valid content extracted from files"
+                yield history, None, "No valid content extracted"
+                return
+
+            chunks = []
+            for item in extracted:
+                if "content" in item:
+                    chunks.append(item["content"])
+                elif "rows" in item:
+                    # Convert Excel/CSV rows to text
+                    rows_text = "\n".join([", ".join(map(str, row)) for row in item["rows"]])
+                    chunks.append(f"=== {item['filename']} ===\n{rows_text}")
+
+            if not chunks:
+                history[-1][1] = "❌ No processable content found in files"
+                yield history, None, "No processable content found"
+                return
+
+            combined_response = ""
+            report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
+
+            try:
+                for batch_idx in range(0, len(chunks), BATCH_SIZE):
+                    batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
+
+                    progress(batch_idx / len(chunks),
+                             desc=f"Processing batch {(batch_idx // BATCH_SIZE) + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")
 
-                    for future in as_completed(futures):
-                        chunk_idx = futures[future]
-                        try:
-                            response = clean_response(future.result())
-                            if response:
-                                combined_response += f"\n--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{response}\n"
-                                history[-1][1] = combined_response.strip()
+                    with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
+                        futures = {
+                            executor.submit(
+                                agent.run_quick_summary,
+                                chunk, 0.2, 256, 1024
+                            ): idx
+                            for idx, chunk in enumerate(batch_chunks)
+                        }
+
+                        for future in as_completed(futures):
+                            chunk_idx = futures[future]
+                            try:
+                                response = clean_response(future.result())
+                                if response:
+                                    combined_response += f"\n--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{response}\n"
+                                    history[-1][1] = combined_response.strip()
+                                    yield history, None, ""
+                            except Exception as e:
+                                logger.error(f"Chunk processing error: {e}")
+                                history[-1][1] = f"Error processing chunk: {str(e)}"
                                 yield history, None, ""
-                        except Exception as e:
-                            logger.error(f"Chunk processing error: {e}")
-                            history[-1][1] = f"Error processing chunk: {str(e)}"
-                            yield history, None, ""
-                        finally:
-                            del future
-                            torch.cuda.empty_cache()
-                            gc.collect()
-
-            summary = "Analysis complete. " + ("Download full report below." if report_path and os.path.exists(report_path) else "")
-            history.append(["Analysis completed", None])
-            history[-1][1] = summary
-            yield history, report_path, summary
+                            finally:
+                                del future
+                                torch.cuda.empty_cache()
+                                gc.collect()
+
+                summary = "Analysis complete. " + ("Download full report below." if report_path and os.path.exists(report_path) else "")
+                history.append(["Analysis completed", None])
+                history[-1][1] = summary
+                yield history, report_path, summary
+
+            except Exception as e:
+                logger.error(f"Analysis error: {e}")
+                history.append(["Analysis failed", None])
+                history[-1][1] = f"❌ Error occurred: {str(e)}"
+                yield history, None, f"Error occurred: {str(e)}"
+            finally:
+                torch.cuda.empty_cache()
+                gc.collect()
 
         except Exception as e:
-            logger.error(f"Analysis error: {e}")
-            history.append(["Analysis failed", None])
-            history[-1][1] = f"❌ Error occurred: {str(e)}"
-            yield history, None, f"Error occurred: {str(e)}"
-        finally:
-            torch.cuda.empty_cache()
-            gc.collect()
+            logger.error(f"Unexpected error in analysis: {e}")
+            history.append(["System error", None])
+            history[-1][1] = f"❌ System error occurred: {str(e)}"
+            yield history, None, f"System error: {str(e)}"
 
     send_btn.click(
         analyze,
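
The rewritten analyze() wraps the whole pipeline in an outer try/except, converts Excel/CSV rows to text chunks, and keeps the batched thread-pool fan-out over agent.run_quick_summary. The fan-out pattern on its own, with a local summarize() standing in for the model call and assumed batch/worker sizes:

# Sketch of the batch + ThreadPoolExecutor + as_completed pattern; summarize() is a stand-in.
from concurrent.futures import ThreadPoolExecutor, as_completed

BATCH_SIZE, MAX_WORKERS = 4, 4                      # assumed values
chunks = [f"chunk {i}" for i in range(10)]          # placeholder input

def summarize(text: str) -> str:
    return text.upper()                             # stand-in for agent.run_quick_summary

combined = []
for batch_idx in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[batch_idx:batch_idx + BATCH_SIZE]
    with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
        futures = {executor.submit(summarize, c): i for i, c in enumerate(batch)}
        for future in as_completed(futures):
            idx = futures[future]
            combined.append(f"--- Chunk {batch_idx + idx + 1} ---\n{future.result()}")

print("\n".join(combined))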
 