Update app.py
app.py
CHANGED
@@ -111,6 +111,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                logger.error("PDF has 0 pages - may be corrupted or empty")
                 return []

             results = []
@@ -128,7 +129,7 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
             for chunk in page_chunks:
                 chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
                 if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
-                    logger.warning(
+                    logger.warning("Total tokens exceed model limit. Stopping.")
                     return results
                 results.append(chunk)
                 total_tokens += chunk_tokens
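Note: the guard above stops appending chunks once the running token count would exceed MODEL_MAX_TOKENS. A minimal, self-contained sketch of the same pattern, with a pluggable counter standing in for the app's Hugging Face tokenizer (the helper name and budget value here are illustrative, not part of the diff):

# Sketch: keep appending chunks until a token budget is exhausted (illustrative
# stand-in for the tokenizer-based guard in extract_all_pages; names are assumed).
from typing import Callable, List

MODEL_MAX_TOKENS = 4096  # assumed budget, mirroring the app-level constant

def take_within_budget(chunks: List[str],
                       count_tokens: Callable[[str], int],
                       budget: int = MODEL_MAX_TOKENS) -> List[str]:
    kept, total = [], 0
    for chunk in chunks:
        n = count_tokens(chunk)
        if total + n > budget:
            # Same behaviour as the diff: stop rather than truncate mid-chunk.
            break
        kept.append(chunk)
        total += n
    return kept

if __name__ == "__main__":
    # A crude whitespace count stands in for tokenizer.encode(..., add_special_tokens=False).
    print(take_within_budget(["a b c", "d e", "f g h i"], lambda s: len(s.split()), budget=5))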
@@ -139,60 +140,60 @@ def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
         del pdf
         gc.collect()

+        if not results:
+            logger.error("No content extracted from PDF - may be scanned or encrypted")
+            return ["PDF appears to be empty or unreadable"]
+
         return results
     except Exception as e:
         logger.error(f"PDF processing error: {e}")
         return [f"PDF processing error: {str(e)}"]

 def excel_to_json(file_path: str) -> List[Dict]:
+    """Enhanced Excel processing with multiple engine support"""
+    engines = ['openpyxl', 'xlrd', 'odf']
+    last_error = None
+
+    for engine in engines:
         try:
-            with pd.ExcelFile(file_path, engine='openpyxl') as excel_file:
+            with pd.ExcelFile(file_path, engine=engine) as excel_file:
                 sheets = excel_file.sheet_names
+                if not sheets:
+                    return [{"error": "No sheets found in Excel file"}]
+
                 results = []
                 for sheet_name in sheets:
-                    df = pd.read_excel(
-                        excel_file,
-                        sheet_name=sheet_name,
-                        header=None,
-                        dtype=str,
-                        na_filter=False
-                    )
-                    if not df.empty:
-                        results.append({
-                            "filename": f"{os.path.basename(file_path)} - {sheet_name}",
-                            "rows": df.values.tolist(),
-                            "type": "excel"
-                        })
-            return results if results else [{"error": "No data found in any sheet"}]
-        except Exception as openpyxl_error:
-            # Fallback to xlrd
-            try:
-                with pd.ExcelFile(file_path, engine='xlrd') as excel_file:
-                    sheets = excel_file.sheet_names
-                    results = []
-                    for sheet_name in sheets:
+                    try:
                         df = pd.read_excel(
                             excel_file,
                             sheet_name=sheet_name,
                             header=None,
                             dtype=str,
-                            na_filter=False
+                            na_filter=False,
+                            engine=engine
                         )
                         if not df.empty:
+                            # Convert all cells to string and clean
+                            df = df.applymap(lambda x: str(x).strip() if pd.notna(x) else "")
                             results.append({
                                 "filename": f"{os.path.basename(file_path)} - {sheet_name}",
                                 "rows": df.values.tolist(),
-                                "type": "excel"
+                                "type": "excel",
+                                "sheet": sheet_name,
+                                "dimensions": f"{len(df)} rows x {len(df.columns)} cols"
                             })
+                    except Exception as sheet_error:
+                        logger.warning(f"Error processing sheet {sheet_name}: {sheet_error}")
+                        continue
+
+            if results:
+                logger.info(f"Successfully processed Excel file with {engine} engine")
+                return results
+        except Exception as engine_error:
+            last_error = engine_error
+            continue
+
+    return [{"error": f"Failed to process Excel file with all engines. Last error: {str(last_error)}"}]

 def csv_to_json(file_path: str) -> List[Dict]:
     try:
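Note: the rewritten excel_to_json tries pandas engines in order (openpyxl, xlrd, odf) instead of the old openpyxl-then-xlrd fallback. A reduced, standalone sketch of that fallback pattern, assuming pandas plus the optional engine packages are installed (this only mirrors the loop structure, not the full function):

# Sketch of the engine-fallback pattern introduced above, reduced to its core.
import pandas as pd

def first_working_engine(path, engines=("openpyxl", "xlrd", "odf")):
    last_error = None
    for engine in engines:
        try:
            with pd.ExcelFile(path, engine=engine) as xls:
                return engine, xls.sheet_names
        except Exception as exc:  # missing engine, wrong format, corrupt file, ...
            last_error = exc
    raise RuntimeError(f"No engine could open {path}: {last_error}")

# Usage (placeholder path):
# engine, names = first_working_engine("example.xlsx")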
@@ -209,10 +210,14 @@ def csv_to_json(file_path: str) -> List[Dict]:
             chunks.append(chunk)

         df = pd.concat(chunks) if chunks else pd.DataFrame()
+        if df.empty:
+            return [{"error": "CSV file is empty or could not be read"}]
+
         return [{
             "filename": os.path.basename(file_path),
             "rows": df.values.tolist(),
-            "type": "csv"
+            "type": "csv",
+            "dimensions": f"{len(df)} rows x {len(df.columns)} cols"
         }]
     except Exception as e:
         logger.error(f"CSV processing error: {e}")
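Note: csv_to_json assembles the DataFrame from chunks before the new emptiness check. For context, a minimal sketch of that chunked-read pattern; the chunk size, path, and read_csv arguments below are illustrative, since the actual call is outside this hunk:

# Illustrative chunked CSV read; mirrors the chunks/concat pattern visible above.
import pandas as pd

def read_csv_chunked(path: str, chunksize: int = 10_000) -> pd.DataFrame:
    chunks = []
    for chunk in pd.read_csv(path, header=None, dtype=str, chunksize=chunksize):
        chunks.append(chunk)
    return pd.concat(chunks) if chunks else pd.DataFrame()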
@@ -220,23 +225,44 @@ def csv_to_json(file_path: str) -> List[Dict]:

 @lru_cache(maxsize=100)
 def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
+    """Enhanced file processing with detailed logging"""
     try:
+        logger.info(f"Processing file: {file_path} (type: {file_type})")
+
         if file_type == "pdf":
             chunks = extract_all_pages(file_path)
+            if not chunks or (len(chunks) == 1 and "error" in chunks[0]):
+                return [{"error": chunks[0] if chunks else "PDF appears to be empty"}]
             return [{
                 "filename": os.path.basename(file_path),
                 "content": chunk,
                 "status": "initial",
-                "type": "pdf"
+                "type": "pdf",
+                "page": i+1
+            } for i, chunk in enumerate(chunks)]
+
         elif file_type in ["xls", "xlsx"]:
+            result = excel_to_json(file_path)
+            if "error" in result[0]:
+                logger.error(f"Excel processing failed: {result[0]['error']}")
+            else:
+                logger.info(f"Excel processing successful - found {len(result)} sheets")
+            return result
+
         elif file_type == "csv":
+            result = csv_to_json(file_path)
+            if "error" in result[0]:
+                logger.error(f"CSV processing failed: {result[0]['error']}")
+            else:
+                logger.info(f"CSV processing successful - found {len(result[0]['rows'])} rows")
+            return result
+
         else:
+            logger.warning(f"Unsupported file type: {file_type}")
             return [{"error": f"Unsupported file type: {file_type}"}]
+
     except Exception as e:
-        logger.error(f"Error processing
+        logger.error(f"Error processing {file_path}: {str(e)}", exc_info=True)
         return [{"error": f"Error processing file: {str(e)}"}]

 def clean_response(text: str) -> str:
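Note: process_file_cached keeps the @lru_cache(maxsize=100) decorator, so results are memoized per (file_path, file_type) argument pair for the life of the process. A minimal sketch of that behaviour; the body below is a stand-in for the app's parsing, not its actual code:

# Illustrative: lru_cache memoizes by exact argument values, so a repeated call
# with the same (path, type) pair skips the expensive body.
from functools import lru_cache

calls = {"count": 0}

@lru_cache(maxsize=100)
def parse(path: str, file_type: str) -> str:
    calls["count"] += 1          # stand-in for real parsing work
    return f"parsed {path} as {file_type}"

parse("report.pdf", "pdf")
parse("report.pdf", "pdf")       # served from the cache
assert calls["count"] == 1
print(parse.cache_info())        # hits=1 misses=1 maxsize=100 currsize=1

Worth noting: lru_cache keys on the path string, not the file contents, so a file that changes on disk at the same path would still hit the cache.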
@@ -317,113 +343,136 @@ Patient Record Excerpt (Chunk {0} of {1}):
     progress_bar = gr.Progress()

     def analyze(message: str, history: List[List[str]], files: List, progress=gr.Progress()):
-        """
-        yield history, None, ""
+        """Enhanced analysis with detailed file processing feedback"""
+        try:
+            if history is None:
+                history = []
+
+            history.append([message, None])
+            yield history, None, ""

+            if not files:
+                history[-1][1] = "❌ Please upload a file to analyze"
+                yield history, None, "No files uploaded"
+                return
+
+            extracted = []
+            file_hash_value = ""
+
             for f in files:
                 file_type = f.name.split(".")[-1].lower()
+                logger.info(f"Processing file: {f.name} (type: {file_type})")

+                cache_key = f"{file_hash(f.name)}_{file_type}"
                 if cache_key in cache:
                     cached_data = cache[cache_key]
                     if isinstance(cached_data, list) and len(cached_data) > 0:
                         extracted.extend(cached_data)
                         history[-1][1] = f"✅ Using cached data for {os.path.basename(f.name)}"
                         yield history, None, ""
-                        cache[cache_key] = result
-                        extracted.extend(result)
-                        history[-1][1] = f"✅ Processed {os.path.basename(f.name)}"
-                        yield history, None, ""
-                    else:
-                        error_msg = result[0]["error"] if result else "Unknown error"
-                        history[-1][1] = f"❌ Failed to process {os.path.basename(f.name)}: {error_msg}"
-                        yield history, None, error_msg
-                        return
-                except Exception as e:
-                    logger.error(f"File processing error: {e}")
-                    history[-1][1] = f"❌ Error processing {os.path.basename(f.name)}: {str(e)}"
-                    yield history, None, str(e)
+                        continue
+
+                try:
+                    result = process_file_cached(f.name, file_type)
+                    if "error" in result[0]:
+                        history[-1][1] = f"❌ Error processing {os.path.basename(f.name)}: {result[0]['error']}"
+                        yield history, None, result[0]['error']
                         return
+
+                    cache[cache_key] = result
+                    extracted.extend(result)
+                    history[-1][1] = f"✅ Processed {os.path.basename(f.name)}"
+                    yield history, None, ""
+                except Exception as e:
+                    logger.error(f"File processing error: {e}", exc_info=True)
+                    history[-1][1] = f"❌ Critical error processing {os.path.basename(f.name)}"
+                    yield history, None, str(e)
+                    return

             file_hash_value = file_hash(files[0].name) if files else ""
+
+            # Debug extracted content
+            logger.info(f"Extracted content summary:")
+            for item in extracted:
+                if "content" in item:
+                    logger.info(f"- {item['filename']}: {len(item['content'])} chars")
+                elif "rows" in item:
+                    logger.info(f"- {item['filename']}: {len(item['rows'])} rows")
+
+            if not extracted:
+                history[-1][1] = "❌ No valid content extracted from files"
+                yield history, None, "No valid content extracted"
+                return
+
+            chunks = []
+            for item in extracted:
+                if "content" in item:
+                    chunks.append(item["content"])
+                elif "rows" in item:
+                    # Convert Excel/CSV rows to text
+                    rows_text = "\n".join([", ".join(map(str, row)) for row in item["rows"]])
+                    chunks.append(f"=== {item['filename']} ===\n{rows_text}")
+
+            if not chunks:
+                history[-1][1] = "❌ No processable content found in files"
+                yield history, None, "No processable content found"
+                return
+
+            combined_response = ""
+            report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
+
+            try:
+                for batch_idx in range(0, len(chunks), BATCH_SIZE):
+                    batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
+
+                    progress(batch_idx / len(chunks),
+                             desc=f"Processing batch {(batch_idx // BATCH_SIZE) + 1}/{(len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE}")

+                    with ThreadPoolExecutor(max_workers=min(BATCH_SIZE, MAX_WORKERS)) as executor:
+                        futures = {
+                            executor.submit(
+                                agent.run_quick_summary,
+                                chunk, 0.2, 256, 1024
+                            ): idx
+                            for idx, chunk in enumerate(batch_chunks)
+                        }
+
+                        for future in as_completed(futures):
+                            chunk_idx = futures[future]
+                            try:
+                                response = clean_response(future.result())
+                                if response:
+                                    combined_response += f"\n--- Analysis for Chunk {batch_idx + chunk_idx + 1} ---\n{response}\n"
+                                    history[-1][1] = combined_response.strip()
+                                    yield history, None, ""
+                            except Exception as e:
+                                logger.error(f"Chunk processing error: {e}")
+                                history[-1][1] = f"Error processing chunk: {str(e)}"
                                 yield history, None, ""
+                            finally:
+                                del future
+                                torch.cuda.empty_cache()
+                                gc.collect()
+
+                summary = "Analysis complete. " + ("Download full report below." if report_path and os.path.exists(report_path) else "")
+                history.append(["Analysis completed", None])
+                history[-1][1] = summary
+                yield history, report_path, summary
+
+            except Exception as e:
+                logger.error(f"Analysis error: {e}")
+                history.append(["Analysis failed", None])
+                history[-1][1] = f"❌ Error occurred: {str(e)}"
+                yield history, None, f"Error occurred: {str(e)}"
+            finally:
+                torch.cuda.empty_cache()
+                gc.collect()

         except Exception as e:
-            logger.error(f"
-            history.append(["
-            history[-1][1] = f"❌
-            yield history, None, f"
-        finally:
-            torch.cuda.empty_cache()
-            gc.collect()
+            logger.error(f"Unexpected error in analysis: {e}")
+            history.append(["System error", None])
+            history[-1][1] = f"❌ System error occurred: {str(e)}"
+            yield history, None, f"System error: {str(e)}"

     send_btn.click(
         analyze,
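Note: the batching loop above submits each chunk to a ThreadPoolExecutor and streams results back as they finish via as_completed. A reduced, self-contained sketch of that pattern; the worker function and batch size are stand-ins for agent.run_quick_summary and the app's BATCH_SIZE:

# Illustrative batch/submit/as_completed pattern; the worker is a stand-in.
from concurrent.futures import ThreadPoolExecutor, as_completed

BATCH_SIZE = 2  # assumed small value for the sketch

def summarize(chunk: str) -> str:          # stand-in for agent.run_quick_summary
    return chunk.upper()

chunks = ["alpha", "bravo", "charlie", "delta"]
for batch_idx in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[batch_idx:batch_idx + BATCH_SIZE]
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        futures = {executor.submit(summarize, c): i for i, c in enumerate(batch)}
        for future in as_completed(futures):
            i = futures[future]
            print(f"chunk {batch_idx + i + 1}: {future.result()}")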
|