Ali2206 committed (verified)
Commit 58a777c · Parent(s): 029059d

Update app.py

Files changed (1):
  1. app.py  +59 -82

app.py CHANGED
@@ -98,77 +98,66 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"

-def excel_to_ndjson(file_path: str) -> Generator[str, None, None]:
-    """Stream Excel file as NDJSON for maximum performance"""
+def excel_to_json(file_path: str) -> List[dict]:
+    """Convert Excel file to JSON data with proper error handling"""
     try:
-        # Use openpyxl in streaming mode
-        with pd.ExcelFile(file_path, engine='openpyxl') as xls:
-            for sheet_name in xls.sheet_names:
-                for chunk in pd.read_excel(
-                    xls,
-                    sheet_name=sheet_name,
-                    header=None,
-                    dtype=str,
-                    chunksize=1000
-                ):
-                    for _, row in chunk.iterrows():
-                        yield json.dumps({
-                            "sheet": sheet_name,
-                            "row": row.fillna("").astype(str).tolist()
-                        }) + "\n"
+        # First try with openpyxl (faster for xlsx)
+        try:
+            df = pd.read_excel(file_path, engine='openpyxl', header=None, dtype=str)
+        except Exception:
+            # Fall back to xlrd if needed
+            df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
+
+        # Convert to list of lists
+        content = df.fillna("").astype(str).values.tolist()
+
+        return [{
+            "filename": os.path.basename(file_path),
+            "rows": content
+        }]
     except Exception as e:
-        logger.error(f"Error streaming Excel: {e}")
-        raise
+        logger.error(f"Error processing Excel file: {e}")
+        return [{"error": f"Error processing Excel file: {str(e)}"}]

-def csv_to_ndjson(file_path: str) -> Generator[str, None, None]:
-    """Stream CSV file as NDJSON for maximum performance"""
+def csv_to_json(file_path: str) -> List[dict]:
+    """Convert CSV file to JSON data with proper error handling"""
     try:
-        for chunk in pd.read_csv(
+        df = pd.read_csv(
             file_path,
             header=None,
             dtype=str,
-            chunksize=1000,
             encoding_errors='replace',
             on_bad_lines='skip'
-        ):
-            for _, row in chunk.iterrows():
-                yield json.dumps({
-                    "row": row.fillna("").astype(str).tolist()
-                }) + "\n"
+        )
+        content = df.fillna("").astype(str).values.tolist()
+
+        return [{
+            "filename": os.path.basename(file_path),
+            "rows": content
+        }]
     except Exception as e:
-        logger.error(f"Error streaming CSV: {e}")
-        raise
+        logger.error(f"Error processing CSV file: {e}")
+        return [{"error": f"Error processing CSV file: {str(e)}"}]

-def stream_file_to_json(file_path: str, file_type: str) -> Generator[str, None, None]:
-    """Stream file content as JSON chunks"""
+def process_file(file_path: str, file_type: str) -> List[dict]:
+    """Process file based on type and return JSON data"""
     try:
         if file_type == "pdf":
             text = extract_all_pages(file_path)
-            yield json.dumps({
+            return [{
                 "filename": os.path.basename(file_path),
                 "content": text,
                 "status": "initial"
-            })
-        elif file_type in ["csv", "xls", "xlsx"]:
-            # Stream the file content
-            yield json.dumps({
-                "filename": os.path.basename(file_path),
-                "streaming": True,
-                "type": file_type
-            })
-
-            if file_type == "csv":
-                stream_gen = csv_to_ndjson(file_path)
-            else:
-                stream_gen = excel_to_ndjson(file_path)
-
-            for chunk in stream_gen:
-                yield chunk
+            }]
+        elif file_type in ["xls", "xlsx"]:
+            return excel_to_json(file_path)
+        elif file_type == "csv":
+            return csv_to_json(file_path)
         else:
-            yield json.dumps({"error": f"Unsupported file type: {file_type}"})
+            return [{"error": f"Unsupported file type: {file_type}"}]
     except Exception as e:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
-        yield json.dumps({"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"})
+        return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]

 def log_system_usage(tag=""):
     try:
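
Note on the hunk above: the removed excel_to_ndjson could never actually stream, because pd.read_excel, unlike pd.read_csv, accepts no chunksize argument, so that call would raise a TypeError at runtime; the eager excel_to_json replacement avoids the issue. Also worth knowing: the engine='xlrd' fallback only helps for legacy .xls files, since xlrd 2.x dropped .xlsx support. If lazy row iteration over .xlsx is ever wanted again, openpyxl's read-only mode can supply it. A minimal sketch, not part of this commit (iter_xlsx_rows_ndjson is a hypothetical name):

    # Sketch only: lazy NDJSON rows from an .xlsx file using openpyxl's
    # read-only mode, which walks rows without loading the whole sheet.
    import json
    from openpyxl import load_workbook

    def iter_xlsx_rows_ndjson(file_path: str):
        wb = load_workbook(file_path, read_only=True, data_only=True)
        try:
            for ws in wb.worksheets:
                for row in ws.iter_rows(values_only=True):
                    yield json.dumps({
                        "sheet": ws.title,
                        "row": ["" if cell is None else str(cell) for cell in row],
                    }) + "\n"
        finally:
            wb.close()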
@@ -272,15 +261,6 @@ def init_agent():
     logger.info("Agent Ready")
     return agent

-def batched(iterable, n):
-    """Batch data into tuples of length n. The last batch may be shorter."""
-    it = iter(iterable)
-    while True:
-        batch = list(islice(it, n))
-        if not batch:
-            return
-        yield batch
-
 def create_ui(agent):
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown("<h1 style='text-align: center;'>🩺 Clinical Oversight Assistant</h1>")
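
Aside on the deleted batched helper: Python 3.12 ships identical behavior as itertools.batched, so on newer runtimes the hand-rolled version (and its islice import) was already redundant:

    # Stdlib equivalent (Python 3.12+); yields tuples, the last one shorter.
    from itertools import batched

    print(list(batched("ABCDEFG", 3)))
    # [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]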
@@ -306,14 +286,15 @@ Patient Record Excerpt (Chunk {0} of {1}):
         file_hash_value = ""

         if files:
-            # Process files in parallel with streaming
+            # Process files in parallel
             with ThreadPoolExecutor(max_workers=4) as executor:
                 futures = []
                 for f in files:
                     file_type = f.name.split(".")[-1].lower()
                     futures.append(executor.submit(
-                        lambda f: list(stream_file_to_json(f.name, file_type)),
-                        f
+                        process_file,
+                        f.name,
+                        file_type
                     ))

                 for future in as_completed(futures):
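
The executor.submit change above also fixes a classic closure bug: the old lambda took f as a parameter but still read file_type from the enclosing scope, so a worker that started after the loop had advanced could see another file's extension. Passing process_file, f.name, file_type binds both values at submit time. A small self-contained illustration of the pitfall, with illustrative names only:

    from concurrent.futures import ThreadPoolExecutor

    names = ["a.csv", "b.xlsx", "c.pdf"]
    with ThreadPoolExecutor(max_workers=1) as ex:
        risky, safe = [], []
        for name in names:
            ftype = name.split(".")[-1]
            # Risky: ftype is looked up when the lambda runs, not here,
            # so slow-starting tasks may all observe the final value.
            risky.append(ex.submit(lambda n: (n, ftype), name))
            # Safe: both values are bound as arguments right now.
            safe.append(ex.submit(lambda n, t: (n, t), name, ftype))
        print([fut.result() for fut in risky])  # extensions may be wrong
        print([fut.result() for fut in safe])   # always the matching pair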
@@ -321,39 +302,35 @@ Patient Record Excerpt (Chunk {0} of {1}):
                         extracted.extend(future.result())
                     except Exception as e:
                         logger.error(f"File processing error: {e}")
-                        extracted.append(json.dumps({
-                            "error": f"Error processing file: {str(e)}"
-                        }))
+                        extracted.append({"error": f"Error processing file: {str(e)}"})

         file_hash_value = file_hash(files[0].name) if files else ""
         history.append({"role": "assistant", "content": "✅ File processing complete"})
         yield history, None, ""

+        # Convert extracted data to text
+        text_content = "\n".join(json.dumps(item) for item in extracted)
+
         # Process chunks in parallel with dynamic batching
-        chunk_size = 8000  # Larger chunks reduce overhead
+        chunk_size = 8000
+        chunks = [text_content[i:i+chunk_size] for i in range(0, len(text_content), chunk_size)]
         combined_response = ""
+        batch_size = 4  # Optimal for most GPUs

         try:
-            # Convert extracted data to text chunks
-            text_content = "\n".join(extracted)
-            chunks = [text_content[i:i+chunk_size] for i in range(0, len(text_content), chunk_size)]
-
-            # Process chunks in parallel batches
-            batch_size = 4  # Optimal for most GPUs
-            total_chunks = len(chunks)
-
-            for batch_idx, batch_chunks in enumerate(batched(chunks, batch_size)):
+            for batch_idx in range(0, len(chunks), batch_size):
+                batch_chunks = chunks[batch_idx:batch_idx + batch_size]
                 batch_prompts = [
                     prompt_template.format(
-                        batch_idx * batch_size + i + 1,
-                        total_chunks,
-                        chunk=chunk[:6000]  # Slightly larger context
+                        batch_idx + i + 1,
+                        len(chunks),
+                        chunk=chunk[:6000]
                     )
                     for i, chunk in enumerate(batch_chunks)
                 ]

-                progress((batch_idx * batch_size) / total_chunks,
-                         desc=f"Analyzing batch {batch_idx + 1}/{(total_chunks + batch_size - 1) // batch_size}")
+                progress(batch_idx / len(chunks),
+                         desc=f"Analyzing batch {(batch_idx // batch_size) + 1}/{(len(chunks) + batch_size - 1) // batch_size}")

                 # Process batch in parallel
                 with ThreadPoolExecutor(max_workers=len(batch_prompts)) as executor:
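
With the batched helper gone, batch_idx in the rewritten loop counts chunk offsets (0, 4, 8, ...) rather than batch numbers, which is why the chunk label becomes batch_idx + i + 1 and the batch number (batch_idx // batch_size) + 1. A quick check of the arithmetic with assumed sizes:

    chunks = [f"chunk{n}" for n in range(10)]  # pretend there are 10 chunks
    batch_size = 4
    for batch_idx in range(0, len(chunks), batch_size):
        batch = chunks[batch_idx:batch_idx + batch_size]
        labels = [batch_idx + i + 1 for i in range(len(batch))]
        print(f"batch {(batch_idx // batch_size) + 1}: chunks {labels}")
    # batch 1: chunks [1, 2, 3, 4]
    # batch 2: chunks [5, 6, 7, 8]
    # batch 3: chunks [9, 10]

The same offset also feeds the heading in the final hunk, so each batch is labeled by its first chunk (Chunk 1, Chunk 5, Chunk 9, ...).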
@@ -381,7 +358,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                         if cleaned:
                             chunk_response += cleaned + " "

-                combined_response += f"--- Analysis for Chunk {batch_idx * batch_size + 1} ---\n{chunk_response.strip()}\n"
+                combined_response += f"--- Analysis for Chunk {batch_idx + 1} ---\n{chunk_response.strip()}\n"
                 history[-1] = {"role": "assistant", "content": combined_response.strip()}
                 yield history, None, ""
 