Update app.py
app.py CHANGED
@@ -4,7 +4,7 @@ import pandas as pd
 import pdfplumber
 import json
 import gradio as gr
-from typing import List,
+from typing import List, Dict, Optional, Generator
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import hashlib
 import shutil
@@ -16,13 +16,7 @@ import torch
 import gc
 from diskcache import Cache
 import time
-
-import pyarrow.parquet as pq
-import pyarrow.csv as pc
-import numpy as np
-from functools import partial
-from itertools import islice
-import io
+from transformers import AutoTokenizer
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -56,6 +50,9 @@ from txagent.txagent import TxAgent
 # Initialize cache with 10GB limit
 cache = Cache(file_cache_dir, size_limit=10 * 1024**3)
 
+# Initialize tokenizer for precise chunking
+tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")
+
 def sanitize_utf8(text: str) -> str:
     return text.encode("utf-8", "ignore").decode("utf-8")
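For context on the new module-level tokenizer: a minimal sketch of how it can be used to measure prompt length in tokens. count_tokens is an illustrative helper, not part of app.py, and the sketch assumes the model repo can be downloaded from the Hugging Face Hub.

    from transformers import AutoTokenizer

    # Mirrors the module-level initialization in app.py
    tokenizer = AutoTokenizer.from_pretrained("mims-harvard/TxAgent-T1-Llama-3.1-8B")

    def count_tokens(text: str) -> int:
        # encode() returns a list of token ids; its length is the token count
        return len(tokenizer.encode(text))

    print(count_tokens("Patient Record Excerpt (Chunk 1 of 4): ..."))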
@@ -98,8 +95,8 @@ def extract_all_pages(file_path: str, progress_callback=None) -> str:
         logger.error("PDF processing error: %s", e)
         return f"PDF processing error: {str(e)}"
 
-def excel_to_json(file_path: str) -> List[dict]:
-    """Convert Excel file to JSON
+def excel_to_json(file_path: str) -> List[Dict]:
+    """Convert Excel file to JSON with optimized processing"""
     try:
         # First try with openpyxl (faster for xlsx)
         try:
@@ -108,38 +105,46 @@ def excel_to_json(file_path: str) -> List[dict]:
             # Fall back to xlrd if needed
             df = pd.read_excel(file_path, engine='xlrd', header=None, dtype=str)
 
-        # Convert to list of lists
-        content = df.
+        # Convert to list of lists with null handling
+        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
 
         return [{
             "filename": os.path.basename(file_path),
-            "rows": content
+            "rows": content,
+            "type": "excel"
         }]
     except Exception as e:
         logger.error(f"Error processing Excel file: {e}")
         return [{"error": f"Error processing Excel file: {str(e)}"}]
 
-def csv_to_json(file_path: str) -> List[dict]:
-    """Convert CSV file to JSON
+def csv_to_json(file_path: str) -> List[Dict]:
+    """Convert CSV file to JSON with optimized processing"""
     try:
-        df = pd.read_csv(
+        # Read CSV in chunks if large
+        chunks = []
+        for chunk in pd.read_csv(
             file_path,
             header=None,
             dtype=str,
             encoding_errors='replace',
-            on_bad_lines='skip'
-        )
-        content = df.
+            on_bad_lines='skip',
+            chunksize=10000
+        ):
+            chunks.append(chunk)
+
+        df = pd.concat(chunks) if chunks else pd.DataFrame()
+        content = df.where(pd.notnull(df), "").astype(str).values.tolist()
 
         return [{
             "filename": os.path.basename(file_path),
-            "rows": content
+            "rows": content,
+            "type": "csv"
         }]
     except Exception as e:
         logger.error(f"Error processing CSV file: {e}")
         return [{"error": f"Error processing CSV file: {str(e)}"}]
 
-def process_file(file_path: str, file_type: str) -> List[dict]:
+def process_file(file_path: str, file_type: str) -> List[Dict]:
     """Process file based on type and return JSON data"""
     try:
         if file_type == "pdf":
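As a standalone illustration of the chunked-read pattern that csv_to_json now uses (the "sample.csv" path is a placeholder, and the chunksize mirrors the value in the diff):

    import pandas as pd

    parts = []
    for part in pd.read_csv(
        "sample.csv",             # placeholder path
        header=None,
        dtype=str,
        encoding_errors="replace",
        on_bad_lines="skip",
        chunksize=10000,          # stream the file 10,000 rows at a time
    ):
        parts.append(part)

    df = pd.concat(parts) if parts else pd.DataFrame()
    # NaN cells become empty strings before converting to a list of row lists
    rows = df.where(pd.notnull(df), "").astype(str).values.tolist()
    print(len(rows), "rows")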
@@ -147,7 +152,8 @@ def process_file(file_path: str, file_type: str) -> List[dict]:
             return [{
                 "filename": os.path.basename(file_path),
                 "content": text,
-                "status": "initial"
+                "status": "initial",
+                "type": "pdf"
             }]
         elif file_type in ["xls", "xlsx"]:
             return excel_to_json(file_path)
@@ -159,6 +165,15 @@ def process_file(file_path: str, file_type: str) -> List[dict]:
         logger.error("Error processing %s: %s", os.path.basename(file_path), e)
         return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
 
+def tokenize_and_chunk(text: str, max_tokens: int = 1800) -> List[str]:
+    """Split text into chunks based on token count"""
+    tokens = tokenizer.encode(text)
+    chunks = []
+    for i in range(0, len(tokens), max_tokens):
+        chunk_tokens = tokens[i:i + max_tokens]
+        chunks.append(tokenizer.decode(chunk_tokens))
+    return chunks
+
 def log_system_usage(tag=""):
     try:
         cpu = psutil.cpu_percent(interval=1)
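A small usage sketch for the new tokenize_and_chunk helper, assuming the tokenizer and helper defined above are in scope; the sample text is a placeholder, and re-encoded chunk lengths can differ slightly from the slice size when the tokenizer adds special tokens.

    # Placeholder clinical-style text, repeated so it spans several chunks
    sample = "Patient presented with chest pain and shortness of breath. " * 400
    chunks = tokenize_and_chunk(sample, max_tokens=1800)
    print(len(chunks), "chunks produced")
    # Each chunk was decoded from at most 1800 token ids
    print([len(tokenizer.encode(c, add_special_tokens=False)) for c in chunks[:3]])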
@@ -308,14 +323,13 @@ Patient Record Excerpt (Chunk {0} of {1}):
             history.append({"role": "assistant", "content": "✅ File processing complete"})
             yield history, None, ""
 
-            # Convert extracted data to text
+            # Convert extracted data to JSON text
             text_content = "\n".join(json.dumps(item) for item in extracted)
 
-            #
-
-            chunks = [text_content[i:i+chunk_size] for i in range(0, len(text_content), chunk_size)]
+            # Tokenize and chunk the content properly
+            chunks = tokenize_and_chunk(text_content)
             combined_response = ""
-            batch_size =
+            batch_size = 2  # Reduced batch size to prevent token overflow
 
             try:
                 for batch_idx in range(0, len(chunks), batch_size):
@@ -324,7 +338,7 @@ Patient Record Excerpt (Chunk {0} of {1}):
                         prompt_template.format(
                             batch_idx + i + 1,
                             len(chunks),
-                            chunk=chunk[:
+                            chunk=chunk[:1800]  # Conservative chunk size
                         )
                         for i, chunk in enumerate(batch_chunks)
                     ]
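For reference, a standalone sketch of the batching shape used in the loop above. The template body and chunk list here are illustrative stand-ins, not the exact prompt or data from app.py.

    # Stand-in template with the same positional/keyword fields as the real prompt
    prompt_template = "Patient Record Excerpt (Chunk {0} of {1}):\n{chunk}"
    chunks = ["chunk-a", "chunk-b", "chunk-c"]   # placeholder chunks
    batch_size = 2

    for batch_idx in range(0, len(chunks), batch_size):
        batch_chunks = chunks[batch_idx:batch_idx + batch_size]
        batch_prompts = [
            prompt_template.format(
                batch_idx + i + 1,   # 1-based chunk number across all batches
                len(chunks),         # total number of chunks
                chunk=chunk[:1800],  # hard cap on characters fed into the prompt
            )
            for i, chunk in enumerate(batch_chunks)
        ]
        # each batch of prompts would then be sent to the model in turn
        print(batch_prompts)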