Update app.py
app.py
CHANGED
@@ -27,10 +27,11 @@ logger = logging.getLogger(__name__)
 
 # Constants
 MAX_TOKENS = 1800
-BATCH_SIZE =
-MAX_WORKERS =
-CHUNK_SIZE =
-MODEL_MAX_TOKENS = 131072
 
 # Persistent directory setup
 persistent_dir = "/data/hf_cache"
@@ -79,45 +80,75 @@ def file_hash(path: str) -> str:
     hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
-def extract_pdf_page(page) -> str:
-    """
     try:
         text = page.extract_text() or ""
-
     except Exception as e:
         logger.warning(f"Error extracting page {page.page_number}: {str(e)}")
-        return
 
-def extract_all_pages(file_path: str, progress_callback=None) -> str:
-    """
     try:
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
-                return
 
         results = []
         for chunk_start in range(0, total_pages, CHUNK_SIZE):
             chunk_end = min(chunk_start + CHUNK_SIZE, total_pages)
 
             with pdfplumber.open(file_path) as pdf:
-                with ThreadPoolExecutor(max_workers=min(CHUNK_SIZE,
-                    futures = [executor.submit(extract_pdf_page, pdf.pages[i])
                                for i in range(chunk_start, chunk_end)]
 
                     for future in as_completed(futures):
-
-
             if progress_callback:
                 progress_callback(min(chunk_end, total_pages), total_pages)
 
             del pdf
             gc.collect()
 
-        return
     except Exception as e:
         logger.error(f"PDF processing error: {e}")
-        return f"PDF processing error: {str(e)}"
 
 def excel_to_json(file_path: str) -> List[Dict]:
     """Optimized Excel processing with chunking"""
@@ -173,13 +204,13 @@ def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
     """Cached file processing with memory optimization"""
     try:
         if file_type == "pdf":
-
             return [{
                 "filename": os.path.basename(file_path),
-                "content":
                 "status": "initial",
                 "type": "pdf"
-            }]
         elif file_type in ["xls", "xlsx"]:
             return excel_to_json(file_path)
         elif file_type == "csv":
@@ -191,9 +222,17 @@ def process_file_cached(file_path: str, file_type: str) -> List[Dict]:
         return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
 
 def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
-    """Optimized tokenization and chunking with
     tokenizer = get_tokenizer()
     tokens = tokenizer.encode(text, add_special_tokens=False)
     chunks = []
     current_chunk = []
     current_length = 0
@@ -210,21 +249,6 @@ def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
     if current_chunk:
         chunks.append(tokenizer.decode(current_chunk))
 
-    # Validate total tokens
-    total_tokens = sum(len(tokenizer.encode(chunk, add_special_tokens=False)) for chunk in chunks)
-    if total_tokens > MODEL_MAX_TOKENS:
-        logger.warning(f"Total tokens ({total_tokens}) exceed model limit ({MODEL_MAX_TOKENS}). Truncating.")
-        truncated_chunks = []
-        current_tokens = 0
-        for chunk in chunks:
-            chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
-            if current_tokens + chunk_tokens <= MODEL_MAX_TOKENS:
-                truncated_chunks.append(chunk)
-                current_tokens += chunk_tokens
-            else:
-                break
-        chunks = truncated_chunks
-
     return chunks
 
 def log_system_usage(tag=""):
@@ -427,26 +451,22 @@ Patient Record Excerpt (Chunk {0} of {1}):
     history.append({"role": "assistant", "content": "✅ File processing complete"})
     yield history, None, ""
 
-
-
-
-
-    try:
-        chunks = tokenize_and_chunk(text_content)
-    except Exception as e:
-        logger.error(f"Tokenization error: {e}")
-        history.append({"role": "assistant", "content": f"❌ Error: Input too large to process. Please upload a smaller file."})
-        yield history, None, f"Error: Input too large to process."
         return
-
-    del text_content
-    gc.collect()
-
     combined_response = ""
     report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
     seen_responses = set()
 
     try:
         for batch_idx in range(0, len(chunks), BATCH_SIZE):
             batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
 
@@ -511,10 +531,12 @@ Patient Record Excerpt (Chunk {0} of {1}):
                     for chunk_output in future.result():
                         if isinstance(chunk_output, list):
                             for msg in chunk_output:
-                                if isinstance(msg, ChatMessage) and msg.content:
-
-
-
                 except Exception as e:
                     logger.error(f"Detailed analysis error for chunk {batch_idx + chunk_idx + 1}: {e}")
                     history[-1] = {"role": "assistant", "content": f"Error in detailed analysis for chunk {batch_idx + chunk_idx + 1}: {str(e)}"}
 
 # Constants
 MAX_TOKENS = 1800
+BATCH_SIZE = 1  # Reduced to minimize memory pressure
+MAX_WORKERS = 2
+CHUNK_SIZE = 5  # Smaller chunks for PDF processing
+MODEL_MAX_TOKENS = 131072
+MAX_TEXT_LENGTH = 500000  # Limit raw text length before tokenization
 
 # Persistent directory setup
 persistent_dir = "/data/hf_cache"
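
Taken together, the new constants put a hard ceiling on how much text is ever in flight. A rough, purely illustrative budget (plain arithmetic, not code from this commit):

    # Illustrative arithmetic only, using the constants introduced above.
    MAX_TOKENS = 1800
    MODEL_MAX_TOKENS = 131072
    MAX_TEXT_LENGTH = 500000

    print(MODEL_MAX_TOKENS // MAX_TOKENS)  # 72 -> roughly 72 full-size chunks fit under the model limit
    print(MAX_TEXT_LENGTH // 10)           # 50000 -> per-page character cap used later in extract_pdf_page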
     hash_md5.update(chunk)
     return hash_md5.hexdigest()
 
+def extract_pdf_page(page, tokenizer, max_tokens=MAX_TOKENS) -> List[str]:
+    """Extract and chunk a single page with token limit"""
     try:
         text = page.extract_text() or ""
+        text = sanitize_utf8(text)
+        if len(text) > MAX_TEXT_LENGTH // 10:  # Per-page text limit
+            logger.warning(f"Page {page.page_number} text too long ({len(text)}). Truncating.")
+            text = text[:MAX_TEXT_LENGTH // 10]
+
+        tokens = tokenizer.encode(text, add_special_tokens=False)
+        if len(tokens) > max_tokens:
+            chunks = []
+            current_chunk = []
+            current_length = 0
+            for token in tokens:
+                if current_length + 1 > max_tokens:
+                    chunks.append(tokenizer.decode(current_chunk))
+                    current_chunk = [token]
+                    current_length = 1
+                else:
+                    current_chunk.append(token)
+                    current_length += 1
+            if current_chunk:
+                chunks.append(tokenizer.decode(current_chunk))
+            return [f"=== Page {page.page_number} ===\n{c}" for c in chunks]
+        return [f"=== Page {page.page_number} ===\n{text}"]
     except Exception as e:
         logger.warning(f"Error extracting page {page.page_number}: {str(e)}")
+        return []
 
+def extract_all_pages(file_path: str, progress_callback=None) -> List[str]:
+    """Extract PDF pages with early token-based chunking"""
     try:
+        tokenizer = get_tokenizer()
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)
             if total_pages == 0:
+                return []
 
         results = []
+        total_tokens = 0
         for chunk_start in range(0, total_pages, CHUNK_SIZE):
             chunk_end = min(chunk_start + CHUNK_SIZE, total_pages)
 
             with pdfplumber.open(file_path) as pdf:
+                with ThreadPoolExecutor(max_workers=min(CHUNK_SIZE, 2)) as executor:
+                    futures = [executor.submit(extract_pdf_page, pdf.pages[i], tokenizer)
                                for i in range(chunk_start, chunk_end)]
 
                     for future in as_completed(futures):
+                        page_chunks = future.result()
+                        for chunk in page_chunks:
+                            chunk_tokens = len(tokenizer.encode(chunk, add_special_tokens=False))
+                            if total_tokens + chunk_tokens > MODEL_MAX_TOKENS:
+                                logger.warning(f"Total tokens ({total_tokens + chunk_tokens}) exceed model limit ({MODEL_MAX_TOKENS}). Stopping.")
+                                return results
+                            results.append(chunk)
+                            total_tokens += chunk_tokens
+
             if progress_callback:
                 progress_callback(min(chunk_end, total_pages), total_pages)
 
             del pdf
             gc.collect()
 
+        return results
     except Exception as e:
         logger.error(f"PDF processing error: {e}")
+        return [f"PDF processing error: {str(e)}"]
 
 def excel_to_json(file_path: str) -> List[Dict]:
     """Optimized Excel processing with chunking"""
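
For context, a minimal sketch of how the reworked extractor is meant to be driven; the file path and the callback are invented for illustration, only the function names and constants come from the diff:

    # Hypothetical caller; "record.pdf" and report() are placeholders, not part of the commit.
    def report(done: int, total: int) -> None:
        print(f"Extracted {done}/{total} pages")

    chunks = extract_all_pages("record.pdf", progress_callback=report)
    print(f"{len(chunks)} token-bounded chunks ready for the analysis loop (capped at MODEL_MAX_TOKENS overall)")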
     """Cached file processing with memory optimization"""
     try:
         if file_type == "pdf":
+            chunks = extract_all_pages(file_path)
             return [{
                 "filename": os.path.basename(file_path),
+                "content": chunk,
                 "status": "initial",
                 "type": "pdf"
+            } for chunk in chunks]
         elif file_type in ["xls", "xlsx"]:
             return excel_to_json(file_path)
         elif file_type == "csv":
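
The practical effect is that a PDF now yields one record per extracted chunk instead of one record per file; a sketch of the resulting shape, with invented values:

    # Invented example records mirroring the dict built above.
    records = [
        {"filename": "report.pdf", "content": "=== Page 1 ===\n...", "status": "initial", "type": "pdf"},
        {"filename": "report.pdf", "content": "=== Page 2 ===\n...", "status": "initial", "type": "pdf"},
    ]
    # Downstream code can then collect the chunk texts the same way the analysis loop does later in this diff.
    chunks = [r["content"] for r in records if "content" in r]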
         return [{"error": f"Error processing {os.path.basename(file_path)}: {str(e)}"}]
 
 def tokenize_and_chunk(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
+    """Optimized tokenization and chunking with early validation"""
+    if len(text) > MAX_TEXT_LENGTH:
+        logger.warning(f"Text length ({len(text)}) exceeds limit ({MAX_TEXT_LENGTH}). Truncating.")
+        text = text[:MAX_TEXT_LENGTH]
+
     tokenizer = get_tokenizer()
     tokens = tokenizer.encode(text, add_special_tokens=False)
+    if len(tokens) > MODEL_MAX_TOKENS:
+        logger.error(f"Token count ({len(tokens)}) exceeds model limit ({MODEL_MAX_TOKENS}).")
+        return [text[:MAX_TEXT_LENGTH // 10]]  # Fallback to small chunk
+
     chunks = []
     current_chunk = []
     current_length = 0
 
     if current_chunk:
         chunks.append(tokenizer.decode(current_chunk))
 
     return chunks
 
 def log_system_usage(tag=""):
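
A small usage sketch of the early-validation path, assuming app.py's get_tokenizer() and constants are available as in the rest of the module; the input string is fabricated:

    # Fabricated oversized input: ~1.2 million characters, well past MAX_TEXT_LENGTH.
    long_text = "lorem ipsum " * 100_000
    chunks = tokenize_and_chunk(long_text)   # logs a warning and truncates to 500,000 characters first
    print(len(chunks), "chunks, each decoded from at most", MAX_TOKENS, "tokens")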
     history.append({"role": "assistant", "content": "✅ File processing complete"})
     yield history, None, ""
 
+    if not extracted:
+        history.append({"role": "assistant", "content": "❌ No valid content extracted. Please upload a supported file."})
+        yield history, None, "No valid content extracted."
         return
+
     combined_response = ""
     report_path = os.path.join(report_dir, f"{file_hash_value}_report.txt") if file_hash_value else None
     seen_responses = set()
 
     try:
+        chunks = [item["content"] for item in extracted if "content" in item]
+        if not chunks:
+            history.append({"role": "assistant", "content": "❌ No processable content found in the file."})
+            yield history, None, "No processable content found."
+            return
+
         for batch_idx in range(0, len(chunks), BATCH_SIZE):
             batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
 
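
Since BATCH_SIZE is now 1, the batching loop above degenerates to one chunk per model call; a tiny, self-contained illustration of the slicing, with invented chunk strings:

    BATCH_SIZE = 1                               # value introduced by this commit
    chunks = ["chunk A", "chunk B", "chunk C"]   # invented stand-ins for the extracted text chunks
    for batch_idx in range(0, len(chunks), BATCH_SIZE):
        batch_chunks = chunks[batch_idx:batch_idx + BATCH_SIZE]
        print(batch_idx, batch_chunks)           # one chunk per batch: lowest peak memory, more model calls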
                     for chunk_output in future.result():
                         if isinstance(chunk_output, list):
                             for msg in chunk_output:
+                                if isinstance(msg, gr.ChatMessage) and msg.content:
+                                    cleaned_content = clean_response(msg.content)
+                                    if cleaned_content and cleaned_content != "No missed diagnoses identified.":
+                                        combined_response += cleaned_content + "\n"
+                                        history[-1] = {"role": "assistant", "content": combined_response.strip()}
+                                        yield history, report_path, ""
                 except Exception as e:
                     logger.error(f"Detailed analysis error for chunk {batch_idx + chunk_idx + 1}: {e}")
                     history[-1] = {"role": "assistant", "content": f"Error in detailed analysis for chunk {batch_idx + chunk_idx + 1}: {str(e)}"}
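
The switch to gr.ChatMessage assumes a Gradio version whose chat-message dataclass exposes role and content fields; a hedged sketch of the filtering this hunk performs, with clean_response() stood in by a plain strip rather than the real helper:

    import gradio as gr  # assumes gr.ChatMessage is available in the installed Gradio version

    msg = gr.ChatMessage(role="assistant", content="No missed diagnoses identified.")
    if isinstance(msg, gr.ChatMessage) and msg.content:
        cleaned = msg.content.strip()  # stand-in for clean_response(), which lives elsewhere in app.py
        if cleaned and cleaned != "No missed diagnoses identified.":
            print(cleaned)             # only novel findings are appended to combined_response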