Ali2206 committed
Commit fb2ccc1 · verified · 1 Parent(s): 44280bd

Update app.py

Files changed (1): app.py +151 -78
app.py CHANGED
@@ -12,6 +12,7 @@ import re
 import psutil
 import subprocess
 from datetime import datetime
+import tiktoken

 # Persistent directory setup
 persistent_dir = "/data/hf_cache"
@@ -44,8 +45,10 @@ MEDICAL_KEYWORDS = {
     'allergies', 'summary', 'impression', 'findings', 'recommendations',
     'conclusion', 'history', 'examination', 'progress', 'discharge'
 }
-CHUNK_SIZE = 10000  # Increased chunk size for better context
-MAX_TOKENS = 12000  # Maximum tokens for analysis
+TOKENIZER = "cl100k_base"  # Matches Llama 3's tokenizer
+MAX_MODEL_LEN = 8000  # Conservative estimate for model context
+CHUNK_TOKEN_SIZE = MAX_MODEL_LEN // 2  # Target chunk size
+MEDICAL_SECTION_HEADER = "=== MEDICAL SECTION ==="

 def sanitize_utf8(text: str) -> str:
     """Ensure text is UTF-8 clean."""
@@ -56,14 +59,21 @@ def file_hash(path: str) -> str:
     with open(path, "rb") as f:
         return hashlib.md5(f.read()).hexdigest()

-def extract_all_pages(file_path: str) -> Tuple[str, int]:
+def count_tokens(text: str) -> int:
+    """Count tokens using the same method as the model"""
+    encoding = tiktoken.get_encoding(TOKENIZER)
+    return len(encoding.encode(text))
+
+def extract_all_pages_with_token_count(file_path: str) -> Tuple[str, int, int]:
     """
-    Extract all pages from PDF with smart prioritization of medical sections.
-    Returns (extracted_text, total_pages)
+    Extract all pages from PDF with token counting.
+    Returns (extracted_text, total_pages, total_tokens)
     """
     try:
         text_chunks = []
         total_pages = 0
+        total_tokens = 0
+
         with pdfplumber.open(file_path) as pdf:
             total_pages = len(pdf.pages)

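For context (an illustrative sketch, not part of the commit): the counter relies on tiktoken's cl100k_base encoding, which is an OpenAI tokenizer, so the counts are a close proxy rather than an exact match for a Llama-family model. Minimal usage, assuming tiktoken is installed:

import tiktoken

# cl100k_base is the encoding named by the TOKENIZER constant above
encoding = tiktoken.get_encoding("cl100k_base")
sample = "Patient presents with chest pain and shortness of breath."
print(len(encoding.encode(sample)))  # token count for the sample sentence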
@@ -71,18 +81,22 @@ def extract_all_pages(file_path: str) -> Tuple[str, int]:
                 page_text = page.extract_text() or ""
                 lower_text = page_text.lower()

-                # Include all pages but mark sections with medical keywords
+                # Mark medical sections
                 if any(re.search(rf'\b{kw}\b', lower_text) for kw in MEDICAL_KEYWORDS):
-                    text_chunks.append(f"=== MEDICAL SECTION (Page {i+1}) ===\n{page_text.strip()}")
+                    section_header = f"\n{MEDICAL_SECTION_HEADER} (Page {i+1})\n"
+                    text_chunks.append(section_header + page_text.strip())
+                    total_tokens += count_tokens(section_header)
                 else:
-                    text_chunks.append(f"=== Page {i+1} ===\n{page_text.strip()}")
+                    text_chunks.append(f"\n=== Page {i+1} ===\n{page_text.strip()}")
+
+                total_tokens += count_tokens(page_text)

-        return "\n\n".join(text_chunks), total_pages
+        return "\n".join(text_chunks), total_pages, total_tokens
     except Exception as e:
-        return f"PDF processing error: {str(e)}", 0
+        return f"PDF processing error: {str(e)}", 0, 0

 def convert_file_to_json(file_path: str, file_type: str) -> str:
-    """Convert file to JSON format with caching, processing all content."""
+    """Convert file to JSON format with caching and token counting."""
     try:
         h = file_hash(file_path)
         cache_path = os.path.join(file_cache_dir, f"{h}.json")
@@ -92,11 +106,12 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
                 return f.read()

         if file_type == "pdf":
-            text, total_pages = extract_all_pages(file_path)
+            text, total_pages, total_tokens = extract_all_pages_with_token_count(file_path)
             result = json.dumps({
                 "filename": os.path.basename(file_path),
                 "content": text,
                 "total_pages": total_pages,
+                "total_tokens": total_tokens,
                 "status": "complete"
             })
         elif file_type == "csv":
@@ -106,15 +121,22 @@ def convert_file_to_json(file_path: str, file_type: str) -> str:
                                       skip_blank_lines=False, on_bad_lines="skip", chunksize=1000):
                 chunks.append(chunk.fillna("").astype(str).values.tolist())
             content = [item for sublist in chunks for item in sublist]
-            result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+            result = json.dumps({
+                "filename": os.path.basename(file_path),
+                "rows": content,
+                "total_tokens": count_tokens(str(content))
+            })
         elif file_type in ["xls", "xlsx"]:
             try:
-                # Read Excel in chunks if possible
                 df = pd.read_excel(file_path, engine="openpyxl", header=None, dtype=str)
             except Exception:
                 df = pd.read_excel(file_path, engine="xlrd", header=None, dtype=str)
             content = df.fillna("").astype(str).values.tolist()
-            result = json.dumps({"filename": os.path.basename(file_path), "rows": content})
+            result = json.dumps({
+                "filename": os.path.basename(file_path),
+                "rows": content,
+                "total_tokens": count_tokens(str(content))
+            })
         else:
             result = json.dumps({"error": f"Unsupported file type: {file_type}"})

@@ -204,6 +226,40 @@ def format_final_report(analysis_results: List[str], filename: str) -> str:

     return "\n".join(report)

+def split_content_by_tokens(content: str, max_tokens: int = CHUNK_TOKEN_SIZE) -> List[str]:
+    """Split content into chunks that fit within token limits"""
+    paragraphs = re.split(r"\n\s*\n", content)
+    chunks = []
+    current_chunk = []
+    current_tokens = 0
+
+    for para in paragraphs:
+        para_tokens = count_tokens(para)
+        if para_tokens > max_tokens:
+            # Handle very long paragraphs by splitting sentences
+            sentences = re.split(r'(?<=[.!?])\s+', para)
+            for sent in sentences:
+                sent_tokens = count_tokens(sent)
+                if current_tokens + sent_tokens > max_tokens:
+                    chunks.append("\n\n".join(current_chunk))
+                    current_chunk = [sent]
+                    current_tokens = sent_tokens
+                else:
+                    current_chunk.append(sent)
+                    current_tokens += sent_tokens
+        elif current_tokens + para_tokens > max_tokens:
+            chunks.append("\n\n".join(current_chunk))
+            current_chunk = [para]
+            current_tokens = para_tokens
+        else:
+            current_chunk.append(para)
+            current_tokens += para_tokens
+
+    if current_chunk:
+        chunks.append("\n\n".join(current_chunk))
+
+    return chunks
+
 def init_agent():
     """Initialize the TxAgent with proper configuration."""
     print("🔁 Initializing model...")
@@ -229,72 +285,74 @@ def init_agent():
     print("✅ Agent Ready")
     return agent

-def analyze_large_document(content: str, filename: str, agent: TxAgent) -> str:
-    """Analyze large documents by processing in logical sections."""
-    # Split content into logical sections
-    sections = re.split(r"(=== MEDICAL SECTION|=== Page \d+ ===)", content)
-    sections = [s.strip() for s in sections if s.strip()]
-
+def analyze_complete_document(content: str, filename: str, agent: TxAgent) -> str:
+    """Analyze complete document with proper chunking and token management"""
+    chunks = split_content_by_tokens(content)
     analysis_results = []
-    current_chunk = ""
-
-    for section in sections:
-        # If adding this section would exceed chunk size, analyze current chunk
-        if len(current_chunk) + len(section) > CHUNK_SIZE and current_chunk:
-            analysis_results.append(process_chunk(current_chunk, filename, agent))
-            current_chunk = section
-        else:
-            current_chunk += "\n\n" + section

-    # Process the last chunk
-    if current_chunk:
-        analysis_results.append(process_chunk(current_chunk, filename, agent))
-
-    return format_final_report(analysis_results, filename)
+    for i, chunk in enumerate(chunks):
+        try:
+            # Create context-aware prompt
+            prompt = f"""
+Analyze this section ({i+1}/{len(chunks)}) of medical records for clinical oversights.
+Focus on factual evidence from the content only.

-def process_chunk(chunk: str, filename: str, agent: TxAgent) -> str:
-    """Process a single chunk of the document."""
-    prompt = f"""
-Analyze this section of medical records for clinical oversights. Focus on:
-1. Critical findings needing immediate attention
-2. Potential missed diagnoses
-3. Medication conflicts
-4. Assessment gaps
-5. Follow-up recommendations
+**File:** {filename}
+**Content:**
+{chunk}

-File: {filename}
-Content:
-{chunk[:CHUNK_SIZE]}
+Provide concise findings under these headings:
+1. CRITICAL FINDINGS (urgent issues)
+2. MISSED DIAGNOSES (with supporting evidence)
+3. MEDICATION ISSUES (specific conflicts)
+4. ASSESSMENT GAPS (missing evaluations)
+5. FOLLOW-UP RECOMMENDATIONS (specific actions)

-Provide concise findings in bullet points under relevant headings.
-Focus on factual evidence from the content.
+Be concise and evidence-based:
 """
-
-    full_response = ""
-    for output in agent.run_gradio_chat(
-        message=prompt,
-        history=[],
-        temperature=0.1,  # Lower temperature for more factual responses
-        max_new_tokens=1024,
-        max_token=MAX_TOKENS,
-        call_agent=False,
-        conversation=[],
-    ):
-        if output is None:
+            # Ensure prompt + chunk doesn't exceed model limits
+            prompt_tokens = count_tokens(prompt)
+            chunk_tokens = count_tokens(chunk)
+
+            if prompt_tokens + chunk_tokens > MAX_MODEL_LEN - 1024:  # Leave room for response
+                # Dynamically adjust chunk size
+                max_chunk_tokens = MAX_MODEL_LEN - prompt_tokens - 1024
+                adjusted_chunk = ""
+                tokens_used = 0
+                for para in re.split(r"\n\s*\n", chunk):
+                    para_tokens = count_tokens(para)
+                    if tokens_used + para_tokens <= max_chunk_tokens:
+                        adjusted_chunk += "\n\n" + para
+                        tokens_used += para_tokens
+                    else:
+                        break
+                chunk = adjusted_chunk.strip()
+
+            response = ""
+            for output in agent.run_gradio_chat(
+                message=prompt,
+                history=[],
+                temperature=0.1,
+                max_new_tokens=1024,
+                max_token=MAX_MODEL_LEN,
+                call_agent=False,
+                conversation=[],
+            ):
+                if output:
+                    if isinstance(output, list):
+                        for m in output:
+                            if hasattr(m, 'content'):
+                                response += clean_response(m.content)
+                    elif isinstance(output, str):
+                        response += clean_response(output)
+
+            if response:
+                analysis_results.append(response)
+        except Exception as e:
+            print(f"Error processing chunk {i}: {str(e)}")
             continue
-
-        if isinstance(output, list):
-            for m in output:
-                if hasattr(m, 'content') and m.content:
-                    cleaned = clean_response(m.content)
-                    if cleaned:
-                        full_response += cleaned + "\n"
-        elif isinstance(output, str) and output.strip():
-            cleaned = clean_response(output)
-            if cleaned:
-                full_response += cleaned + "\n"

-    return full_response
+    return format_final_report(analysis_results, filename)

 def create_ui(agent):
     """Create the Gradio interface."""
@@ -316,7 +374,7 @@ def create_ui(agent):
                 label="Analysis Focus"
             )
             with gr.Row():
-                send_btn = gr.Button("Analyze Full Document", variant="primary")
+                send_btn = gr.Button("Analyze Complete Documents", variant="primary")
                 clear_btn = gr.Button("Clear")
             status = gr.Textbox(label="Status", interactive=False)

@@ -338,11 +396,12 @@ def create_ui(agent):
             yield "", None, "⚠️ Please upload at least one file to analyze."
             return

-        yield "", None, "⏳ Processing documents..."
+        yield "", None, "⏳ Processing documents (this may take several minutes for large files)..."

         # Process all files completely
         file_contents = []
         filenames = []
+        total_tokens = 0

         with ThreadPoolExecutor(max_workers=4) as executor:
             futures = []
@@ -356,7 +415,14 @@ def create_ui(agent):

             results = []
             for future in as_completed(futures):
-                results.append(sanitize_utf8(future.result()))
+                result = sanitize_utf8(future.result())
+                results.append(result)
+                try:
+                    data = json.loads(result)
+                    if "total_tokens" in data:
+                        total_tokens += data["total_tokens"]
+                except:
+                    pass

             file_contents = results

@@ -367,11 +433,11 @@ def create_ui(agent):
                 for fc in file_contents
             ])

-            yield "", None, "🔍 Analyzing content..."
+            yield "", None, f"🔍 Analyzing content ({total_tokens//1000}k tokens)..."

             try:
                 # Process the complete document
-                full_report = analyze_large_document(
+                full_report = analyze_complete_document(
                     combined_content,
                     combined_filename,
                     agent
@@ -408,6 +474,13 @@

 if __name__ == "__main__":
     print("🚀 Launching app...")
+    # Install tiktoken if not available
+    try:
+        import tiktoken
+    except ImportError:
+        print("Installing tiktoken...")
+        subprocess.run([sys.executable, "-m", "pip", "install", "tiktoken"])
+
     agent = init_agent()
     demo = create_ui(agent)
     demo.queue(
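Taken together, the commit's new flow is: extract with token counts, split by token budget, then analyze chunk by chunk. A simplified driver sketch (hypothetical file path; TxAgent wiring omitted):

text, pages, tokens = extract_all_pages_with_token_count("records.pdf")
print(f"{pages} pages, ~{tokens} tokens")
for i, chunk in enumerate(split_content_by_tokens(text)):
    print(f"chunk {i}: {count_tokens(chunk)} tokens")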