Spaces:

Cylanoid
/

llama_4_Medical_Fraud_Detection

Paused

App Files Files Community

Cylanoid committed on Apr 21

Commit

4d504fd

verified ·

1 Parent(s): 36b5bed

Update document_analyzer.py

Browse files

Files changed (1) hide show

document_analyzer.py +39 -249

document_analyzer.py CHANGED Viewed

@@ -1,261 +1,51 @@
 # document_analyzer.py
-# Enhanced document analysis module for healthcare fraud detection with Llama 4 (text-only)
 import torch
-import re
-from typing import List, Dict, Any
 import nltk
 from nltk.tokenize import sent_tokenize
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
 class HealthcareFraudAnalyzer:
-    def __init__(self, model, tokenizer, device=None):
         self.model = model
         self.tokenizer = tokenizer
-        self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
-        self.model.to(self.device)
-        self.model.eval()
-        self.fraud_categories = [
-            "Consent violations",
-            "Documentation issues",
-            "Visitation restrictions",
-            "Medication misuse",
-            "Chemical restraint",
-            "Fraudulent billing",
-            "False testimony",
-            "Information concealment",
-            "Patient neglect",
-            "Hospice certification issues"
-        ]
-        self.key_terms = {
-            "medication": ["haloperidol", "lorazepam", "sedation", "chemical", "restraint",
-                         "prn", "as needed", "antipsychotic", "sedative", "benadryl",
-                         "ativan", "seroquel", "comfort kit", "medication"],
-            "documentation": ["record", "documentation", "log", "chart", "note", "missing",
-                           "altered", "backdated", "omit", "selective", "inconsistent"],
-            "visitation": ["visit", "restriction", "limit", "family", "spouse", "access",
-                         "barrier", "monitor", "disruptive", "uncooperative"],
-            "consent": ["consent", "authorize", "approval", "permission", "against wishes",
-                     "refused", "decline", "without knowledge"],
-            "hospice": ["hospice", "terminal", "end of life", "palliative", "comfort care",
-                      "six months", "6 months", "prognosis", "certification"],
-            "billing": ["charge", "bill", "payment", "medicare", "medicaid", "insurance",
-                      "reimbursement", "fee", "additional", "extra"]
-        }
-    def chunk_document(self, text: str, chunk_size: int = 1024, overlap: int = 256) -> List[str]:
-        sentences = sent_tokenize(text)
-        chunks = []
-        current_chunk = ""
-        for sentence in sentences:
-            if len(current_chunk) + len(sentence) <= chunk_size:
-                current_chunk += sentence + " "
-            else:
-                chunks.append(current_chunk.strip())
-                overlap_start = max(0, len(current_chunk) - overlap)
-                current_chunk = current_chunk[overlap_start:] + sentence + " "
-        if current_chunk.strip():
-            chunks.append(current_chunk.strip())
-        return chunks
-    def analyze_chunk(self, chunk: str) -> Dict[str, Any]:
-        prompt = f"""<s>[INST] Analyze the following healthcare document text for evidence of fraud, neglect, abuse, or criminal conduct.
-Focus on: {', '.join(self.fraud_categories)}.
-Provide specific indicators and cite the relevant text.
-DOCUMENT TEXT:
-{chunk}
-ANALYSIS: [/INST]"""
-        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(self.device)
-        with torch.no_grad():
-            output = self.model.generate(
-                **inputs,
-                max_new_tokens=512,
-                temperature=0.1,
-                top_p=0.9,
-                repetition_penalty=1.2
             )
-        response = self.tokenizer.decode(output[0], skip_special_tokens=True)
-        analysis = response.split("ANALYSIS:")[-1].strip()
-        term_matches = self._find_key_terms(chunk)
-        return {
-            "analysis": analysis,
-            "term_matches": term_matches,
-            "chunk_text": chunk[:200] + "..." if len(chunk) > 200 else chunk
-        }
-    def _find_key_terms(self, text: str) -> Dict[str, List[str]]:
-        text = text.lower()
-        results = {}
-        for category, terms in self.key_terms.items():
-            matches = []
-            for term in terms:
-                pattern = r'.{0,50}' + re.escape(term) + r'.{0,50}'
-                for match in re.finditer(pattern, text):
-                    matches.append("..." + match.group(0) + "...")
-            if matches:
-                results[category] = matches
-        return results
-    def analyze_document(self, document_text: str) -> Dict[str, Any]:
-        document_text = document_text.replace('\n', ' ').replace('\r', ' ')
-        document_text = re.sub(r'\s+', ' ', document_text)
-        chunks = self.chunk_document(document_text)
-        chunk_analyses = [self.analyze_chunk(chunk) for chunk in chunks]
-        consolidated_findings = self._consolidate_analyses(chunk_analyses)
-        return {
-            "summary": self._generate_summary(consolidated_findings, document_text),
-            "detailed_findings": consolidated_findings,
-            "chunk_analyses": chunk_analyses,
-            "document_metadata": {
-                "length": len(document_text),
-                "chunk_count": len(chunks)
-            }
-        }
-    def _consolidate_analyses(self, chunk_analyses: List[Dict[str, Any]]) -> Dict[str, Any]:
-        all_term_matches = {category: [] for category in self.key_terms.keys()}
-        for analysis in chunk_analyses:
-            for category, matches in analysis.get("term_matches", {}).items():
-                all_term_matches[category].extend(matches)
-        for category in all_term_matches:
-            if all_term_matches[category]:
-                deduplicated = []
-                for match in all_term_matches[category]:
-                    if not any(match in other and match != other for other in all_term_matches[category]):
-                        deduplicated.append(match)
-                all_term_matches[category] = deduplicated[:5]
-        categorized_findings = {category: [] for category in self.fraud_categories}
-        for analysis in chunk_analyses:
-            analysis_text = analysis.get("analysis", "")
-            for category in self.fraud_categories:
-                if category.lower() in analysis_text.lower():
-                    sentences = sent_tokenize(analysis_text)
-                    relevant = [s for s in sentences if category.lower() in s.lower()]
-                    if relevant:
-                        categorized_findings[category].extend(relevant)
-        return {
-            "term_matches": all_term_matches,
-            "categorized_findings": categorized_findings
-        }
-    def _generate_summary(self, findings: Dict[str, Any], full_text: str) -> str:
-        indicator_counts = {
-            category: len(findings["categorized_findings"].get(category, []))
-            for category in self.fraud_categories
-        }
-        term_match_counts = {
-            category: len(matches)
-            for category, matches in findings["term_matches"].items()
-        }
-        sorted_categories = sorted(
-            self.fraud_categories,
-            key=lambda x: indicator_counts.get(x, 0) + term_match_counts.get(x, 0),
-            reverse=True
-        )
-        summary_lines = ["# Healthcare Fraud Detection Analysis", ""]
-        summary_lines.append("## Key Concerns Identified")
-        for category in sorted_categories[:3]:
-            if indicator_counts.get(category, 0) > 0 or term_match_counts.get(category, 0) > 0:
-                summary_lines.append(f"### {category}")
-                if findings["categorized_findings"].get(category):
-                    summary_lines.append("Model analysis indicates:")
-                    for finding in findings["categorized_findings"].get(category, [])[:3]:
-                        summary_lines.append(f"- {finding}")
-                category_lower = category.lower().rstrip('s')
-                for term_category, matches in findings["term_matches"].items():
-                    if category_lower in term_category.lower() and matches:
-                        summary_lines.append(f"Key terms identified:")
-                        for match in matches[:3]:
-                            summary_lines.append(f"- {match}")
-                summary_lines.append("")
-        summary_lines.append("## Recommended Actions")
-        if sum(indicator_counts.values()) > 5:
-            summary_lines.append("- **Urgent review recommended** - Multiple indicators of potential fraud detected")
-            summary_lines.append("- Consider referral to appropriate regulatory authorities")
-            summary_lines.append("- Document preservation should be prioritized")
-        elif sum(indicator_counts.values()) > 2:
-            summary_lines.append("- **Further investigation recommended** - Several potential indicators identified")
-            summary_lines.append("- Conduct interviews with involved personnel")
-            summary_lines.append("- Secure additional documentation for verification")
-        else:
-            summary_lines.append("- **Monitor situation** - Limited indicators detected")
-            summary_lines.append("- Consider more specific document analysis")
-        return "\n".join(summary_lines)
-    def print_report(self, results: Dict[str, Any]) -> None:
-        print("\n" + "="*80)
-        print("HEALTHCARE FRAUD DETECTION REPORT")
-        print("="*80 + "\n")
-        print(results["summary"])
-        print("\n" + "="*80)
-        print("DETAILED FINDINGS")
-        print("="*80)
-        for category, findings in results["detailed_findings"]["categorized_findings"].items():
-            if findings:
-                print(f"\n## {category.upper()}")
-                for i, finding in enumerate(findings, 1):
-                    print(f"{i}. {finding}")
-        print("\n" + "="*80)
-        print("KEY TERM MATCHES")
-        print("="*80)
-        for category, matches in results["detailed_findings"]["term_matches"].items():
-            if matches:
-                print(f"\n## {category.upper()}")
-                for match in matches:
-                    print(f"- {match}")
-        print("\n" + "="*80 + "\n")
-def analyze_pdf_for_fraud(pdf_path, model, tokenizer):
-    import pdfplumber
-    with pdfplumber.open(pdf_path) as pdf:
-        text = ""
-        for page in pdf.pages:
-            text += page.extract_text() or ""
-    analyzer = HealthcareFraudAnalyzer(model, tokenizer)
-    results = analyzer.analyze_document(text)
-    analyzer.print_report(results)
-    return results

 # document_analyzer.py
+# Analyzer for healthcare fraud detection using Llama 4 Maverick (text-only)
 import torch
 import nltk
 from nltk.tokenize import sent_tokenize
 class HealthcareFraudAnalyzer:
+    def __init__(self, model, tokenizer, accelerator):
         self.model = model
         self.tokenizer = tokenizer
+        self.accelerator = accelerator
+        self.device = self.accelerator.device
+        try:
+            nltk.data.find('tokenizers/punkt')
+        except LookupError:
+            nltk.download('punkt')
+    def analyze_document(self, sentences):
+        fraud_indicators = []
+        for sentence in sentences:
+            prompt = (
+                f"Analyze the following sentence for potential healthcare fraud indicators, "
+                f"such as consent violations, medication misuse, or billing irregularities. "
+                f"Provide a reason and confidence score (0-1). "
+                f"Sentence: {sentence}\nOutput format: {{'fraud_detected': bool, 'reason': str, 'confidence': float}}"
             )
+            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=256,
+                    temperature=0.7,
+                    top_p=0.9,
+                    do_sample=True
+                )
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            try:
+                result = eval(response) if response.startswith("{") else {"fraud_detected": False, "reason": "Invalid response", "confidence": 0.0}
+                if result["fraud_detected"]:
+                    fraud_indicators.append({
+                        "sentence": sentence,
+                        "reason": result["reason"],
+                        "confidence": result["confidence"]
+                    })
+            except:
+                continue
+        return fraud_indicators