Cylanoid committed
Commit 36b5bed · verified · 1 Parent(s): 9b2c756

Update app.py

Files changed (1):
    app.py +96 -177
app.py CHANGED
@@ -23,6 +23,9 @@ except LookupError:
 # Import the HealthcareFraudAnalyzer
 from document_analyzer import HealthcareFraudAnalyzer

+# Debug: Confirm file version
+print("Running updated app.py with CPU offloading (version: 2025-04-21)")
+
 # Debug: Print environment variables
 print("Environment variables:", dict(os.environ))

@@ -44,16 +47,16 @@ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})

-# Device map for CPU offloading
+# Custom device map for CPU offloading
 device_map = {
     "model.embed_tokens": 0,
-    "model.layers.0-15": 0,
-    "model.layers.16-31": "cpu",
+    "model.layers.0-15": 0,  # First 16 layers on GPU
+    "model.layers.16-31": "cpu",  # Remaining layers on CPU
     "model.norm": 0,
     "lm_head": 0
 }

-# Load model with 8-bit quantization
+# Load model with 8-bit quantization and CPU offloading
 model = Llama4ForConditionalGeneration.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
@@ -63,192 +66,108 @@ model = Llama4ForConditionalGeneration.from_pretrained(
     attn_implementation="flex_attention"
 )

-# Prepare for LoRA training
-model = prepare_model_for_kbit_training(model)
-peft_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM",
-    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
-)
-model = get_peft_model(model, peft_config)
-model.print_trainable_parameters()
-
-# Function to create training pairs
-def extract_training_pairs_from_text(text):
-    pairs = []
-    patterns = [
-        (r"(?i).*?\b(haloperidol|lorazepam|ativan)\b.*?\b(daily|routine|regular)\b.*?",
-         "Patient receives {} on a {} basis. Is this appropriate?",
-         "This may indicate inappropriate use. Regular psychotropic use without need assessment may violate standards."),
-        (r"(?i).*?\b(missing|omitted|absent|lacking)\b.*?\b(documentation|records|logs|notes)\b.*?",
-         "Facility has {} {} for care. Is this a concern?",
-         "Yes, incomplete records may indicate fraud or attempts to hide issues."),
-        (r"(?i).*?\b(restrict|limit|prevent|block)\b.*?\b(visits|visitation|access|family)\b.*?",
-         "Facility {} family {} without necessity. Is this suspicious?",
-         "Yes, restrictions may hide issues and constitute fraud when billing for care."),
-        (r"(?i).*?\b(hospice|terminal|end.of.life)\b.*?\b(not|without|lacking)\b.*?\b(evidence|decline|documentation)\b.*?",
-         "Patient on {} care {} supporting {}. Is this fraudulent?",
-         "Yes, hospice without documented decline may indicate Medicare fraud."),
-        (r"(?i).*?\b(different|contradicts|conflicts|inconsistent)\b.*?\b(records|documentation|testimony|statements)\b.*?",
-         "Records show {} {} about condition. Is this fraudulent?",
-         "Yes, contradictory records suggest fraudulent misrepresentation.")
-    ]
-
-    for pattern, input_template, output_text in patterns:
-        for match in re.finditer(pattern, text):
-            groups = match.groups()
-            if len(groups) >= 2:
-                pairs.append({"input": input_template.format(*groups), "output": output_text})
-
-    if not pairs:
-        if any(x in text.lower() for x in ["medication", "prescribed", "administered"]):
-            pairs.append({
-                "input": "Medication records show inconsistent times. Is this concerning?",
-                "output": "Yes, inconsistent timing may indicate fraud or mismanagement."
-            })
-        if any(x in text.lower() for x in ["visit", "family", "spouse"]):
-            pairs.append({
-                "input": "Staff documents visits inconsistently. Is this suspicious?",
-                "output": "Yes, selective documentation suggests fraudulent record-keeping."
-            })
-        if any(x in text.lower() for x in ["hospice", "terminal", "prognosis"]):
-            pairs.append({
-                "input": "Patient on hospice without decline. Is this fraud?",
-                "output": "Yes, lack of decline suggests fraudulent certification."
-            })
-
-    return pairs
-
-# Function to process files and train
-def train_ui(files):
+# Resize token embeddings if pad token was added
+model.resize_token_embeddings(len(tokenizer))
+
+# Initialize Accelerator
+accelerator = Accelerator()
+model = accelerator.prepare(model)
+
+# Initialize analyzer
+analyzer = HealthcareFraudAnalyzer(model, tokenizer, accelerator)
+
+# Training function
+def fine_tune_model(training_data_file, epochs=1, batch_size=2):
     try:
-        raw_text = ""
-        dataset = None
-        for file in files:
-            if file.name.endswith(".pdf"):
-                with pdfplumber.open(file.name) as pdf:
-                    for page in pdf.pages:
-                        raw_text += page.extract_text() or ""
-            elif file.name.endswith(".json"):
-                with open(file.name, "r", encoding="utf-8") as f:
-                    raw_data = json.load(f)
-                    training_data = raw_data.get("training_pairs", raw_data)
-                with open("temp_fraud_data.json", "w", encoding="utf-8") as f:
-                    json.dump({"training_pairs": training_data}, f)
-                dataset = datasets.load_dataset("json", data_files="temp_fraud_data.json")
-
-        if not raw_text and not dataset:
-            return "Error: No valid PDF or JSON data found."
-
-        if raw_text:
-            training_data = extract_training_pairs_from_text(raw_text)
-            with open("temp_fraud_data.json", "w") as f:
-                json.dump({"training_pairs": training_data}, f)
-            dataset = datasets.load_dataset("json", data_files="temp_fraud_data.json")
-
-        def tokenize_data(example):
-            formatted_text = f"<s>[INST] {example['input']} [/INST] {example['output']}</s>"
-            inputs = tokenizer(formatted_text, padding="max_length", truncation=True, max_length=4096, return_tensors="pt")
-            inputs["labels"] = inputs["input_ids"].clone()
-            return {k: v.squeeze(0) for k, v in inputs.items()}
-
-        tokenized_dataset = dataset["train"].map(tokenize_data, batched=True, remove_columns=dataset["train"].column_names)
-
-        training_args = TrainingArguments(
-            output_dir="./fine_tuned_llama4_healthcare",
-            per_device_train_batch_size=2,
-            gradient_accumulation_steps=8,
-            eval_strategy="no",
-            save_strategy="epoch",
-            save_total_limit=2,
-            num_train_epochs=5,
-            learning_rate=2e-5,
-            weight_decay=0.01,
-            logging_dir="./logs",
-            logging_steps=10,
-            bf16=True,
-            gradient_checkpointing=True,
-            optim="adamw_torch",
-            warmup_steps=100,
+        dataset = datasets.load_dataset('json', data_files=training_data_file)
+        dataset = dataset['train']
+
+        lora_config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM"
         )

-        def custom_data_collator(features):
-            return {
-                "input_ids": torch.stack([f["input_ids"] for f in features]),
-                "attention_mask": torch.stack([f["attention_mask"] for f in features]),
-                "labels": torch.stack([f["labels"] for f in features]),
-            }
-
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-            data_collator=custom_data_collator,
+        model = prepare_model_for_kbit_training(model)
+        model = get_peft_model(model, lora_config)
+
+        training_args = {
+            "output_dir": "./results",
+            "num_train_epochs": int(epochs),
+            "per_device_train_batch_size": int(batch_size),
+            "gradient_accumulation_steps": 8,
+            "optim": "adamw_torch",
+            "save_steps": 500,
+            "logging_steps": 100,
+            "learning_rate": 2e-4,
+            "fp16": True,
+            "max_grad_norm": 0.3,
+            "warmup_ratio": 0.03,
+            "lr_scheduler_type": "cosine"
+        }
+
+        trainer = accelerator.prepare(
+            Trainer(  # Trainer and TrainingArguments come from transformers, not datasets
+                model=model,
+                args=TrainingArguments(**training_args),
+                train_dataset=dataset,
+            )
         )

         trainer.train()
-        model.save_pretrained("./fine_tuned_llama4_healthcare")
-        tokenizer.save_pretrained("./fine_tuned_llama4_healthcare")
-        return f"Training completed with {len(tokenized_dataset)} examples! Model saved to ./fine_tuned_llama4_healthcare"
-
+        model.save_pretrained("./fine_tuned_model")
+        return f"Training completed with {len(dataset)} examples!"
     except Exception as e:
-        return f"Error: {str(e)}. Please check file format, dependencies, or the LLama token."
+        return f"Training failed: {str(e)}"

-# Function to analyze documents
-def analyze_document_ui(files):
+# Document analysis function
+def analyze_document(pdf_file):
     try:
-        if not files:
-            return "Error: No file uploaded. Please upload a PDF."
-
-        file = files[0]
-        if not file.name.endswith(".pdf"):
-            return "Error: Please upload a PDF file."
-
-        raw_text = ""
-        with pdfplumber.open(file.name) as pdf:
+        with pdfplumber.open(pdf_file) as pdf:
+            text = ""
             for page in pdf.pages:
-                raw_text += page.extract_text() or ""
+                text += page.extract_text() or ""

-        if not raw_text:
-            return "Error: Could not extract text from PDF."
+        sentences = sent_tokenize(text)
+        fraud_indicators = analyzer.analyze_document(sentences)

-        analyzer = HealthcareFraudAnalyzer(model, tokenizer)
-        results = analyzer.analyze_document(raw_text)
-        return results["summary"]
-
+        if not fraud_indicators:
+            return "No fraud indicators detected."
+
+        report = "Potential Fraud Indicators Detected:\n"
+        for indicator in fraud_indicators:
+            report += f"- {indicator['sentence']}\n  Reason: {indicator['reason']}\n  Confidence: {indicator['confidence']:.2f}\n"
+        return report
     except Exception as e:
-        return f"Error during analysis: {str(e)}"
+        return f"Analysis failed: {str(e)}"

-# Gradio UI
-with gr.Blocks(title="Healthcare Fraud Detection Suite") as demo:
-    gr.Markdown("# Healthcare Fraud Detection Suite")
+# Gradio interface
+with gr.Blocks(theme=gr.themes.Default()) as demo:
+    gr.Markdown("# Llama 4 Healthcare Fraud Detection")

-    with gr.Tabs():
-        with gr.TabItem("Fine-Tune Model"):
-            gr.Markdown("## Train Llama 4 for Fraud Detection")
-            gr.Markdown("Upload PDFs or JSON with training pairs.")
-            train_file_input = gr.File(label="Upload Files (PDF/JSON)", file_count="multiple")
-            train_button = gr.Button("Start Fine-Tuning")
-            train_output = gr.Textbox(label="Training Status", lines=5)
-            train_button.click(fn=train_ui, inputs=train_file_input, outputs=train_output)
-
-        with gr.TabItem("Analyze Document"):
-            gr.Markdown("## Analyze for Fraud Indicators")
-            gr.Markdown("Upload a PDF to scan for fraud, neglect, or abuse.")
-            analyze_file_input = gr.File(label="Upload PDF")
-            analyze_button = gr.Button("Analyze Document")
-            analyze_output = gr.Markdown(label="Analysis Results")
-            analyze_button.click(fn=analyze_document_ui, inputs=analyze_file_input, outputs=analyze_output)
+    with gr.Tab("Fine-Tune Model"):
+        training_data = gr.File(label="Upload Training JSON File")
+        epochs = gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Epochs")
+        batch_size = gr.Slider(minimum=1, maximum=4, value=2, step=1, label="Batch Size")
+        train_button = gr.Button("Fine-Tune")
+        train_output = gr.Textbox(label="Training Output")
+        train_button.click(
+            fn=fine_tune_model,
+            inputs=[training_data, epochs, batch_size],
+            outputs=train_output
+        )

-    gr.Markdown("""
-    ### About This Tool
-    Uses Llama 4 Maverick to detect fraud in healthcare documents.
-    Fine-tune with custom data or analyze PDFs for suspicious patterns.
-    **Note:** All analysis is local - no data is shared.
-    """)
-
-# Launch the app
-demo.launch()
+    with gr.Tab("Analyze Document"):
+        pdf_input = gr.File(label="Upload PDF Document")
+        analyze_button = gr.Button("Analyze")
+        analysis_output = gr.Textbox(label="Analysis Results")
+        analyze_button.click(
+            fn=analyze_document,
+            inputs=pdf_input,
+            outputs=analysis_output
+        )
+
+demo.launch(server_name="0.0.0.0", server_port=7860)
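
A note on the custom device_map in this commit: as far as I can tell, accelerate matches device_map keys against module names by prefix, so range-style keys such as "model.layers.0-15" would not match the real submodule names ("model.layers.0", "model.layers.1", ...), and from_pretrained should reject the map as incomplete. If that reading is correct, the intended split needs one entry per layer. A minimal sketch, assuming the 32 decoder layers implied by the "0-15 on GPU, 16-31 on CPU" comments:

    # Hypothetical replacement for the range-style keys above.
    # The layer count (32) is an assumption taken from the diff's comments.
    device_map = {"model.embed_tokens": 0, "model.norm": 0, "lm_head": 0}
    for i in range(32):
        device_map[f"model.layers.{i}"] = 0 if i < 16 else "cpu"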
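fine_tune_model passes the uploaded file straight to datasets.load_dataset("json", ...), so the file has to be in a shape that loader accepts. A minimal sketch of a compatible training file, assuming one flat record per line (JSON Lines) and reusing the input/output field names from the removed train_ui pipeline; the filename is illustrative:

    import json

    # Hypothetical example records; field names follow the old training_pairs format.
    pairs = [
        {"input": "Patient receives haloperidol on a daily basis. Is this appropriate?",
         "output": "This may indicate inappropriate use. Regular psychotropic use "
                   "without need assessment may violate standards."},
    ]
    with open("fraud_pairs.jsonl", "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair) + "\n")

A nested file in the old {"training_pairs": [...]} layout can still be loaded by pointing the json builder at that field, e.g. datasets.load_dataset("json", data_files=..., field="training_pairs"). Note that, unlike the removed train_ui, the new code hands these records to the Trainer without a tokenization step, so preprocessing along the lines of the old tokenize_data would still be needed before training can run.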
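Finally, the rewritten analyze_document calls analyzer.analyze_document(sentences) on a list of sentences and reads sentence, reason, and confidence keys from each result, and the constructor now receives the accelerator as a third argument. The real class lives in document_analyzer.py, which this commit does not touch; a hypothetical stub showing the interface the new code assumes:

    class HealthcareFraudAnalyzer:
        # Hypothetical sketch inferred from the calls in app.py; the actual
        # implementation in document_analyzer.py may differ.
        def __init__(self, model, tokenizer, accelerator=None):
            self.model = model
            self.tokenizer = tokenizer
            self.accelerator = accelerator

        def analyze_document(self, sentences):
            # Expected to return one dict per flagged sentence, carrying the
            # "sentence", "reason", and "confidence" fields that the report
            # loop in app.py formats.
            return [
                {"sentence": s, "reason": "placeholder", "confidence": 0.0}
                for s in sentences
            ]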