Cylanoid committed on
Commit 579f8b6 · verified · 1 Parent(s): 4f2ef99

Delete updated_app.py

Files changed (1)
  1. updated_app.py +0 -272
updated_app.py DELETED
@@ -1,272 +0,0 @@
- # updated_app.py
- # Enhanced Gradio app for Llama 4 Maverick healthcare fraud detection
-
- import gradio as gr
- from transformers import (
-     AutoProcessor,
-     Llama4ForConditionalGeneration,
-     BitsAndBytesConfig,
-     TrainingArguments,
-     Trainer,
- )
- import datasets
- import torch
- import json
- import os
- import pdfplumber
- from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
- from accelerate import Accelerator
- import huggingface_hub
- import re
- import nltk
- from nltk.tokenize import sent_tokenize
-
- try:
-     nltk.data.find('tokenizers/punkt')
- except LookupError:
-     nltk.download('punkt')
-
- # Import the HealthcareFraudAnalyzer
- from document_analyzer import HealthcareFraudAnalyzer
25
-
26
- # Debug: Print environment variables to verify 'LLama' is present
27
- print("Environment variables:", dict(os.environ))
28
-
29
- # Retrieve the token from Hugging Face Space secrets
30
- LLama = os.getenv("LLama")
31
- if not LLama:
32
- raise ValueError("LLama token not found. Set it in Hugging Face Space secrets as 'LLama'.")
33
-
34
- # Debug: Print token (first 5 chars for security, remove in production)
35
- print(f"Retrieved LLama token: {LLama[:5]}...")
36
-
37
- # Authenticate with Hugging Face
38
- huggingface_hub.login(token=LLama)
39
-
- # Model setup
- MODEL_ID = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-
- # Load model with 8-bit (bitsandbytes) quantization to fit in 80 GB VRAM
- model = Llama4ForConditionalGeneration.from_pretrained(
-     MODEL_ID,
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
-     quantization_config=BitsAndBytesConfig(load_in_8bit=True),
-     attn_implementation="flex_attention"
- )
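- # Note: device_map="auto" lets accelerate place layers across all visible
- # GPUs (spilling to CPU if needed), so no manual .to(device) calls are required.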
-
- # Prepare model for LoRA training
- model = prepare_model_for_kbit_training(model)
- peft_config = LoraConfig(
-     r=16,
-     lora_alpha=32,
-     lora_dropout=0.05,
-     bias="none",
-     task_type="CAUSAL_LM",
-     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
- )
- model = get_peft_model(model, peft_config)
- model.print_trainable_parameters()
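- # Note: r=16 with lora_alpha=32 (a 2x scaling) on the four attention
- # projections is a common starting point, not a tuned choice;
- # print_trainable_parameters() above reports how small the trainable
- # fraction is relative to the base model.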
-
- # Function to create training pairs from document text
- def extract_training_pairs_from_text(text):
-     pairs = []
-     patterns = [
-         # Medication patterns
-         (
-             r"(?i).*?\b(haloperidol|lorazepam|ativan)\b.*?\b(daily|routine|regular)\b.*?",
-             "Patient receives {} on a {} basis. Is this appropriate medication management?",
-             "This may indicate inappropriate medication management. Regular use of psychotropic medications without documented need assessment, behavior monitoring, and attempted dose reductions may violate care standards."
-         ),
-         # Documentation patterns
-         (
-             r"(?i).*?\b(missing|omitted|absent|lacking)\b.*?\b(documentation|records|logs|notes)\b.*?",
-             "Facility has {} {} for patient care. Is this a documentation concern?",
-             "Yes, incomplete documentation is a significant red flag. Missing records may indicate attempts to conceal care issues or fraudulent billing for services not provided."
-         ),
-         # Visitation patterns
-         (
-             r"(?i).*?\b(restrict|limit|prevent|block)\b.*?\b(visits|visitation|access|family)\b.*?",
-             "Facility {} family {} without documented medical necessity. Is this suspicious?",
-             "Yes, unjustified visitation restrictions may indicate attempts to conceal care issues and prevent family oversight. This can constitute fraud when facilities bill for care while violating resident rights."
-         ),
-         # Hospice patterns
-         (
-             r"(?i).*?\b(hospice|terminal|end.of.life)\b.*?\b(not|without|lacking)\b.*?\b(evidence|decline|documentation)\b.*?",
-             "Patient placed on {} care {} supporting {}. Is this fraudulent?",
-             "Yes, hospice enrollment without documented terminal decline may indicate Medicare fraud. Hospice certification requires genuine clinical determination of terminal status with prognosis of six months or less."
-         ),
-         # Contradictory documentation
-         (
-             r"(?i).*?\b(different|contradicts|conflicts|inconsistent)\b.*?\b(records|documentation|testimony|statements)\b.*?",
-             "Records show {} {} about patient condition. Is this fraudulent documentation?",
-             "Yes, contradictory documentation is a strong indicator of fraudulent record-keeping designed to misrepresent care quality or patient condition, particularly when official records differ from internal communications."
-         )
-     ]
-
-     for pattern, input_template, output_text in patterns:
-         matches = re.finditer(pattern, text)
-         for match in matches:
-             groups = match.groups()
-             if len(groups) >= 2:
-                 input_text = input_template.format(*groups)
-                 pairs.append({
-                     "input": input_text,
-                     "output": output_text
-                 })
-
-     # Fall back to generic pairs when no pattern matched
-     if not pairs:
-         if any(x in text.lower() for x in ["medication", "prescribed", "administered"]):
-             pairs.append({
-                 "input": "Medication records show inconsistencies in administration times. Is this concerning?",
-                 "output": "Yes, inconsistent medication administration timing may indicate fraudulent documentation or medication mismanagement that could harm patients."
-             })
-         if any(x in text.lower() for x in ["visit", "family", "spouse"]):
-             pairs.append({
-                 "input": "Staff documents family visits inconsistently. Is this suspicious?",
-                 "output": "Yes, selective documentation of family visits indicates fraudulent record-keeping designed to create a false narrative about family involvement and patient responses."
-             })
-         if any(x in text.lower() for x in ["hospice", "terminal", "prognosis"]):
-             pairs.append({
-                 "input": "Patient remained on hospice for extended period without documented decline. Is this Medicare fraud?",
-                 "output": "Yes, maintaining hospice services without documented decline suggests fraudulent hospice certification to obtain Medicare benefits inappropriately."
-             })
-
-     return pairs
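- # Illustration (hypothetical input): a sentence such as
- #   "Chart review found missing documentation for the night shifts"
- # matches the documentation pattern above and yields the pair
- #   input:  "Facility has missing documentation for patient care. Is this a documentation concern?"
- #   output: "Yes, incomplete documentation is a significant red flag. ..."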
-
- # Function to process uploaded files and train
- def train_ui(files):
-     try:
-         raw_text = ""
-         dataset = None
-         for file in files:
-             if file.name.endswith(".pdf"):
-                 with pdfplumber.open(file.name) as pdf:
-                     for page in pdf.pages:
-                         raw_text += page.extract_text() or ""
-             elif file.name.endswith(".json"):
-                 with open(file.name, "r", encoding="utf-8") as f:
-                     raw_data = json.load(f)
-                 training_data = raw_data.get("training_pairs", raw_data)
-                 with open("temp_fraud_data.json", "w", encoding="utf-8") as f:
-                     json.dump({"training_pairs": training_data}, f)
-                 dataset = datasets.load_dataset("json", data_files="temp_fraud_data.json")
-
-         if not raw_text and not dataset:
-             return "Error: No valid PDF or JSON data found."
-
-         if raw_text:
-             training_data = extract_training_pairs_from_text(raw_text)
-             with open("temp_fraud_data.json", "w") as f:
-                 json.dump({"training_pairs": training_data}, f)
-             dataset = datasets.load_dataset("json", data_files="temp_fraud_data.json")
-
-         def tokenize_data(example):
-             messages = [
-                 {
-                     "role": "user",
-                     "content": [{"type": "text", "text": example['input']}]
-                 },
-                 {
-                     "role": "assistant",
-                     "content": [{"type": "text", "text": example['output']}]
-                 }
-             ]
-             formatted_text = processor.apply_chat_template(messages, add_generation_prompt=False)
-             inputs = processor(formatted_text, padding="max_length", truncation=True, max_length=4096, return_tensors="pt")
-             inputs["labels"] = inputs["input_ids"].clone()
-             return {k: v.squeeze(0) for k, v in inputs.items()}
-
-         # tokenize_data handles one example at a time, so map without batched=True
-         tokenized_dataset = dataset["train"].map(tokenize_data, remove_columns=dataset["train"].column_names)
-         # datasets stores mapped columns as plain lists; return them as torch tensors
-         tokenized_dataset.set_format("torch")
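-         # Note: labels here are a verbatim copy of input_ids, so loss is also
-         # computed over the user prompt and padding. A common refinement (not
-         # done here) is masking those positions with -100 so only the
-         # assistant's answer contributes to the loss.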
-
-         training_args = TrainingArguments(
-             output_dir="./fine_tuned_llama4_healthcare",
-             per_device_train_batch_size=2,
-             gradient_accumulation_steps=8,
-             eval_strategy="no",
-             save_strategy="epoch",
-             save_total_limit=2,
-             num_train_epochs=5,
-             learning_rate=2e-5,
-             weight_decay=0.01,
-             logging_dir="./logs",
-             logging_steps=10,
-             bf16=True,
-             gradient_checkpointing=True,
-             optim="adamw_torch",
-             warmup_steps=100,
-         )
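-         # Effective batch size: per_device_train_batch_size (2) x
-         # gradient_accumulation_steps (8) = 16 sequences per optimizer step.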
-
-         def custom_data_collator(features):
-             return {
-                 "input_ids": torch.stack([f["input_ids"] for f in features]),
-                 "attention_mask": torch.stack([f["attention_mask"] for f in features]),
-                 "labels": torch.stack([f["labels"] for f in features]),
-             }
-
-         trainer = Trainer(
-             model=model,
-             args=training_args,
-             train_dataset=tokenized_dataset,
-             data_collator=custom_data_collator,
-         )
-
-         trainer.train()
-         model.save_pretrained("./fine_tuned_llama4_healthcare")
-         processor.save_pretrained("./fine_tuned_llama4_healthcare")
-         return f"Training completed with {len(tokenized_dataset)} examples! Model saved to ./fine_tuned_llama4_healthcare"
-
-     except Exception as e:
-         return f"Error: {str(e)}. Please check file format, dependencies, or the LLama token."
-
- # Function to analyze uploaded document for fraud
- def analyze_document_ui(files):
-     try:
-         if not files:
-             return "Error: No file uploaded. Please upload a PDF to analyze."
-
-         # gr.File with the default file_count="single" passes one file, not a list
-         file = files[0] if isinstance(files, list) else files
-         if not file.name.endswith(".pdf"):
-             return "Error: Please upload a PDF file for analysis."
-
-         raw_text = ""
-         with pdfplumber.open(file.name) as pdf:
-             for page in pdf.pages:
-                 raw_text += page.extract_text() or ""
-
-         if not raw_text:
-             return "Error: Could not extract text from the PDF. The file may be corrupt or contain only images."
-
-         analyzer = HealthcareFraudAnalyzer(model, processor)
-         results = analyzer.analyze_document(raw_text)
-         return results["summary"]
-
-     except Exception as e:
-         return f"Error during document analysis: {str(e)}"
-
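- # Note: HealthcareFraudAnalyzer is provided by the local document_analyzer
- # module; the code above assumes its analyze_document() returns a dict with
- # a "summary" key.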
- # Gradio UI with training and analysis tabs
- with gr.Blocks(title="Healthcare Fraud Detection Suite") as demo:
-     gr.Markdown("# Healthcare Fraud Detection Suite")
-
-     with gr.Tabs():
-         with gr.TabItem("Fine-Tune Model"):
-             gr.Markdown("## Train Llama 4 for Healthcare Fraud Detection")
-             gr.Markdown("Upload PDFs (e.g., care logs, medication records) or a JSON file with training pairs.")
-             train_file_input = gr.File(label="Upload Files (PDF/JSON)", file_count="multiple")
-             train_button = gr.Button("Start Fine-Tuning")
-             train_output = gr.Textbox(label="Training Status", lines=5)
-             train_button.click(fn=train_ui, inputs=train_file_input, outputs=train_output)
-
-         with gr.TabItem("Analyze Document"):
-             gr.Markdown("## Analyze Document for Healthcare Fraud Indicators")
-             gr.Markdown("Upload a PDF document to analyze for potential fraud, neglect, or abuse indicators.")
-             analyze_file_input = gr.File(label="Upload PDF Document")
-             analyze_button = gr.Button("Analyze Document")
-             analyze_output = gr.Markdown(label="Analysis Results")
-             analyze_button.click(fn=analyze_document_ui, inputs=analyze_file_input, outputs=analyze_output)
-
-     gr.Markdown("""
-     ### About This Tool
-     This tool uses Llama 4 Maverick to identify patterns of potential fraud, neglect, and abuse in healthcare documentation.
-     The fine-tuning tab allows model customization with your examples or automatic extraction from documents.
-     The analysis tab scans documents for suspicious patterns, generating detailed reports.
-     **Note:** All analysis is performed locally; no data is shared externally.
-     """)
-
- # Launch the Gradio app
- demo.launch()
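- # On Hugging Face Spaces the default launch() is sufficient; for long-running
- # fine-tuning jobs, calling demo.queue() before launch() is a common way to
- # avoid request timeouts (an optional hardening step).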