Blaiseboy committed on
Commit
814c36a
·
verified ·
1 Parent(s): 60e17c6

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -591
app.py DELETED
@@ -1,591 +0,0 @@
1
- # BioGPT Medical Chatbot with Gradio Interface
2
-
3
- import gradio as gr
4
- import torch
5
- import warnings
6
- import numpy as np
7
- import faiss
8
- import os
9
- import re
10
- import time
11
- from datetime import datetime
12
- from typing import List, Dict, Optional, Tuple
13
- import json
14
-
15
- # Install required packages if not already installed
16
- try:
17
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
18
- from sentence_transformers import SentenceTransformer
19
- except ImportError:
20
- print("Installing required packages...")
21
- import subprocess
22
- import sys
23
-
24
- packages = [
25
- "transformers>=4.21.0",
26
- "torch>=1.12.0",
27
- "sentence-transformers",
28
- "faiss-cpu",
29
- "accelerate",
30
- "bitsandbytes",
31
- "datasets",
32
- "numpy",
33
- "sacremoses"
34
- ]
35
-
36
- for package in packages:
37
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
38
-
39
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
40
- from sentence_transformers import SentenceTransformer
41
-
42
- # Suppress warnings
43
- warnings.filterwarnings('ignore')
44
-
45
class GradioBioGPTChatbot:
    """Retrieval-augmented medical chatbot that backs the Gradio UI.

    An uploaded text file is split into chunks, the chunks are indexed
    (FAISS vector search when the embedding model and FAISS are usable,
    keyword overlap otherwise) and the best-matching chunks are passed to a
    causal language model to draft an answer.
    """

    # Splitting patterns, applied in this order by split_by_medical_sections.
    _SECTION_PATTERNS = (
        r'\n\s*(?:SYMPTOMS?|TREATMENT|DIAGNOSIS|CAUSES?|PREVENTION|MANAGEMENT).*?\n',
        r'\n\s*\d+\.\s+',
        r'\n\n+',
    )
    # Pieces with this many characters or fewer are dropped while sectioning.
    _MIN_SECTION_CHARS = 100

    # A sentence ends at ., ! or ? *followed by whitespace*, so decimals such
    # as "2.5 ml" or "38.5" are never cut in half.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
    _WORD = re.compile(r'\w+')

    # First matching category wins (dict order is the priority order).
    _FOCUS_KEYWORDS = {
        'pediatric_symptoms': ['fever', 'cough', 'rash', 'vomiting', 'diarrhea'],
        'treatments': ['treatment', 'therapy', 'medication', 'antibiotics'],
        'diagnosis': ['diagnosis', 'diagnostic', 'symptoms', 'signs'],
        'emergency': ['emergency', 'urgent', 'serious', 'hospital'],
        'prevention': ['prevention', 'vaccine', 'immunization', 'avoid'],
    }
    _BOOSTED_FOCUSES = frozenset({'pediatric_symptoms', 'treatments', 'diagnosis'})

    # A generated sentence whose last word is one of these was cut off.
    _DANGLING_WORDS = frozenset({'and', 'or', 'but', 'however'})

    _GREETING_PHRASES = ('hello', 'hi', 'hey', 'good morning', 'good afternoon')
    _THANKS_PHRASES = ('thank you', 'thanks', 'thx', 'appreciate', 'appreciated')
    _GOODBYE_PHRASES = ('bye', 'goodbye', 'see you', 'farewell')
    _HELP_PHRASES = ('help', 'what can you do', 'how do you work')

    _HELP_TEXT = (
        "🤖 **BioGPT Medical Assistant**\n"
        "\n"
        "I'm an AI medical assistant that can help with:\n"
        "• Pediatric medicine and children's health\n"
        "• Medical symptoms and conditions\n"
        "• Treatment information\n"
        "• When to seek medical care\n"
        "\n"
        "**How to use:**\n"
        "1. Upload your medical data file using the file upload above\n"
        "2. Ask specific medical questions\n"
        "3. Get evidence-based medical information\n"
        "\n"
        "⚠️ **Important:** I provide educational information only. "
        "Always consult healthcare professionals for medical advice."
    )

    def __init__(self, use_gpu=True, use_8bit=True):
        """Initialize BioGPT chatbot for Gradio deployment.

        Args:
            use_gpu: Use CUDA when it is available.
            use_8bit: Load the model with 8-bit quantization. Only honoured
                when the model actually runs on CUDA.
        """
        self.device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"
        # Quantization only makes sense on the device we really use; the
        # previous check ignored use_gpu and could request 8-bit on CPU.
        self.use_8bit = use_8bit and self.device == "cuda"

        # Conversation tracking
        self.conversation_history = []
        self.knowledge_chunks = []
        self.is_data_loaded = False

        # Initialize components
        self.setup_embeddings()
        self.setup_faiss_index()
        self.setup_biogpt()

    # ------------------------------------------------------------------
    # Component setup
    # ------------------------------------------------------------------
    def setup_embeddings(self):
        """Load the sentence-embedding model; disable embeddings on failure."""
        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
            self.use_embeddings = True
        except Exception as e:
            print(f"Embeddings setup failed: {e}")
            self.embedding_model = None
            self.embedding_dim = 384
            self.use_embeddings = False

    def setup_faiss_index(self):
        """Create the inner-product FAISS index; disable it on failure."""
        try:
            self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)
            self.faiss_ready = True
        except Exception as e:
            print(f"FAISS setup failed: {e}")
            self.faiss_index = None
            self.faiss_ready = False

    def setup_biogpt(self):
        """Load BioGPT-Large, falling back to a smaller model on any error."""
        model_name = "microsoft/BioGPT-Large"

        try:
            # Setup quantization config for memory efficiency
            quantization_config = None
            if self.use_8bit:
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_threshold=6.0,
                    llm_int8_has_fp16_weight=False,
                )

            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            on_cuda = self.device == "cuda"
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                torch_dtype=torch.float16 if on_cuda else torch.float32,
                device_map="auto" if on_cuda else None,
                trust_remote_code=True,
            )

            # Quantized models are placed by the loader; only move the
            # unquantized model explicitly.
            if on_cuda and quantization_config is None:
                self.model = self.model.to(self.device)

        except Exception as e:
            print(f"BioGPT loading failed: {e}. Using fallback model...")
            self.setup_fallback_model()

    def setup_fallback_model(self):
        """Load DialoGPT-medium; leave model/tokenizer as None if that fails."""
        try:
            fallback_model = "microsoft/DialoGPT-medium"
            self.tokenizer = AutoTokenizer.from_pretrained(fallback_model)
            self.model = AutoModelForCausalLM.from_pretrained(fallback_model)

            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            if self.device == "cuda":
                self.model = self.model.to(self.device)

        except Exception as e:
            print(f"All models failed: {e}")
            self.model = None
            self.tokenizer = None

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------
    @classmethod
    def _split_sentences(cls, text: str) -> List[str]:
        """Split text into sentences, keeping their closing punctuation."""
        return [part.strip() for part in cls._SENTENCE_BOUNDARY.split(text) if part.strip()]

    @staticmethod
    def _with_final_stop(sentence: str) -> str:
        """Return the sentence, adding a period if it has no closing mark."""
        return sentence if sentence.endswith(('.', '!', '?')) else sentence + '.'

    @staticmethod
    def _mentions_any(text: str, phrases) -> bool:
        """True if any phrase occurs in text as whole words (not substrings)."""
        return any(
            re.search(r'\b' + re.escape(phrase) + r'\b', text) for phrase in phrases
        )

    # ------------------------------------------------------------------
    # Chunking
    # ------------------------------------------------------------------
    def create_medical_chunks(self, text: str, chunk_size: int = 400) -> List[Dict]:
        """Split text into chunks of fewer than chunk_size words where possible.

        Sections at or under the limit become one chunk each. Larger sections
        are packed sentence by sentence; a single sentence longer than the
        limit is kept whole rather than cut.

        Returns:
            A list of dicts with keys 'id' (running index), 'text' and
            'medical_focus'.
        """
        chunks: List[Dict] = []

        def emit(chunk_text: str) -> None:
            chunk_text = chunk_text.strip()
            if chunk_text:
                chunks.append({
                    'id': len(chunks),
                    'text': chunk_text,
                    'medical_focus': self.identify_medical_focus(chunk_text),
                })

        for section in self.split_by_medical_sections(text):
            if len(section.split()) <= chunk_size:
                emit(section)
                continue

            pending: List[str] = []
            pending_words = 0
            for sentence in self._split_sentences(section):
                sentence_words = len(sentence.split())
                # Flush before the running chunk would reach the limit.
                if pending and pending_words + sentence_words >= chunk_size:
                    emit(' '.join(pending))
                    pending, pending_words = [], 0
                pending.append(sentence)
                pending_words += sentence_words
            emit(' '.join(pending))

        return chunks

    def split_by_medical_sections(self, text: str) -> List[str]:
        """Split text on section headings, numbered items and blank lines.

        Pieces of 100 characters or fewer are discarded at every stage. If
        that leaves nothing, the whole stripped text is returned as a single
        section so short documents are not silently lost.
        """
        sections = [text]
        for pattern in self._SECTION_PATTERNS:
            sections = [
                piece.strip()
                for section in sections
                for piece in re.split(pattern, section, flags=re.IGNORECASE)
                if len(piece.strip()) > self._MIN_SECTION_CHARS
            ]

        if not sections:
            whole = text.strip()
            return [whole] if whole else []
        return sections

    def identify_medical_focus(self, text: str) -> str:
        """Return the first category whose keyword occurs in the text.

        Matching is case-insensitive substring matching; 'general_medical'
        is returned when no keyword is found.
        """
        text_lower = text.lower()
        for category, keywords in self._FOCUS_KEYWORDS.items():
            if any(keyword in text_lower for keyword in keywords):
                return category
        return 'general_medical'

    # ------------------------------------------------------------------
    # Knowledge base loading
    # ------------------------------------------------------------------
    def load_medical_data_from_file(self, file_path: str) -> Tuple[str, bool]:
        """Read a UTF-8 text file and replace the knowledge base with it.

        Returns:
            (status message, success flag). Nothing is replaced when the
            file is missing, unreadable or yields no chunks.
        """
        if not file_path or not os.path.exists(file_path):
            return "❌ No file uploaded or file not found.", False

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            chunks = self.create_medical_chunks(text)
            if not chunks:
                return "❌ Error loading file: no usable text found in the file.", False

            self.knowledge_chunks = chunks

            # The index must mirror knowledge_chunks one-to-one. Drop vectors
            # from any earlier upload, otherwise search ids point at the
            # wrong (or non-existent) chunks.
            if self.faiss_ready and self.faiss_index is not None:
                self.faiss_index.reset()

            if self.use_embeddings and self.embedding_model and self.faiss_ready:
                if self.generate_embeddings_and_index(chunks):
                    self.is_data_loaded = True
                    return f"✅ Medical data loaded successfully! {len(chunks)} chunks processed with vector search.", True

            self.is_data_loaded = True
            return f"✅ Medical data loaded successfully! {len(chunks)} chunks processed (keyword search mode).", True

        except Exception as e:
            return f"❌ Error loading file: {str(e)}", False

    def generate_embeddings_and_index(self, chunks: List[Dict]) -> bool:
        """Embed the chunk texts and append them to the FAISS index.

        Returns:
            True on success, False if embedding or indexing raised.
        """
        try:
            texts = [chunk['text'] for chunk in chunks]
            embeddings = self.embedding_model.encode(texts, show_progress_bar=False)
            # FAISS expects a float32 matrix.
            self.faiss_index.add(np.asarray(embeddings, dtype=np.float32))
            return True
        except Exception as e:
            print(f"Embedding generation failed: {e}")
            return False

    # ------------------------------------------------------------------
    # Retrieval
    # ------------------------------------------------------------------
    def retrieve_medical_context(self, query: str, n_results: int = 3) -> List[str]:
        """Return up to n_results chunk texts relevant to the query.

        Vector search is tried first; keyword search is used when it is
        unavailable, fails, or returns nothing.
        """
        vector_search_ok = (
            self.use_embeddings
            and self.embedding_model
            and self.faiss_ready
            and self.knowledge_chunks
        )
        if vector_search_ok:
            try:
                query_embedding = np.asarray(
                    self.embedding_model.encode([query]), dtype=np.float32
                )
                _, indices = self.faiss_index.search(query_embedding, n_results)
                total = len(self.knowledge_chunks)
                # FAISS pads missing hits with -1; also ignore stale ids.
                context_chunks = [
                    self.knowledge_chunks[i]['text']
                    for i in indices[0]
                    if 0 <= i < total
                ]
                if context_chunks:
                    return context_chunks
            except Exception as e:
                print(f"Embedding search failed: {e}")

        # Fallback to keyword search
        return self.keyword_search_medical(query, n_results)

    def keyword_search_medical(self, query: str, n_results: int) -> List[str]:
        """Rank chunks by the share of query words they contain.

        Words are compared case-insensitively with punctuation ignored.
        Chunks about symptoms, treatments or diagnosis get a +0.5 boost, but
        only if they share at least one word with the query, so unrelated
        chunks are never returned. Ties keep the original chunk order.
        """
        if not self.knowledge_chunks or n_results <= 0:
            return []

        query_words = set(self._WORD.findall(query.lower()))
        if not query_words:
            return []

        scored = []
        for position, chunk_info in enumerate(self.knowledge_chunks):
            chunk_text = chunk_info['text']
            chunk_words = set(self._WORD.findall(chunk_text.lower()))

            overlap = len(query_words & chunk_words)
            if overlap == 0:
                continue

            score = overlap / len(query_words)
            if chunk_info.get('medical_focus') in self._BOOSTED_FOCUSES:
                score += 0.5
            scored.append((score, position, chunk_text))

        scored.sort(key=lambda item: (-item[0], item[1]))
        return [chunk_text for _, _, chunk_text in scored[:n_results]]

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------
    def generate_biogpt_response(self, context: str, query: str) -> str:
        """Generate an answer from the context (first 800 chars) and query.

        Falls back to fallback_response if generation raises.
        """
        if not self.model or not self.tokenizer:
            return "Medical model not available. Please check the setup."

        try:
            prompt = (
                f"Medical Context: {context[:800]}\n"
                "\n"
                f"Question: {query}\n"
                "\n"
                "Medical Answer:"
            )

            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024,
            )

            if self.device == "cuda":
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=150,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.1,
                )

            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # The decoded text echoes the prompt; keep only the answer part.
            if "Medical Answer:" in full_response:
                generated_response = full_response.split("Medical Answer:")[-1].strip()
            else:
                generated_response = full_response[len(prompt):].strip()

            return self.clean_medical_response(generated_response)

        except Exception as e:
            print(f"BioGPT generation failed: {e}")
            return self.fallback_response(context, query)

    def clean_medical_response(self, response: str) -> str:
        """Keep at most the first three usable sentences of a model reply.

        A sentence is skipped when it is 10 characters or shorter, or when
        its last *word* is a conjunction (a sign it was cut off). If nothing
        survives, the raw reply is returned, truncated to 200 characters.
        """
        kept = []
        for sentence in self._split_sentences(response):
            body = sentence.rstrip('.!?').strip()
            if len(body) <= 10:
                continue
            words = body.split()
            # Compare the whole last word: a suffix test would also reject
            # sentences ending in e.g. "doctor" or "hand".
            if words[-1].strip(',;:').lower() in self._DANGLING_WORDS:
                continue
            kept.append(self._with_final_stop(sentence))
            if len(kept) >= 3:
                break

        if kept:
            return ' '.join(kept)
        return response[:200] + '...' if len(response) > 200 else response

    def fallback_response(self, context: str, query: str) -> str:
        """Answer with the first two substantial sentences of the context.

        Used when generation fails. If no sentence is longer than 20
        characters, the first 300 characters of the context are returned.
        """
        sentences = [
            sentence
            for sentence in self._split_sentences(context)
            if len(sentence.rstrip('.!?').strip()) > 20
        ]

        if sentences:
            return ' '.join(self._with_final_stop(s) for s in sentences[:2])
        return context[:300] + ('...' if len(context) > 300 else '')

    # ------------------------------------------------------------------
    # Chat
    # ------------------------------------------------------------------
    def handle_conversational_interactions(self, query: str) -> Optional[str]:
        """Return a canned reply for greetings, thanks, goodbyes and help.

        Phrases are matched as whole words, so a question such as
        "fever in children" (which merely contains the letters "hi") is not
        mistaken for a greeting. Returns None for every other message.
        """
        query_lower = query.lower().strip()

        # Greetings
        if self._mentions_any(query_lower, self._GREETING_PHRASES):
            return "👋 Hello! I'm BioGPT, your medical AI assistant specialized in pediatric medicine. Please upload your medical data file first, then ask me any health-related questions!"

        # Thanks
        if self._mentions_any(query_lower, self._THANKS_PHRASES):
            return "🙏 You're welcome! I'm glad I could help. Remember to always consult healthcare professionals for medical decisions. Feel free to ask more questions!"

        # Goodbyes
        if self._mentions_any(query_lower, self._GOODBYE_PHRASES):
            return "👋 Goodbye! Take care of yourself and your family. Stay healthy! 🏥"

        # Help/About
        if self._mentions_any(query_lower, self._HELP_PHRASES):
            return self._HELP_TEXT

        return None

    def chat_interface(self, message: str, history: List[List[str]]) -> Tuple[str, List[List[str]]]:
        """Handle one chat turn for Gradio.

        Args:
            message: The user's text.
            history: List of [user, bot] pairs; None is treated as empty.

        Returns:
            ("", updated history) - the empty string clears the input box.
        """
        if history is None:
            history = []

        if not message or not message.strip():
            return "", history

        # Greetings and help work before any upload: their replies are what
        # tells the user to upload a file in the first place.
        conversational_response = self.handle_conversational_interactions(message)
        if conversational_response:
            history.append([message, conversational_response])
            return "", history

        # Check if data is loaded
        if not self.is_data_loaded:
            response = "⚠️ Please upload your medical data file first using the file upload above before asking questions."
            history.append([message, response])
            return "", history

        # Process medical query
        context = self.retrieve_medical_context(message)

        if not context:
            response = "I don't have specific information about this topic in my medical database. Please consult with a healthcare professional for personalized medical advice."
        else:
            main_context = '\n\n'.join(context)
            medical_response = self.generate_biogpt_response(main_context, message)
            response = f"🩺 **Medical Information:** {medical_response}\n\n⚠️ **Important:** This information is for educational purposes only. Always consult with qualified healthcare professionals for medical diagnosis, treatment, and personalized advice."

        # Add to conversation history
        self.conversation_history.append({
            'query': message,
            'response': response,
            'timestamp': datetime.now().isoformat(),
        })

        history.append([message, response])
        return "", history
450
-
451
# Build the single module-level chatbot instance used by the upload handler
# and the Gradio event callbacks below.
print("🚀 Initializing BioGPT Medical Chatbot...")
chatbot = GradioBioGPTChatbot(
    use_gpu=True,
    use_8bit=True,
)
454
-
455
def upload_and_process_file(file):
    """Load an uploaded file into the chatbot and return the status message.

    Args:
        file: What the Gradio File component hands over: None when nothing
            is uploaded, a path (str or os.PathLike), or an object that
            exposes the path as its ``name`` attribute.

    Returns:
        The status text produced by ``chatbot.load_medical_data_from_file``,
        or a "no file" message when no path can be determined.
    """
    if file is None:
        return "❌ No file uploaded."

    # Accept both plain paths and file-like wrappers; the original assumed
    # ``file.name`` and raised AttributeError when given a path string.
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = getattr(file, "name", None)

    if not file_path:
        return "❌ No file uploaded."

    message, _success = chatbot.load_medical_data_from_file(file_path)
    return message
462
-
463
- # Create Gradio Interface
464
def create_gradio_interface():
    """Build the Gradio Blocks UI and return it.

    The layout is a header, an upload column (file picker, status box and
    usage notes), a chat column (chat window, question box, send and clear
    buttons) and a disclaimer footer. The interface is only constructed
    here; launching is left to the caller.
    """
    page_css = """
    .gradio-container {
        max-width: 1200px !important;
    }
    .chat-message {
        border-radius: 10px !important;
    }
    """

    header_html = """
    <div style="text-align: center; padding: 20px;">
        <h1>🏥 BioGPT Medical Assistant</h1>
        <p style="font-size: 18px; color: #666;">
            Professional AI Medical Chatbot powered by BioGPT-Large
        </p>
        <p style="color: #888;">
            ⚠️ For educational purposes only. Always consult healthcare professionals for medical advice.
        </p>
    </div>
    """

    usage_html = """
    <div style="margin-top: 20px; padding: 15px; background-color: #f0f8ff; border-radius: 10px;">
        <h4>💡 How to Use:</h4>
        <ol>
            <li>Upload your medical text file (.txt format)</li>
            <li>Wait for processing confirmation</li>
            <li>Start asking medical questions!</li>
        </ol>

        <h4>📝 Example Questions:</h4>
        <ul>
            <li>"What causes fever in children?"</li>
            <li>"How to treat a persistent cough?"</li>
            <li>"When should I call the doctor?"</li>
            <li>"Signs of dehydration in infants?"</li>
        </ul>
    </div>
    """

    disclaimer_html = """
    <div style="text-align: center; margin-top: 30px; padding: 20px; background-color: #fff3cd; border-radius: 10px;">
        <h4>⚠️ Medical Disclaimer</h4>
        <p>This AI assistant provides educational medical information only and is not a substitute for professional medical advice, diagnosis, or treatment. Always seek the advice of qualified healthcare providers with questions about medical conditions.</p>
    </div>
    """

    def reset_chat():
        # Empty chat history and an empty question box.
        return [], ""

    with gr.Blocks(
        title="🏥 BioGPT Medical Assistant",
        theme=gr.themes.Soft(),
        css=page_css,
    ) as demo:

        gr.HTML(header_html)

        with gr.Row():
            # Left column: data upload and usage notes.
            with gr.Column(scale=1):
                gr.HTML("<h3>📁 Upload Medical Data</h3>")
                file_upload = gr.File(
                    label="Upload Medical Text File (.txt)",
                    file_types=[".txt"],
                    type="file",
                )
                upload_status = gr.Textbox(
                    label="Upload Status",
                    value="📋 Please upload your medical data file to begin...",
                    interactive=False,
                    lines=3,
                )

                gr.HTML(usage_html)

            # Right column: the conversation itself.
            with gr.Column(scale=2):
                gr.HTML("<h3>💬 Medical Consultation</h3>")
                chatbot_interface = gr.Chatbot(
                    label="BioGPT Medical Chat",
                    height=500,
                    bubble_full_width=False,
                )

                msg_input = gr.Textbox(
                    label="Your Medical Question",
                    placeholder="Ask me about pediatric health, symptoms, treatments, or when to seek care...",
                    lines=2,
                )

                with gr.Row():
                    send_btn = gr.Button("🩺 Send Question", variant="primary")
                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

        # Event handlers
        file_upload.change(
            fn=upload_and_process_file,
            inputs=[file_upload],
            outputs=[upload_status],
        )

        # Pressing Enter and clicking the button run the same chat turn.
        chat_turn = dict(
            fn=chatbot.chat_interface,
            inputs=[msg_input, chatbot_interface],
            outputs=[msg_input, chatbot_interface],
        )
        msg_input.submit(**chat_turn)
        send_btn.click(**chat_turn)

        clear_btn.click(
            fn=reset_chat,
            outputs=[chatbot_interface, msg_input],
        )

        gr.HTML(disclaimer_html)

    return demo
576
-
577
if __name__ == "__main__":
    # Create the Gradio interface, then launch it.
    demo = create_gradio_interface()

    for banner_line in (
        "🌐 Launching Gradio interface...",
        "📋 Upload your medical data file and start chatting!",
    ):
        print(banner_line)

    # Launch with public sharing (set share to False for local only).
    launch_options = {
        "share": True,             # Set to False for local deployment only
        "server_name": "0.0.0.0",  # Allow external access
        "server_port": 7860,       # Default Gradio port
        "show_error": True,
        "debug": True,
    }
    demo.launch(**launch_options)