# Hugging Face Space: Blaiseboy/BioGPT-chatbot (status: Sleeping)
# File size: 18,030 Bytes
# Commit: f1ca076

import os
import re
import torch
import warnings
import numpy as np
import faiss
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Optional
import time
from datetime import datetime

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

class ColabBioGPTChatbot:
    def __init__(self, use_gpu=True, use_8bit=True):
        """Initialize BioGPT chatbot optimized for Hugging Face Spaces"""
        print("🏥 Initializing Medical Chatbot...")
        self.use_gpu = use_gpu
        self.use_8bit = use_8bit
        self.device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"
        print(f"🖥️ Using device: {self.device}")
        
        self.tokenizer = None
        self.model = None
        self.knowledge_chunks = []
        self.conversation_history = []
        self.embedding_model = None
        self.faiss_index = None
        self.faiss_ready = False
        self.use_embeddings = True
        
        # Initialize components
        self.setup_biogpt()
        self.load_sentence_transformer()
        
    def setup_biogpt(self):
        """Setup BioGPT model with fallback to base BioGPT if Large fails"""
        print("🧠 Loading BioGPT model...")
        
        try:
            # Try BioGPT-Large first
            model_name = "microsoft/BioGPT-Large"
            print(f"Attempting to load {model_name}...")
            
            if self.use_8bit and self.device == "cuda":
                quantization_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_threshold=6.0,
                    llm_int8_has_fp16_weight=False,
                )
            else:
                quantization_config = None
            
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            
            if self.device == "cuda" and quantization_config is None:
                self.model = self.model.to(self.device)
                
            print("✅ BioGPT-Large loaded successfully!")
            
        except Exception as e:
            print(f"❌ BioGPT-Large loading failed: {e}")
            print("🔁 Falling back to base BioGPT...")
            self.setup_fallback_biogpt()
    
    def setup_fallback_biogpt(self):
        """Fallback to microsoft/BioGPT if BioGPT-Large fails"""
        try:
            model_name = "microsoft/BioGPT"
            print(f"Loading fallback model: {model_name}")
            
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            
            if self.device == "cuda":
                self.model = self.model.to(self.device)
                
            print("✅ Base BioGPT model loaded successfully!")
            
        except Exception as e:
            print(f"❌ Failed to load fallback BioGPT: {e}")
            self.model = None
            self.tokenizer = None
    
    def load_sentence_transformer(self):
        """Load sentence transformer for embeddings"""
        try:
            print("🔮 Loading sentence transformer...")
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
            
            # Initialize FAISS index (will be populated when data is loaded)
            embedding_dim = 384  # Dimension for all-MiniLM-L6-v2
            self.faiss_index = faiss.IndexFlatL2(embedding_dim)
            self.faiss_ready = True
            print("✅ Sentence transformer and FAISS index ready!")
            
        except Exception as e:
            print(f"❌ Failed to load sentence transformer: {e}")
            self.use_embeddings = False
            self.faiss_ready = False

    def load_medical_data(self, file_path):
        """Load and process medical data"""
        print(f"📖 Loading medical data from {file_path}...")
        
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File {file_path} not found")
                
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            print(f"📄 File loaded: {len(text):,} characters")
            
        except Exception as e:
            print(f"❌ Error loading file: {e}")
            raise ValueError(f"Failed to load medical data: {e}")
        
        # Create chunks
        print("📝 Creating medical chunks...")
        chunks = self.create_medical_chunks(text)
        print(f"📋 Created {len(chunks)} medical chunks")
        
        self.knowledge_chunks = chunks
        
        # Generate embeddings if available
        if self.use_embeddings and self.embedding_model and self.faiss_ready:
            try:
                self.generate_embeddings_with_progress(chunks)
                print("✅ Medical data loaded with embeddings!")
            except Exception as e:
                print(f"⚠️ Embedding generation failed: {e}")
                print("✅ Medical data loaded (keyword search mode)")
        else:
            print("✅ Medical data loaded (keyword search mode)")
    
    def create_medical_chunks(self, text: str, chunk_size: int = 400) -> List[Dict]:
        """Create medically-optimized text chunks"""
        chunks = []
        
        # Split by paragraphs first
        paragraphs = [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]
        
        chunk_id = 0
        for paragraph in paragraphs:
            if len(paragraph.split()) <= chunk_size:
                chunks.append({
                    'id': chunk_id,
                    'text': paragraph,
                    'medical_focus': self.identify_medical_focus(paragraph)
                })
                chunk_id += 1
            else:
                # Split large paragraphs by sentences
                sentences = re.split(r'[.!?]+', paragraph)
                current_chunk = ""
                
                for sentence in sentences:
                    sentence = sentence.strip()
                    if not sentence:
                        continue
                    
                    if len(current_chunk.split()) + len(sentence.split()) <= chunk_size:
                        current_chunk += sentence + ". "
                    else:
                        if current_chunk.strip():
                            chunks.append({
                                'id': chunk_id,
                                'text': current_chunk.strip(),
                                'medical_focus': self.identify_medical_focus(current_chunk)
                            })
                            chunk_id += 1
                        current_chunk = sentence + ". "
                
                if current_chunk.strip():
                    chunks.append({
                        'id': chunk_id,
                        'text': current_chunk.strip(),
                        'medical_focus': self.identify_medical_focus(current_chunk)
                    })
                    chunk_id += 1
        
        return chunks
    
    def identify_medical_focus(self, text: str) -> str:
        """Label ``text`` with the first category whose keyword occurs in it.

        Matching is a case-insensitive substring test. Categories are checked
        in the order listed below and the first hit wins; text with no hit is
        labelled ``'general_medical'``.
        """
        lowered = text.lower()

        keyword_table = (
            ('pediatric_symptoms', ('fever', 'cough', 'rash', 'vomiting', 'diarrhea')),
            ('treatments', ('treatment', 'therapy', 'medication', 'antibiotics')),
            ('diagnosis', ('diagnosis', 'diagnostic', 'symptoms', 'signs')),
            ('emergency', ('emergency', 'urgent', 'serious', 'hospital')),
            ('prevention', ('prevention', 'vaccine', 'immunization', 'avoid')),
        )

        for label, terms in keyword_table:
            for term in terms:
                if term in lowered:
                    return label

        return 'general_medical'
    
    def generate_embeddings_with_progress(self, chunks: List[Dict]):
        """Generate embeddings and add to FAISS index"""
        print("🔮 Generating embeddings...")
        
        try:
            texts = [chunk['text'] for chunk in chunks]
            
            # Generate embeddings in batches
            batch_size = 32
            all_embeddings = []
            
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i+batch_size]
                batch_embeddings = self.embedding_model.encode(batch_texts, show_progress_bar=False)
                all_embeddings.extend(batch_embeddings)
                
                progress = min(i + batch_size, len(texts))
                print(f"   Progress: {progress}/{len(texts)} chunks processed", end='\r')
            
            print(f"\n   ✅ Generated embeddings for {len(texts)} chunks")
            
            # Add to FAISS index
            embeddings_array = np.array(all_embeddings).astype('float32')
            self.faiss_index.add(embeddings_array)
            print("✅ Embeddings added to FAISS index!")
            
        except Exception as e:
            print(f"❌ Embedding generation failed: {e}")
            raise
    
    def retrieve_medical_context(self, query: str, n_results: int = 3) -> List[str]:
        """Retrieve relevant medical context"""
        if self.use_embeddings and self.embedding_model and self.faiss_ready and self.faiss_index.ntotal > 0:
            try:
                # Generate query embedding
                query_embedding = self.embedding_model.encode([query])
                
                # Search FAISS index
                distances, indices = self.faiss_index.search(
                    np.array(query_embedding).astype('float32'), 
                    min(n_results, self.faiss_index.ntotal)
                )
                
                # Get relevant chunks
                context_chunks = []
                for idx in indices[0]:
                    if idx != -1 and idx < len(self.knowledge_chunks):
                        context_chunks.append(self.knowledge_chunks[idx]['text'])
                
                if context_chunks:
                    return context_chunks
                    
            except Exception as e:
                print(f"⚠️ Embedding search failed: {e}")
        
        # Fallback to keyword search
        return self.keyword_search_medical(query, n_results)
    
    def keyword_search_medical(self, query: str, n_results: int) -> List[str]:
        """Medical-focused keyword search"""
        if not self.knowledge_chunks:
            return []
        
        query_words = set(query.lower().split())
        chunk_scores = []
        
        for chunk_info in self.knowledge_chunks:
            chunk_text = chunk_info['text']
            chunk_words = set(chunk_text.lower().split())
            
            # Calculate relevance score
            word_overlap = len(query_words.intersection(chunk_words))
            base_score = word_overlap / len(query_words) if query_words else 0
            
            # Boost medical content
            medical_boost = 0
            if chunk_info.get('medical_focus') in ['pediatric_symptoms', 'treatments', 'diagnosis']:
                medical_boost = 0.3
            
            final_score = base_score + medical_boost
            
            if final_score > 0:
                chunk_scores.append((final_score, chunk_text))
        
        # Return top matches
        chunk_scores.sort(reverse=True)
        return [chunk for _, chunk in chunk_scores[:n_results]]
    
    def generate_biogpt_response(self, context: str, query: str) -> str:
        """Generate medical response using BioGPT"""
        if not self.model or not self.tokenizer:
            return "Medical model not available. Please check the setup."
        
        try:
            # Create medical prompt
            prompt = f"""Medical Context: {context[:800]}

Question: {query}

Medical Answer:"""
            
            # Tokenize
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=1024
            )
            
            # Move to device
            if self.device == "cuda":
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=150,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id,
                    repetition_penalty=1.1
                )
            
            # Decode response
            full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract generated part
            if "Medical Answer:" in full_response:
                generated_response = full_response.split("Medical Answer:")[-1].strip()
            else:
                generated_response = full_response[len(prompt):].strip()
            
            return self.clean_medical_response(generated_response)
            
        except Exception as e:
            print(f"⚠️ BioGPT generation failed: {e}")
            return self.fallback_response(context, query)
    
    def clean_medical_response(self, response: str) -> str:
        """Trim a generated answer to at most three complete-looking sentences.

        Sentences are split where ``.``, ``!`` or ``?`` is followed by
        whitespace, so decimals such as ``2.5 mg`` are not cut apart. A
        sentence is kept when it is longer than 10 characters and its last
        *word* is not a dangling conjunction (``and``, ``or``, ``but``,
        ``however``). The check is on the whole word, so sentences ending in
        words like "doctor" are kept. Kept sentences are joined with
        ``'. '`` and terminated with ``'.'``.

        Args:
            response: Raw text produced by the model.

        Returns:
            The cleaned text. If no sentence qualifies, ``response`` itself,
            cut to 200 characters plus ``'...'`` when longer than that.
        """
        dangling_words = {'and', 'or', 'but', 'however'}
        clean_sentences = []

        for raw_sentence in re.split(r'(?<=[.!?])\s+', response.strip()):
            # Drop the terminator; the join below adds a uniform '.'.
            sentence = raw_sentence.strip().rstrip('.!?').strip()
            if len(sentence) <= 10:
                continue

            last_word = sentence.split()[-1].strip(',;:').lower()
            if last_word in dangling_words:
                continue

            clean_sentences.append(sentence)
            if len(clean_sentences) >= 3:
                break

        if clean_sentences:
            return '. '.join(clean_sentences) + '.'

        return response[:200] + '...' if len(response) > 200 else response
    
    def fallback_response(self, context: str, query: str) -> str:
        """Build an answer directly from ``context`` without the model.

        ``context`` is split on ``'.'``; the first one or two pieces longer
        than 20 characters are returned, each terminated with a period. If no
        piece qualifies, the first 300 characters of ``context`` followed by
        ``'...'`` are returned. ``query`` is accepted but not used.
        """
        usable = []
        for piece in context.split('.'):
            piece = piece.strip()
            if len(piece) > 20:
                usable.append(piece)

        if not usable:
            return context[:300] + '...'

        # At most the first two usable sentences, each ending with a period.
        return ' '.join(f"{sentence}." for sentence in usable[:2])
    
    def handle_conversational_interactions(self, query: str) -> Optional[str]:
        """Return a canned reply for greetings, thanks and goodbyes, else ``None``.

        Phrases are matched case-insensitively as whole words anywhere in the
        query. Whole-word matching matters: a plain substring test treats
        "child", "this" or "they" as the greetings "hi" / "hey" and would
        answer real medical questions with a greeting.

        Args:
            query: The raw user message.

        Returns:
            The reply for the first matching group (greetings, then thanks,
            then goodbyes), or ``None`` when nothing matches.
        """
        query_lower = query.lower().strip()

        def mentions(phrases) -> bool:
            # \b anchors keep each phrase from matching inside a longer word.
            pattern = r'\b(?:' + '|'.join(re.escape(p) for p in phrases) + r')\b'
            return re.search(pattern, query_lower) is not None

        # Greetings
        if mentions(['hello', 'hi', 'hey', 'good morning', 'good afternoon']):
            return "👋 Hello! I'm your pediatric medical AI assistant. How can I help you with medical questions today?"

        # Thanks
        if mentions(['thank you', 'thanks', 'thx']):
            return "🙏 You're welcome! I'm glad I could help. Remember to consult healthcare professionals for medical decisions. What else can I help you with?"

        # Goodbyes
        if mentions(['bye', 'goodbye', 'see you later']):
            return "👋 Goodbye! Take care and remember to consult healthcare professionals for any medical concerns. Stay healthy!"

        return None
    
    def chat(self, query: str) -> str:
        """Main chat function"""
        if not query.strip():
            return "Hello! I'm your pediatric medical AI assistant. How can I help you today?"
        
        # Handle conversational interactions
        conversational_response = self.handle_conversational_interactions(query)
        if conversational_response:
            return conversational_response
        
        if not self.knowledge_chunks:
            return "Please load medical data first to access the medical knowledge base."
        
        if not self.model or not self.tokenizer:
            return "Medical model not available. Please check the setup and try again."
        
        # Retrieve context
        context = self.retrieve_medical_context(query)
        
        if not context:
            return "I don't have specific information about this topic in my medical database. Please consult with a healthcare professional for personalized medical advice."
        
        # Generate response
        main_context = '\n\n'.join(context)
        response = self.generate_biogpt_response(main_context, query)
        
        # Format final response
        final_response = f"🩺 **Medical Information:** {response}\n\n⚠️ **Important:** This information is for educational purposes only. Always consult with qualified healthcare professionals for medical diagnosis, treatment, and personalized advice."
        
        return final_response