import os
import torch
from transformers import AutoTokenizer, PreTrainedTokenizerFast, AutoConfig
from torch.nn import functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SentenceEmbeddingModel(torch.nn.Module):
    """
    Sentence Embedding model for inference
    """
    def __init__(self, config):
        super(SentenceEmbeddingModel, self).__init__()
        
        # Create transformer model from config
        from transformers import AutoModel
        self.transformer = AutoModel.from_config(config)
        self.pooling_mode = 'mean'
        
    def forward(self, input_ids, attention_mask):
        # Get sequence outputs from transformer
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # Mean pooling
        token_embeddings = outputs[0]  # First element of model_output contains token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        
        # Sum embeddings
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        
        # Sum mask
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        
        # Pool
        pooled_output = sum_embeddings / sum_mask
        
        # Normalize
        pooled_output = F.normalize(pooled_output, p=2, dim=1)
        
        return pooled_output

class SentenceEmbedder:
    def __init__(self, model_path):
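        """
        Load the tokenizer, config, and trained weights from model_path,
        trying several tokenizer sources in turn (AutoTokenizer, a
        SentencePiece model, tokenizer.json, then any tokenizer-like file
        found in the directory).
        """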
        # Load saved model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        # Improved tokenizer loading with more robust error handling
        tokenizer_loaded = False
        
        # 1. Try AutoTokenizer first (most general approach)
        if not tokenizer_loaded:
            try:
                print(f"Trying AutoTokenizer from {model_path}")
                self.tokenizer = AutoTokenizer.from_pretrained(model_path)
                tokenizer_loaded = True
                print(f"Successfully loaded tokenizer with AutoTokenizer, vocab size: {self.tokenizer.vocab_size}")
            except Exception as e:
                print(f"AutoTokenizer failed: {e}")
        
        # 2. Try SentencePiece model if available
        if not tokenizer_loaded:
            spm_model_path = os.path.join(model_path, "sentencepiece.bpe.model")
            if os.path.exists(spm_model_path):
                try:
                    print(f"Trying to load SentencePiece model from {spm_model_path}")
                    # Use SentencePiece directly
                    import sentencepiece as spm
                    sp_model = spm.SentencePieceProcessor()
                    sp_model.Load(spm_model_path)
                    
                    # Create a wrapper tokenizer
                    from transformers import PreTrainedTokenizer
                    
                    class SentencePieceTokenizer(PreTrainedTokenizer):
                        def __init__(self, sp_model):
                            # Assign sp_model before calling super().__init__(); newer
                            # transformers versions resolve special tokens during init,
                            # which requires the underlying vocabulary to be available
                            self.sp_model = sp_model
                            super().__init__(bos_token="<s>", eos_token="</s>",
                                             unk_token="<unk>", pad_token="<pad>",
                                             mask_token="<mask>")
                            
                        def _tokenize(self, text):
                            return self.sp_model.EncodeAsPieces(text)
                            
                        def _convert_token_to_id(self, token):
                            return self.sp_model.PieceToId(token)
                            
                        def _convert_id_to_token(self, index):
                            return self.sp_model.IdToPiece(index)
                            
                        def get_vocab(self):
                            # Required by PreTrainedTokenizer in recent transformers versions
                            return {self.sp_model.IdToPiece(i): i for i in range(self.sp_model.GetPieceSize())}
                            
                        @property
                        def vocab_size(self):
                            return self.sp_model.GetPieceSize()
                    
                    self.tokenizer = SentencePieceTokenizer(sp_model)
                    tokenizer_loaded = True
                    print(f"Successfully loaded SentencePiece tokenizer, vocab size: {self.tokenizer.vocab_size}")
                except Exception as e:
                    print(f"SentencePiece loading failed: {e}")
        
        # 3. Try tokenizer.json if available
        if not tokenizer_loaded:
            tokenizer_json_path = os.path.join(model_path, "tokenizer.json")
            if os.path.exists(tokenizer_json_path):
                try:
                    print(f"Trying to load tokenizer from {tokenizer_json_path}")
                    self.tokenizer = PreTrainedTokenizerFast(
                        tokenizer_file=tokenizer_json_path,
                        bos_token="<s>",
                        eos_token="</s>",
                        unk_token="<unk>",
                        pad_token="<pad>",
                        mask_token="<mask>",
                        model_max_length=512
                    )
                    tokenizer_loaded = True
                    print(f"Successfully loaded tokenizer with PreTrainedTokenizerFast, vocab size: {self.tokenizer.vocab_size}")
                except Exception as e:
                    print(f"PreTrainedTokenizerFast failed: {e}")
        
        # 4. Search for any tokenizer file as last resort
        if not tokenizer_loaded:
            try:
                print("Searching for any tokenizer files in the directory...")
                candidate_files = []
                for file in os.listdir(model_path):
                    filepath = os.path.join(model_path, file)
                    if os.path.isfile(filepath) and any(keyword in file.lower() for keyword in ['token', 'vocab', 'sentencepiece', 'bpe']):
                        candidate_files.append(filepath)
                
                if candidate_files:
                    print(f"Found potential tokenizer files: {candidate_files}")
                    # Try each file until one works
                    for file_path in candidate_files:
                        try:
                            if file_path.endswith('.json'):
                                self.tokenizer = PreTrainedTokenizerFast(
                                    tokenizer_file=file_path,
                                    bos_token="<s>",
                                    eos_token="</s>",
                                    unk_token="<unk>",
                                    pad_token="<pad>",
                                    mask_token="<mask>",
                                    model_max_length=512
                                )
                                tokenizer_loaded = True
                                print(f"Successfully loaded tokenizer from {file_path}")
                                break
                            elif file_path.endswith('.model'):
                                import sentencepiece as spm
                                sp_model = spm.SentencePieceProcessor()
                                sp_model.Load(file_path)
                                # A raw SentencePiece model would still need to be wrapped
                                # in a PreTrainedTokenizer subclass (as in step 2) before it
                                # could be assigned to self.tokenizer, so do not mark the
                                # tokenizer as loaded here; keep trying other candidates
                                print(f"Found SentencePiece model at {file_path}, but no tokenizer wrapper was built")
                        except Exception as file_e:
                            print(f"Failed to load {file_path}: {file_e}")
            except Exception as e:
                print(f"Error searching for tokenizer files: {e}")
        
        if not tokenizer_loaded:
            raise ValueError("Could not load tokenizer from any available source. Please check the model directory.")
        
        # Load model config
        try:
            print(f"Loading config from {model_path}")
            config = AutoConfig.from_pretrained(model_path)
            print(f"Config loaded with hidden_size={config.hidden_size}")
        except Exception as e:
            print(f"Error loading config: {e}")
            raise RuntimeError("Could not load model configuration")
        
        # Load model weights with handling for PyTorch version differences
        try:
            model_path_pt = os.path.join(model_path, 'embedding_model.pt')
            try:
                # Pass weights_only=False so the full checkpoint dict can be unpickled
                # (PyTorch >= 2.6 defaults weights_only to True)
                model_info = torch.load(
                    model_path_pt,
                    map_location=self.device,
                    weights_only=False
                )
            except TypeError:
                # Fall back for older PyTorch versions
                model_info = torch.load(
                    model_path_pt,
                    map_location=self.device
                )
            
            print(f"Model info keys: {list(model_info.keys())}")
        except Exception as e:
            print(f"Error loading model weights: {e}")
            raise RuntimeError(f"Could not load model weights: {e}")
        
        # Create model
        self.model = SentenceEmbeddingModel(config)
        
        # Load weights
        if 'model_state_dict' in model_info:
            self.model.load_state_dict(model_info['model_state_dict'])
        else:
            # model_info itself is the state_dict
            self.model.load_state_dict(model_info)
        
        self.model.to(self.device)
        self.model.eval()
        
        # Get embedding dimension
        self.embedding_dim = model_info.get('embedding_dim', config.hidden_size)
        print(f"Model loaded successfully with embedding dimension: {self.embedding_dim}")

    def encode(self, sentences, batch_size=32):
        """
        Encode sentences to embeddings
        """
        if isinstance(sentences, str):
            sentences = [sentences]
            
        all_embeddings = []
        
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            
            # Tokenize
            encoded_input = self.tokenizer(
                batch, 
                padding=True, 
                truncation=True, 
                max_length=128, 
                return_tensors='pt'
            ).to(self.device)
            
            # Compute embeddings
            with torch.no_grad():
                embeddings = self.model(encoded_input['input_ids'], encoded_input['attention_mask'])
                
            all_embeddings.append(embeddings.cpu().numpy())
            
        # Concatenate all embeddings
        all_embeddings = np.vstack(all_embeddings)
        
        return all_embeddings
    
    def compute_similarity(self, sentences1, sentences2=None):
        """
        Compute similarity between sentences
        """
        embeddings1 = self.encode(sentences1)
        
        if sentences2 is None:
            # Compute similarity matrix for the sentences
            return cosine_similarity(embeddings1)
        else:
            embeddings2 = self.encode(sentences2)
            # Compute pairwise similarity
            return np.array([cosine_similarity([e1], [e2])[0][0] for e1, e2 in zip(embeddings1, embeddings2)])
    
    def search(self, query, documents, top_k=5):
        """
        Search for the most similar documents to a query
        """
        query_embedding = self.encode([query])[0]
        document_embeddings = self.encode(documents)
        
        # Compute cosine similarities
        similarities = cosine_similarity([query_embedding], document_embeddings)[0]
        
        # Get top_k indices
        top_indices = similarities.argsort()[-top_k:][::-1]
        
        # Return results with scores
        results = []
        for idx in top_indices:
            results.append({
                'document': documents[idx],
                'score': similarities[idx]
            })
            
        return results

def main():
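    """Run a small Hindi sentence-similarity demo using the trained model."""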
    # Use fixed parameters instead of command-line arguments
    model_path = "output/hindi-sentence-embeddings-from-scratch/final"
    mode = "similarity"
    
    # Load model
    model = SentenceEmbedder(model_path)
    
    # Example sentences for similarity computation
    sentences = [
        'मुझे हिंदी भाषा बहुत पसंद है।',
        'मैं हिंदी भाषा सीख रहा हूँ।',
        'भारत एक विशाल देश है।',
        'भारत में बहुत सारी भाषाएँ बोली जाती हैं।',
        'आज मौसम बहुत अच्छा है।',
        'कल बारिश होगी।',
        'दिल्ली भारत की राजधानी है।',
        'मुंबई भारत का आर्थिक केंद्र है।',
        'भारतीय खाना बहुत स्वादिष्ट होता है।',
        'मैं आज बाजार जाऊंगा।'
    ]
    
    # Document corpus for search
    document_corpus = [
        'हिंदी भारत की आधिकारिक भाषा है।',
        'भारत में अनेक भाषाएँ बोली जाती हैं।',
        'दिल्ली भारत की राजधानी है।',
        'मुंबई भारत का सबसे बड़ा शहर है।',
        'हिमालय पर्वत भारत के उत्तर में स्थित है।',
        'गंगा नदी भारत की सबसे पवित्र नदी है।',
        'भारतीय संस्कृति बहुत समृद्ध है।',
        'भारत में अनेक त्योहार मनाए जाते हैं।',
        'तमिल, तेलुगु, कन्नड़ और मलयालम दक्षिण भारत की प्रमुख भाषाएँ हैं।',
        'आम, अमरूद और केला भारत के लोकप्रिय फल हैं।',
        'भारत में विभिन्न धर्मों के लोग एक साथ रहते हैं।',
        'रामायण और महाभारत भारत के प्रसिद्ध महाकाव्य हैं।'
    ]
    
    if mode == 'similarity':
        # Compute similarity matrix
        print("Computing similarity matrix...")
        sim_matrix = model.compute_similarity(sentences)
        
        # Print sentences with indices
        print("\nSentences:")
        for i, sentence in enumerate(sentences):
            print(f"[{i}] {sentence}")
        
        # Print similarity matrix
        print("\nSimilarity matrix:")
        np.set_printoptions(precision=2)
        print(sim_matrix)
        
        # Find most similar sentence pairs
        print("\nMost similar sentence pairs:")
        # Skip diagonal (self-similarity)
        sim_matrix_no_diag = sim_matrix.copy()
        np.fill_diagonal(sim_matrix_no_diag, -1)
        for _ in range(5):  # Top 5 most similar pairs
            max_idx = np.unravel_index(sim_matrix_no_diag.argmax(), sim_matrix_no_diag.shape)
            i, j = max_idx
            print(f"Similarity: {sim_matrix[i, j]:.4f}")
            print(f"Sentence 1: {sentences[i]}")
            print(f"Sentence 2: {sentences[j]}")
            print("---")
            # Mark this pair (and its symmetric entry) as processed so each
            # pair is reported only once
            sim_matrix_no_diag[i, j] = -1
            sim_matrix_no_diag[j, i] = -1
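    elif mode == 'search':
        # Illustrative sketch (never reached with the fixed mode above): rank the
        # document_corpus against a hypothetical example query using model.search
        query = 'भारत में कौन सी भाषाएँ बोली जाती हैं?'
        print(f"\nSearch query: {query}")
        results = model.search(query, document_corpus, top_k=3)
        for result in results:
            print(f"Score: {result['score']:.4f} - {result['document']}")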

if __name__ == "__main__":
    main()