import os
import torch
from transformers import AutoTokenizer, PreTrainedTokenizerFast, AutoConfig
from torch.nn import functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SentenceEmbeddingModel(torch.nn.Module):
    """
    Sentence Embedding model for inference
    """
    def __init__(self, config):
        super(SentenceEmbeddingModel, self).__init__()
        
        # Create transformer model from config
        from transformers import AutoModel
        self.transformer = AutoModel.from_config(config)
        self.pooling_mode = 'mean'
        
    def forward(self, input_ids, attention_mask):
        # Get sequence outputs from transformer
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # Mean pooling
        token_embeddings = outputs[0]  # First element of model_output contains token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        
        # Sum embeddings
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        
        # Sum mask
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        
        # Pool
        pooled_output = sum_embeddings / sum_mask
        
        # Normalize
        pooled_output = F.normalize(pooled_output, p=2, dim=1)
        
        return pooled_output

class SentenceEmbedder:
    def __init__(self, model_path):
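        """
        Load the tokenizer, config, and trained weights from model_path,
        trying several tokenizer sources in turn (AutoTokenizer, a
        SentencePiece model, tokenizer.json, then any tokenizer-like file
        found in the directory).
        """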
        # Load saved model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        
        # Improved tokenizer loading with more robust error handling
        tokenizer_loaded = False
        
        # 1. Try AutoTokenizer first (most general approach)
        if not tokenizer_loaded:
            try:
                print(f"Trying AutoTokenizer from {model_path}")
                self.tokenizer = AutoTokenizer.from_pretrained(model_path)
                tokenizer_loaded = True
                print(f"Successfully loaded tokenizer with AutoTokenizer, vocab size: {self.tokenizer.vocab_size}")
            except Exception as e:
                print(f"AutoTokenizer failed: {e}")
        
        # 2. Try SentencePiece model if available
        if not tokenizer_loaded:
            spm_model_path = os.path.join(model_path, "sentencepiece.bpe.model")
            if os.path.exists(spm_model_path):
                try:
                    print(f"Trying to load SentencePiece model from {spm_model_path}")
                    # Use SentencePiece directly
                    import sentencepiece as spm
                    sp_model = spm.SentencePieceProcessor()
                    sp_model.Load(spm_model_path)
                    
                    # Create a wrapper tokenizer
                    from transformers import PreTrainedTokenizer
                    
                    class SentencePieceTokenizer(PreTrainedTokenizer):
                        def __init__(self, sp_model):
                            # Assign sp_model before calling super().__init__(); newer
                            # transformers versions resolve special tokens during init,
                            # which requires the underlying vocabulary to be available
                            self.sp_model = sp_model
                            super().__init__(bos_token="<s>", eos_token="</s>",
                                             unk_token="<unk>", pad_token="<pad>",
                                             mask_token="<mask>")
                            
                        def _tokenize(self, text):
                            return self.sp_model.EncodeAsPieces(text)
                            
                        def _convert_token_to_id(self, token):
                            return self.sp_model.PieceToId(token)
                            
                        def _convert_id_to_token(self, index):
                            return self.sp_model.IdToPiece(index)
                            
                        def get_vocab(self):
                            # Required by PreTrainedTokenizer in recent transformers versions
                            return {self.sp_model.IdToPiece(i): i for i in range(self.sp_model.GetPieceSize())}
                            
                        @property
                        def vocab_size(self):
                            return self.sp_model.GetPieceSize()
                    
                    self.tokenizer = SentencePieceTokenizer(sp_model)
                    tokenizer_loaded = True
                    print(f"Successfully loaded SentencePiece tokenizer, vocab size: {self.tokenizer.vocab_size}")
                except Exception as e:
                    print(f"SentencePiece loading failed: {e}")
        
        # 3. Try tokenizer.json if available
        if not tokenizer_loaded:
            tokenizer_json_path = os.path.join(model_path, "tokenizer.json")
            if os.path.exists(tokenizer_json_path):
                try:
                    print(f"Trying to load tokenizer from {tokenizer_json_path}")
                    self.tokenizer = PreTrainedTokenizerFast(
                        tokenizer_file=tokenizer_json_path,
                        bos_token="<s>",
                        eos_token="</s>",
                        unk_token="<unk>",
                        pad_token="<pad>",
                        mask_token="<mask>",
                        model_max_length=512
                    )
                    tokenizer_loaded = True
                    print(f"Successfully loaded tokenizer with PreTrainedTokenizerFast, vocab size: {self.tokenizer.vocab_size}")
                except Exception as e:
                    print(f"PreTrainedTokenizerFast failed: {e}")
        
        # 4. Search for any tokenizer file as last resort
        if not tokenizer_loaded:
            try:
                print("Searching for any tokenizer files in the directory...")
                candidate_files = []
                for file in os.listdir(model_path):
                    filepath = os.path.join(model_path, file)
                    if os.path.isfile(filepath) and any(keyword in file.lower() for keyword in ['token', 'vocab', 'sentencepiece', 'bpe']):
                        candidate_files.append(filepath)
                
                if candidate_files:
                    print(f"Found potential tokenizer files: {candidate_files}")
                    # Try each file until one works
                    for file_path in candidate_files:
                        try:
                            if file_path.endswith('.json'):
                                self.tokenizer = PreTrainedTokenizerFast(
                                    tokenizer_file=file_path,
                                    bos_token="<s>",
                                    eos_token="</s>",
                                    unk_token="<unk>",
                                    pad_token="<pad>",
                                    mask_token="<mask>",
                                    model_max_length=512
                                )
                                tokenizer_loaded = True
                                print(f"Successfully loaded tokenizer from {file_path}")
                                break
                            elif file_path.endswith('.model'):
                                import sentencepiece as spm
                                sp_model = spm.SentencePieceProcessor()
                                sp_model.Load(file_path)
                                # A raw SentencePiece model would still need to be wrapped
                                # in a PreTrainedTokenizer subclass (as in step 2) before it
                                # could be assigned to self.tokenizer, so do not mark the
                                # tokenizer as loaded here; keep trying other candidates
                                print(f"Found SentencePiece model at {file_path}, but no tokenizer wrapper was built")
                        except Exception as file_e:
                            print(f"Failed to load {file_path}: {file_e}")
            except Exception as e:
                print(f"Error searching for tokenizer files: {e}")
        
        if not tokenizer_loaded:
            raise ValueError("Could not load tokenizer from any available source. Please check the model directory.")
        
        # Load model config
        try:
            print(f"Loading config from {model_path}")
            config = AutoConfig.from_pretrained(model_path)
            print(f"Config loaded with hidden_size={config.hidden_size}")
        except Exception as e:
            print(f"Error loading config: {e}")
            raise RuntimeError("Could not load model configuration")
        
        # Load model weights with handling for PyTorch version differences
        try:
            model_path_pt = os.path.join(model_path, 'embedding_model.pt')
            try:
                # Pass weights_only=False so the full checkpoint dict can be unpickled
                # (PyTorch >= 2.6 defaults weights_only to True)
                model_info = torch.load(
                    model_path_pt,
                    map_location=self.device,
                    weights_only=False
                )
            except TypeError:
                # Fall back for older PyTorch versions
                model_info = torch.load(
                    model_path_pt,
                    map_location=self.device
                )
            
            print(f"Model info keys: {list(model_info.keys())}")
        except Exception as e:
            print(f"Error loading model weights: {e}")
            raise RuntimeError(f"Could not load model weights: {e}")
        
        # Create model
        self.model = SentenceEmbeddingModel(config)
        
        # Load weights
        if 'model_state_dict' in model_info:
            self.model.load_state_dict(model_info['model_state_dict'])
        else:
            # model_info itself is the state_dict
            self.model.load_state_dict(model_info)
        
        self.model.to(self.device)
        self.model.eval()
        
        # Get embedding dimension
        self.embedding_dim = model_info.get('embedding_dim', config.hidden_size)
        print(f"Model loaded successfully with embedding dimension: {self.embedding_dim}")

    def encode(self, sentences, batch_size=32):
        """
        Encode sentences to embeddings
        """
        if isinstance(sentences, str):
            sentences = [sentences]
            
        all_embeddings = []
        
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            
            # Tokenize
            encoded_input = self.tokenizer(
                batch, 
                padding=True, 
                truncation=True, 
                max_length=128, 
                return_tensors='pt'
            ).to(self.device)
            
            # Compute embeddings
            with torch.no_grad():
                embeddings = self.model(encoded_input['input_ids'], encoded_input['attention_mask'])
                
            all_embeddings.append(embeddings.cpu().numpy())
            
        # Concatenate all embeddings
        all_embeddings = np.vstack(all_embeddings)
        
        return all_embeddings
    
    def compute_similarity(self, sentences1, sentences2=None):
        """
        Compute similarity between sentences
        """
        embeddings1 = self.encode(sentences1)
        
        if sentences2 is None:
            # Compute similarity matrix for the sentences
            return cosine_similarity(embeddings1)
        else:
            embeddings2 = self.encode(sentences2)
            # Compute pairwise similarity
            return np.array([cosine_similarity([e1], [e2])[0][0] for e1, e2 in zip(embeddings1, embeddings2)])
    
    def search(self, query, documents, top_k=5):
        """
        Search for the most similar documents to a query
        """
        query_embedding = self.encode([query])[0]
        document_embeddings = self.encode(documents)
        
        # Compute cosine similarities
        similarities = cosine_similarity([query_embedding], document_embeddings)[0]
        
        # Get top_k indices
        top_indices = similarities.argsort()[-top_k:][::-1]
        
        # Return results with scores
        results = []
        for idx in top_indices:
            results.append({
                'document': documents[idx],
                'score': similarities[idx]
            })
            
        return results

def main():
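    """Run a small Hindi sentence-similarity demo using the trained model."""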
    # Use fixed parameters instead of command-line arguments
    model_path = "output/hindi-sentence-embeddings-from-scratch/final"
    mode = "similarity"
    
    # Load model
    model = SentenceEmbedder(model_path)
    
    # Example sentences for similarity computation
    sentences = [
        'मुझे हिंदी भाषा बहुत पसंद है।',
        'मैं हिंदी भाषा सीख रहा हूँ।',
        'भारत एक विशाल देश है।',
        'भारत में बहुत सारी भाषाएँ बोली जाती हैं।',
        'आज मौसम बहुत अच्छा है।',
        'कल बारिश होगी।',
        'दिल्ली भारत की राजधानी है।',
        'मुंबई भारत का आर्थिक केंद्र है।',
        'भारतीय खाना बहुत स्वादिष्ट होता है।',
        'मैं आज बाजार जाऊंगा।'
    ]
    
    # Document corpus for search
    document_corpus = [
        'हिंदी भारत की आधिकारिक भाषा है।',
        'भारत में अनेक भाषाएँ बोली जाती हैं।',
        'दिल्ली भारत की राजधानी है।',
        'मुंबई भारत का सबसे बड़ा शहर है।',
        'हिमालय पर्वत भारत के उत्तर में स्थित है।',
        'गंगा नदी भारत की सबसे पवित्र नदी है।',
        'भारतीय संस्कृति बहुत समृद्ध है।',
        'भारत में अनेक त्योहार मनाए जाते हैं।',
        'तमिल, तेलुगु, कन्नड़ और मलयालम दक्षिण भारत की प्रमुख भाषाएँ हैं।',
        'आम, अमरूद और केला भारत के लोकप्रिय फल हैं।',
        'भारत में विभिन्न धर्मों के लोग एक साथ रहते हैं।',
        'रामायण और महाभारत भारत के प्रसिद्ध महाकाव्य हैं।'
    ]
    
    if mode == 'similarity':
        # Compute similarity matrix
        print("Computing similarity matrix...")
        sim_matrix = model.compute_similarity(sentences)
        
        # Print sentences with indices
        print("\nSentences:")
        for i, sentence in enumerate(sentences):
            print(f"[{i}] {sentence}")
        
        # Print similarity matrix
        print("\nSimilarity matrix:")
        np.set_printoptions(precision=2)
        print(sim_matrix)
        
        # Find most similar sentence pairs
        print("\nMost similar sentence pairs:")
        # Skip diagonal (self-similarity)
        sim_matrix_no_diag = sim_matrix.copy()
        np.fill_diagonal(sim_matrix_no_diag, -1)
        for _ in range(5):  # Top 5 most similar pairs
            max_idx = np.unravel_index(sim_matrix_no_diag.argmax(), sim_matrix_no_diag.shape)
            i, j = max_idx
            print(f"Similarity: {sim_matrix[i, j]:.4f}")
            print(f"Sentence 1: {sentences[i]}")
            print(f"Sentence 2: {sentences[j]}")
            print("---")
            # Mark this pair (and its symmetric entry) as processed so each
            # pair is reported only once
            sim_matrix_no_diag[i, j] = -1
            sim_matrix_no_diag[j, i] = -1
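    elif mode == 'search':
        # Illustrative sketch (never reached with the fixed mode above): rank the
        # document_corpus against a hypothetical example query using model.search
        query = 'भारत में कौन सी भाषाएँ बोली जाती हैं?'
        print(f"\nSearch query: {query}")
        results = model.search(query, document_corpus, top_k=3)
        for result in results:
            print(f"Score: {result['score']:.4f} - {result['document']}")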

if __name__ == "__main__":
    main()