convaiinnovations committed (verified)
Commit bc4d54f · 1 Parent(s): 2c0b170

Initial model upload: Hindi Sentence Embeddings Foundational Model

Files changed (2)
  1. config.json +1 -1
  2. hindi_embeddings.py +106 -59
config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "convai-hindi-embedding",
   "architectures": [
-    "XLMRobertaModel"
+    "ConvaiEmbedding"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
hindi_embeddings.py CHANGED
@@ -45,29 +45,65 @@ class SentenceEmbedder:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         print(f"Using device: {self.device}")

-        # Try to load tokenizer using sentencepiece model
-        try:
-            # Use the sentencepiece.bpe.model file
+        # Improved tokenizer loading with more robust error handling
+        tokenizer_loaded = False
+
+        # 1. Try AutoTokenizer first (most general approach)
+        if not tokenizer_loaded:
+            try:
+                print(f"Trying AutoTokenizer from {model_path}")
+                self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+                tokenizer_loaded = True
+                print(f"Successfully loaded tokenizer with AutoTokenizer, vocab size: {self.tokenizer.vocab_size}")
+            except Exception as e:
+                print(f"AutoTokenizer failed: {e}")
+
+        # 2. Try SentencePiece model if available
+        if not tokenizer_loaded:
             spm_model_path = os.path.join(model_path, "sentencepiece.bpe.model")
             if os.path.exists(spm_model_path):
-                print(f"Loading SentencePiece tokenizer from {spm_model_path}")
-                from transformers import XLNetTokenizer
-                self.tokenizer = XLNetTokenizer.from_pretrained(
-                    model_path,
-                    vocab_file=None,  # Not needed for SPM
-                    bos_token="<s>",
-                    eos_token="</s>",
-                    unk_token="<unk>",
-                    pad_token="<pad>",
-                    mask_token="<mask>",
-                    model_max_length=512,
-                    do_lower_case=False
-                )
-            else:
-                # Fallback to other tokenizer methods
-                tokenizer_json_path = os.path.join(model_path, "tokenizer.json")
-                if os.path.exists(tokenizer_json_path):
-                    print(f"Loading tokenizer from {tokenizer_json_path}")
+                try:
+                    print(f"Trying to load SentencePiece model from {spm_model_path}")
+                    # Use SentencePiece directly
+                    import sentencepiece as spm
+                    sp_model = spm.SentencePieceProcessor()
+                    sp_model.Load(spm_model_path)
+
+                    # Create a wrapper tokenizer
+                    from transformers import PreTrainedTokenizer
+
+                    class SentencePieceTokenizer(PreTrainedTokenizer):
+                        def __init__(self, sp_model):
+                            super().__init__(bos_token="<s>", eos_token="</s>",
+                                             unk_token="<unk>", pad_token="<pad>",
+                                             mask_token="<mask>")
+                            self.sp_model = sp_model
+
+                        def _tokenize(self, text):
+                            return self.sp_model.EncodeAsPieces(text)
+
+                        def _convert_token_to_id(self, token):
+                            return self.sp_model.PieceToId(token)
+
+                        def _convert_id_to_token(self, index):
+                            return self.sp_model.IdToPiece(index)
+
+                        @property
+                        def vocab_size(self):
+                            return self.sp_model.GetPieceSize()
+
+                    self.tokenizer = SentencePieceTokenizer(sp_model)
+                    tokenizer_loaded = True
+                    print(f"Successfully loaded SentencePiece tokenizer, vocab size: {self.tokenizer.vocab_size}")
+                except Exception as e:
+                    print(f"SentencePiece loading failed: {e}")
+
+        # 3. Try tokenizer.json if available
+        if not tokenizer_loaded:
+            tokenizer_json_path = os.path.join(model_path, "tokenizer.json")
+            if os.path.exists(tokenizer_json_path):
+                try:
+                    print(f"Trying to load tokenizer from {tokenizer_json_path}")
                     self.tokenizer = PreTrainedTokenizerFast(
                         tokenizer_file=tokenizer_json_path,
                         bos_token="<s>",
@@ -77,44 +113,55 @@ class SentenceEmbedder:
                         mask_token="<mask>",
                         model_max_length=512
                     )
-                else:
-                    # Try using AutoTokenizer
-                    print(f"Loading tokenizer using AutoTokenizer from {model_path}")
-                    self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+                    tokenizer_loaded = True
+                    print(f"Successfully loaded tokenizer with PreTrainedTokenizerFast, vocab size: {self.tokenizer.vocab_size}")
+                except Exception as e:
+                    print(f"PreTrainedTokenizerFast failed: {e}")
+
+        # 4. Search for any tokenizer file as last resort
+        if not tokenizer_loaded:
+            try:
+                print("Searching for any tokenizer files in the directory...")
+                candidate_files = []
+                for file in os.listdir(model_path):
+                    filepath = os.path.join(model_path, file)
+                    if os.path.isfile(filepath) and any(keyword in file.lower() for keyword in ['token', 'vocab', 'sentencepiece', 'bpe']):
+                        candidate_files.append(filepath)

-            print(f"Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")
-
-        except Exception as e:
-            print(f"Error loading tokenizer: {e}")
-            # Look for alternative tokenizer files
-            tokenizer_files = [f for f in os.listdir(model_path) if (f.endswith('.model') or f.endswith('.json')) and ('token' in f.lower() or 'sentence' in f.lower())]
-            if tokenizer_files:
-                tokenizer_file = os.path.join(model_path, tokenizer_files[0])
-                print(f"Found alternative tokenizer file: {tokenizer_file}")
-                if tokenizer_file.endswith('.model'):
-                    from transformers import XLNetTokenizer
-                    self.tokenizer = XLNetTokenizer.from_pretrained(
-                        tokenizer_file,
-                        bos_token="<s>",
-                        eos_token="</s>",
-                        unk_token="<unk>",
-                        pad_token="<pad>",
-                        mask_token="<mask>",
-                        model_max_length=512,
-                        do_lower_case=False
-                    )
-                else:
-                    self.tokenizer = PreTrainedTokenizerFast(
-                        tokenizer_file=tokenizer_file,
-                        bos_token="<s>",
-                        eos_token="</s>",
-                        unk_token="<unk>",
-                        pad_token="<pad>",
-                        mask_token="<mask>",
-                        model_max_length=512
-                    )
-            else:
-                raise ValueError(f"No tokenizer file found in {model_path}")
+                if candidate_files:
+                    print(f"Found potential tokenizer files: {candidate_files}")
+                    # Try each file until one works
+                    for file_path in candidate_files:
+                        try:
+                            if file_path.endswith('.json'):
+                                self.tokenizer = PreTrainedTokenizerFast(
+                                    tokenizer_file=file_path,
+                                    bos_token="<s>",
+                                    eos_token="</s>",
+                                    unk_token="<unk>",
+                                    pad_token="<pad>",
+                                    mask_token="<mask>",
+                                    model_max_length=512
+                                )
+                                tokenizer_loaded = True
+                                print(f"Successfully loaded tokenizer from {file_path}")
+                                break
+                            elif file_path.endswith('.model'):
+                                import sentencepiece as spm
+                                sp_model = spm.SentencePieceProcessor()
+                                sp_model.Load(file_path)
+                                # Create custom tokenizer as above
+                                # This is simplified for brevity
+                                tokenizer_loaded = True
+                                print(f"Successfully loaded SentencePiece from {file_path}")
+                                break
+                        except Exception as file_e:
+                            print(f"Failed to load {file_path}: {file_e}")
+            except Exception as e:
+                print(f"Error searching for tokenizer files: {e}")
+
+        if not tokenizer_loaded:
+            raise ValueError("Could not load tokenizer from any available source. Please check the model directory.")

         # Load model config
         try:
@@ -235,7 +282,7 @@
 
 def main():
     # Remove args dependency and use fixed parameters
-    model_path = "/content/hindi-embeddings-foundational-model"
+    model_path = "output/hindi-sentence-embeddings-from-scratch/final"
     mode = "similarity"
 
     # Load model
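
The new loading path tries AutoTokenizer first, then a raw SentencePiece model, then tokenizer.json, and only then scans the directory for anything that looks like a tokenizer file, so the common case stays on the standard transformers route while unusual checkpoints still load. The fallback in step 2 leans on just a few SentencePieceProcessor calls; a standalone sketch of that round trip, where the model path and the sample sentence are placeholders for illustration:

import sentencepiece as spm

# Placeholder path; in the class it is built as os.path.join(model_path, "sentencepiece.bpe.model").
sp_model = spm.SentencePieceProcessor()
sp_model.Load("sentencepiece.bpe.model")

text = "यह एक उदाहरण वाक्य है।"  # illustrative Hindi sentence ("This is an example sentence.")
pieces = sp_model.EncodeAsPieces(text)           # subword pieces, what _tokenize returns
ids = [sp_model.PieceToId(p) for p in pieces]    # piece -> id, what _convert_token_to_id does
back = [sp_model.IdToPiece(i) for i in ids]      # id -> piece, what _convert_id_to_token does

print(f"vocab size: {sp_model.GetPieceSize()}")  # what the vocab_size property exposes
print(pieces)
assert back == pieces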