Initial model upload: Hindi Sentence Embeddings Foundational Model
- config.json +1 -1
- hindi_embeddings.py +106 -59
config.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "_name_or_path": "convai-hindi-embedding",
   "architectures": [
-    "
+    "ConvaiEmbedding"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
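For a quick sanity check of the config change above, a minimal sketch (not part of the commit) that reads a locally downloaded config.json and confirms the architectures entry; the local path is an assumption.

import json

# Hypothetical local copy of this repo's config.json.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# After this commit, the architectures field names the custom model class.
assert config["architectures"] == ["ConvaiEmbedding"], config["architectures"]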
hindi_embeddings.py
CHANGED
@@ -45,29 +45,65 @@ class SentenceEmbedder:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         print(f"Using device: {self.device}")

-        #
-
-
+        # Improved tokenizer loading with more robust error handling
+        tokenizer_loaded = False
+
+        # 1. Try AutoTokenizer first (most general approach)
+        if not tokenizer_loaded:
+            try:
+                print(f"Trying AutoTokenizer from {model_path}")
+                self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+                tokenizer_loaded = True
+                print(f"Successfully loaded tokenizer with AutoTokenizer, vocab size: {self.tokenizer.vocab_size}")
+            except Exception as e:
+                print(f"AutoTokenizer failed: {e}")
+
+        # 2. Try SentencePiece model if available
+        if not tokenizer_loaded:
             spm_model_path = os.path.join(model_path, "sentencepiece.bpe.model")
             if os.path.exists(spm_model_path):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                try:
+                    print(f"Trying to load SentencePiece model from {spm_model_path}")
+                    # Use SentencePiece directly
+                    import sentencepiece as spm
+                    sp_model = spm.SentencePieceProcessor()
+                    sp_model.Load(spm_model_path)
+
+                    # Create a wrapper tokenizer
+                    from transformers import PreTrainedTokenizer
+
+                    class SentencePieceTokenizer(PreTrainedTokenizer):
+                        def __init__(self, sp_model):
+                            super().__init__(bos_token="<s>", eos_token="</s>",
+                                             unk_token="<unk>", pad_token="<pad>",
+                                             mask_token="<mask>")
+                            self.sp_model = sp_model
+
+                        def _tokenize(self, text):
+                            return self.sp_model.EncodeAsPieces(text)
+
+                        def _convert_token_to_id(self, token):
+                            return self.sp_model.PieceToId(token)
+
+                        def _convert_id_to_token(self, index):
+                            return self.sp_model.IdToPiece(index)
+
+                        @property
+                        def vocab_size(self):
+                            return self.sp_model.GetPieceSize()
+
+                    self.tokenizer = SentencePieceTokenizer(sp_model)
+                    tokenizer_loaded = True
+                    print(f"Successfully loaded SentencePiece tokenizer, vocab size: {self.tokenizer.vocab_size}")
+                except Exception as e:
+                    print(f"SentencePiece loading failed: {e}")
+
+        # 3. Try tokenizer.json if available
+        if not tokenizer_loaded:
+            tokenizer_json_path = os.path.join(model_path, "tokenizer.json")
+            if os.path.exists(tokenizer_json_path):
+                try:
+                    print(f"Trying to load tokenizer from {tokenizer_json_path}")
                     self.tokenizer = PreTrainedTokenizerFast(
                         tokenizer_file=tokenizer_json_path,
                         bos_token="<s>",
@@ -77,44 +113,55 @@ class SentenceEmbedder:
                         mask_token="<mask>",
                         model_max_length=512
                     )
-
-
-
-
+                    tokenizer_loaded = True
+                    print(f"Successfully loaded tokenizer with PreTrainedTokenizerFast, vocab size: {self.tokenizer.vocab_size}")
+                except Exception as e:
+                    print(f"PreTrainedTokenizerFast failed: {e}")
+
+        # 4. Search for any tokenizer file as last resort
+        if not tokenizer_loaded:
+            try:
+                print("Searching for any tokenizer files in the directory...")
+                candidate_files = []
+                for file in os.listdir(model_path):
+                    filepath = os.path.join(model_path, file)
+                    if os.path.isfile(filepath) and any(keyword in file.lower() for keyword in ['token', 'vocab', 'sentencepiece', 'bpe']):
+                        candidate_files.append(filepath)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                if candidate_files:
+                    print(f"Found potential tokenizer files: {candidate_files}")
+                    # Try each file until one works
+                    for file_path in candidate_files:
+                        try:
+                            if file_path.endswith('.json'):
+                                self.tokenizer = PreTrainedTokenizerFast(
+                                    tokenizer_file=file_path,
+                                    bos_token="<s>",
+                                    eos_token="</s>",
+                                    unk_token="<unk>",
+                                    pad_token="<pad>",
+                                    mask_token="<mask>",
+                                    model_max_length=512
+                                )
+                                tokenizer_loaded = True
+                                print(f"Successfully loaded tokenizer from {file_path}")
+                                break
+                            elif file_path.endswith('.model'):
+                                import sentencepiece as spm
+                                sp_model = spm.SentencePieceProcessor()
+                                sp_model.Load(file_path)
+                                # Create custom tokenizer as above
+                                # This is simplified for brevity
+                                tokenizer_loaded = True
+                                print(f"Successfully loaded SentencePiece from {file_path}")
+                                break
+                        except Exception as file_e:
+                            print(f"Failed to load {file_path}: {file_e}")
+            except Exception as e:
+                print(f"Error searching for tokenizer files: {e}")
+
+        if not tokenizer_loaded:
+            raise ValueError("Could not load tokenizer from any available source. Please check the model directory.")

         # Load model config
         try:
@@ -235,7 +282,7 @@ class SentenceEmbedder:

 def main():
     # Remove args dependency and use fixed parameters
-    model_path = "/
+    model_path = "output/hindi-sentence-embeddings-from-scratch/final"
     mode = "similarity"

     # Load model
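As a reading aid, here is a condensed, standalone sketch of the fallback order the new tokenizer-loading code follows (AutoTokenizer first, then sentencepiece.bpe.model, then tokenizer.json). The function name and standalone structure are illustrative only, not part of the committed file; the model path is the one set in main().

import os

from transformers import AutoTokenizer, PreTrainedTokenizerFast

def load_tokenizer_with_fallback(model_path):
    # 1. Generic AutoTokenizer (works when tokenizer config files are present)
    try:
        return AutoTokenizer.from_pretrained(model_path)
    except Exception as e:
        print(f"AutoTokenizer failed: {e}")

    # 2. Raw SentencePiece model
    spm_path = os.path.join(model_path, "sentencepiece.bpe.model")
    if os.path.exists(spm_path):
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(spm_path)
        return sp  # the commit wraps this in a PreTrainedTokenizer subclass

    # 3. Fast tokenizer serialized as tokenizer.json
    json_path = os.path.join(model_path, "tokenizer.json")
    if os.path.exists(json_path):
        return PreTrainedTokenizerFast(
            tokenizer_file=json_path,
            bos_token="<s>", eos_token="</s>", unk_token="<unk>",
            pad_token="<pad>", mask_token="<mask>", model_max_length=512,
        )

    raise ValueError("Could not load tokenizer from any available source.")

if __name__ == "__main__":
    tokenizer = load_tokenizer_with_fallback("output/hindi-sentence-embeddings-from-scratch/final")

Each strategy runs only when the previous one failed, and a hard error is raised only after every source has been tried, which matches the behaviour of the committed code.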