KingNish committed (verified)
Commit 695fb8c · Parent(s): d502da0

Delete mmtokenizer.py

Files changed (1): mmtokenizer.py (+0 -367)
mmtokenizer.py DELETED
@@ -1,367 +0,0 @@
-from abc import ABC
-from abc import abstractmethod
-
-
-class AbstractTokenizer(ABC):
-    """Abstract class for tokenizer."""
-
-    def __init__(self, name):
-        self.name = name
-        super().__init__()
-
-    @property
-    @abstractmethod
-    def vocab_size(self):
-        pass
-
-    @property
-    @abstractmethod
-    def vocab(self):
-        """Dictionary from vocab text token to id token."""
-        pass
-
-    @property
-    @abstractmethod
-    def inv_vocab(self):
-        """Dictionary from vocab id token to text token."""
-        pass
-
-    @abstractmethod
-    def tokenize(self, text):
-        pass
-
-    def detokenize(self, token_ids):
-        raise NotImplementedError('detokenizer is not implemented for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def cls(self):
-        raise NotImplementedError('CLS is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def sep(self):
-        raise NotImplementedError('SEP is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def pad(self):
-        raise NotImplementedError('PAD is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def eod(self):
-        raise NotImplementedError('EOD is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-    @property
-    def mask(self):
-        raise NotImplementedError('MASK is not provided for {} '
-                                  'tokenizer'.format(self.name))
-
-
-class _SentencePieceTokenizer(AbstractTokenizer):
-    """SentencePieceTokenizer-Megatron wrapper"""
-
-    def __init__(self, model_file, vocab_extra_ids=0):
-        name = 'SentencePieceTokenizer'
-        super().__init__(name)
-
-        import sentencepiece
-        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=model_file)
-        self._initalize(vocab_extra_ids)
-
-    def _populate_vocab(self):
-        self._vocab = {}
-        self._inv_vocab = {}
-
-        for i in range(len(self.tokenizer)):
-            t = self.tokenizer.id_to_piece(i)
-            self._inv_vocab[i] = t
-            self._vocab[t] = i
-
-    def _initalize(self, vocab_extra_ids):
-        self._populate_vocab()
-        self._special_tokens = {}
-        self._inv_special_tokens = {}
-
-        self._t5_tokens = []
-
-        def _add_special_token(t):
-            if t not in self._vocab:
-                next_id = len(self._vocab)
-                self._vocab[t] = next_id
-                self._inv_vocab[next_id] = t
-            self._special_tokens[t] = self._vocab[t]
-            self._inv_special_tokens[self._vocab[t]] = t
-
-        _add_special_token('<CLS>')
-        self._cls_id = self._vocab['<CLS>']
-        _add_special_token('<SEP>')
-        self._sep_id = self._vocab['<SEP>']
-        _add_special_token('<EOD>')
-        self._eod_id = self._vocab['<EOD>']
-        _add_special_token('<MASK>')
-        self._mask_id = self._vocab['<MASK>']
-
-        pad_id = self.tokenizer.pad_id()
-        try:
-            pad_token = self.tokenizer.id_to_piece(pad_id)
-        except IndexError:
-            pad_token = '<PAD>'
-        _add_special_token(pad_token)
-        self._pad_id = self._vocab[pad_token]
-
-        bos_id = self.tokenizer.bos_id()
-        try:
-            bos_token = self.tokenizer.id_to_piece(bos_id)
-        except IndexError:
-            bos_token = '<BOS>'
-        _add_special_token(bos_token)
-        self._bos_id = self._vocab[bos_token]
-
-        eos_id = self.tokenizer.eos_id()
-        try:
-            eos_token = self.tokenizer.id_to_piece(eos_id)
-        except IndexError:
-            eos_token = '<EOS>'
-        _add_special_token(eos_token)
-        self._eos_id = self._vocab[eos_token]
-
-        for i in range(vocab_extra_ids):
-            t = "<extra_id_{}>".format(i)
-            _add_special_token(t)
-            self._t5_tokens += [t]
-
-    @property
-    def vocab_size(self):
-        return len(self._vocab)
-
-    @property
-    def vocab(self):
-        return self._vocab
-
-    @property
-    def inv_vocab(self):
-        return self._inv_vocab
-
-    @property
-    def decoder(self):
-        return self._inv_vocab
-
-    @property
-    def encoder(self):
-        return self._vocab
-
-    # From:
-    # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L89
-    def tokenize(self, text):
-        ids = []
-        idx = 0
-
-        while 1:
-            indices = {}
-            for token in self._special_tokens:
-                try:
-                    indices[token] = text[idx:].index(token)
-                except ValueError:
-                    continue
-            if len(indices) == 0:
-                break
-
-            next_token = min(indices, key=indices.get)
-            next_idx = idx + indices[next_token]
-
-            ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx]))
-            ids.append(self._special_tokens[next_token])
-            idx = next_idx + len(next_token)
-
-        ids.extend(self.tokenizer.encode_as_ids(text[idx:]))
-        return ids
-
-    # From:
-    # https://github.com/NVIDIA/NeMo/blob/c8fa217e811d60d11d014827c7f3845ff6c99ae7/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py#L125
-    def detokenize(self, ids):
-        text = ""
-        last_i = 0
-
-        for i, id in enumerate(ids):
-            if id in self._inv_special_tokens:
-                text += self.tokenizer.decode_ids(ids[last_i:i]) + " "
-                text += self._inv_special_tokens[id] + " "
-                last_i = i + 1
-
-        text += self.tokenizer.decode_ids(ids[last_i:])
-        return text
-
-    @property
-    def cls(self):
-        return self._cls_id
-
-    @property
-    def sep(self):
-        return self._sep_id
-
-    @property
-    def pad(self):
-        return self._pad_id
-
-    @property
-    def bos_token_id(self):
-        return self._bos_id
-
-    @property
-    def bos(self):
-        return self._bos_id
-
-    @property
-    def eod(self):
-        return self._eod_id
-
-    @property
-    def eos_token_id(self):
-        return self._eos_id
-
-    @property
-    def eos(self):
-        return self._eos_id
-
-    @property
-    def mask(self):
-        return self._mask_id
-
-    @property
-    def additional_special_tokens_ids(self):
-        return [self.vocab[k] for k in self._t5_tokens]
-
-class _MMSentencePieceTokenizer(_SentencePieceTokenizer):
-    """SentencePieceTokenizer-Megatron wrapper"""
-
-    def __init__(self, model_file, vocab_extra_ids=0):
-        super().__init__(model_file, vocab_extra_ids)
-
-
-    def _initalize(self, vocab_extra_ids):
-        self._populate_vocab()
-        self._special_tokens = {}
-        self._inv_special_tokens = {}
-
-        self._t5_tokens = []
-
-        def _add_special_token(t):
-            if t not in self._vocab:
-                next_id = len(self._vocab)
-                self._vocab[t] = next_id
-                self._inv_vocab[next_id] = t
-            self._special_tokens[t] = self._vocab[t]
-            self._inv_special_tokens[self._vocab[t]] = t
-
-        _add_special_token('<CLS>')
-        self._cls_id = self._vocab['<CLS>']
-        _add_special_token('<SEP>')
-        self._sep_id = self._vocab['<SEP>']
-        _add_special_token('<EOD>')
-        self._eod_id = self._vocab['<EOD>']
-        _add_special_token('<MASK>')
-        self._mask_id = self._vocab['<MASK>']
-
-        _add_special_token('<SOA>')
-        self._soa_id = self._vocab['<SOA>']
-        _add_special_token('<EOA>')
-        self._eoa_id = self._vocab['<EOA>']
-        _add_special_token('<SOV>')
-        self._sov_id = self._vocab['<SOV>']
-        _add_special_token('<EOV>')
-        self._eov_id = self._vocab['<EOV>']
-        _add_special_token('<SOI>')
-        self._soi_id = self._vocab['<SOI>']
-        _add_special_token('<EOI>')
-        self._eoi_id = self._vocab['<EOI>']
-        _add_special_token('<s_local>')
-        self._s_local_id = self._vocab['<s_local>']
-        _add_special_token('<e_local>')
-        self._e_local_id = self._vocab['<e_local>']
-        _add_special_token('<s_global>')
-        self._s_global_id = self._vocab['<s_global>']
-        _add_special_token('<e_global>')
-        self._e_global_id = self._vocab['<e_global>']
-        _add_special_token('<stage_1>')
-        self._stage_1_id = self._vocab['<stage_1>']
-        _add_special_token('<stage_2>')
-        self._stage_2_id = self._vocab['<stage_2>']
-        pad_id = self.tokenizer.pad_id()
-        try:
-            pad_token = self.tokenizer.id_to_piece(pad_id)
-        except IndexError:
-            pad_token = '<PAD>'
-        _add_special_token(pad_token)
-        self._pad_id = self._vocab[pad_token]
-
-        bos_id = self.tokenizer.bos_id()
-        try:
-            bos_token = self.tokenizer.id_to_piece(bos_id)
-        except IndexError:
-            bos_token = '<BOS>'
-        _add_special_token(bos_token)
-        self._bos_id = self._vocab[bos_token]
-
-        eos_id = self.tokenizer.eos_id()
-        try:
-            eos_token = self.tokenizer.id_to_piece(eos_id)
-        except IndexError:
-            eos_token = '<EOS>'
-        _add_special_token(eos_token)
-        self._eos_id = self._vocab[eos_token]
-
-        for i in range(vocab_extra_ids):
-            t = "<extra_id_{}>".format(i)
-            _add_special_token(t)
-            self._t5_tokens += [t]
-
-    @property
-    def soa(self):
-        return self._soa_id
-
-    @property
-    def eoa(self):
-        return self._eoa_id
-
-    @property
-    def sov(self):
-        return self._sov_id
-
-    @property
-    def eov(self):
-        return self._eov_id
-
-    @property
-    def soi(self):
-        return self._soi_id
-
-    @property
-    def eoi(self):
-        return self._eoi_id
-
-    @property
-    def s_local(self):
-        return self._s_local_id
-
-    @property
-    def e_local(self):
-        return self._e_local_id
-
-    @property
-    def s_global(self):
-        return self._s_global_id
-
-    @property
-    def e_global(self):
-        return self._e_global_id
-
-    @property
-    def stage_1(self):
-        return self._stage_1_id
-
-    @property
-    def stage_2(self):
-        return self._stage_2_id
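For context on what this deletion removes, here is a minimal usage sketch of the deleted _MMSentencePieceTokenizer, based only on the interface shown in the diff above. It is not part of the commit, and the model path "tokenizer.model" is a hypothetical placeholder for a SentencePiece model file.

# Assumed usage sketch (not from this commit): round-trip text containing
# the multimodal special tokens through the deleted tokenizer.
from mmtokenizer import _MMSentencePieceTokenizer

tokenizer = _MMSentencePieceTokenizer(model_file="tokenizer.model")  # hypothetical path

# tokenize() splices the registered special tokens (e.g. <SOA>/<EOA>) into the id
# stream and encodes the plain text between them with SentencePiece.
ids = tokenizer.tokenize("<SOA> la la la <EOA>")
print(ids[0] == tokenizer.soa, ids[-1] == tokenizer.eoa)  # expected: True True

# detokenize() decodes the SentencePiece ids and re-inserts the special tokens as text.
print(tokenizer.detokenize(ids))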