dleemiller committed
Commit 26997c7 · verified · 1 Parent(s): 25db288

Upload 9 files
CECorrelationEvaluator_sts-validation_results.csv ADDED
@@ -0,0 +1,17 @@
+ epoch,steps,Pearson_Correlation,Spearman_Correlation
+ 0,25,0.9189372391213726,0.9179337483079397
+ 0,50,0.9152543064665498,0.9194351367345442
+ 0,75,0.9153291132604612,0.9178149216577286
+ 0,-1,0.9188164812161956,0.9189697661626719
+ 1,25,0.9197733012671969,0.9202793789097345
+ 1,50,0.9188638378695348,0.9211696321589165
+ 1,75,0.9208501169893029,0.9211827194606879
+ 1,-1,0.9210947909286328,0.9210740121150552
+ 2,25,0.9198938387151563,0.9209662122505786
+ 2,50,0.9205063261555415,0.9205844883094307
+ 2,75,0.9184405810495602,0.9204992438001216
+ 2,-1,0.9206405648445563,0.9201900169271011
+ 3,25,0.9196077435063903,0.9199175910840248
+ 3,50,0.9188407593562442,0.9199236818840201
+ 3,75,0.9191514361050183,0.9200204389483886
+ 3,-1,0.9192619782893461,0.919999626418944
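
Note: these are the CECorrelationEvaluator scores logged on the STS validation split during training (in sentence-transformers, steps = -1 marks the end-of-epoch evaluation). Spearman peaks at roughly 0.921 in epoch 1 and stays essentially flat afterwards. A minimal sketch, assuming pandas is installed, for picking the strongest checkpoint from this file:

import pandas as pd

# Load the evaluator output written during CrossEncoder training.
df = pd.read_csv("CECorrelationEvaluator_sts-validation_results.csv")

# steps == -1 rows are the end-of-epoch evaluations.
best = df.sort_values("Spearman_Correlation", ascending=False).iloc[0]
print(f"best Spearman {best.Spearman_Correlation:.4f} at epoch {int(best.epoch)}, step {int(best.steps)}")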
config.json ADDED
@@ -0,0 +1,89 @@
+ {
+   "_name_or_path": "output/wiki-sim-binary",
+   "architectures": [
+     "NeoBERTForSequenceClassification"
+   ],
+   "auto_map": {
+     "AutoConfig": "model.NeoBERTConfig",
+     "AutoModel": "chandar-lab/NeoBERT--model.NeoBERT",
+     "AutoModelForMaskedLM": "chandar-lab/NeoBERT--model.NeoBERTLMHead",
+     "AutoModelForSequenceClassification": "chandar-lab/NeoBERT--model.NeoBERTForSequenceClassification"
+   },
+   "classifier_dropout": 0.3,
+   "classifier_init_range": 0.02,
+   "decoder_init_range": 0.02,
+   "dim_head": 64,
+   "embedding_init_range": 0.02,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "intermediate_size": 3072,
+   "kwargs": {
+     "_commit_hash": null,
+     "_name_or_path": "chandar-lab/NeoBERT",
+     "architectures": [
+       "NeoBERTForSequenceClassification"
+     ],
+     "attn_implementation": null,
+     "auto_map": {
+       "AutoConfig": "model.NeoBERTConfig",
+       "AutoModel": "chandar-lab/NeoBERT--model.NeoBERT",
+       "AutoModelForMaskedLM": "chandar-lab/NeoBERT--model.NeoBERTLMHead",
+       "AutoModelForSequenceClassification": "chandar-lab/NeoBERT--model.NeoBERTForSequenceClassification"
+     },
+     "classifier_dropout": 0.3,
+     "classifier_init_range": 0.02,
+     "dim_head": 64,
+     "id2label": {
+       "0": "LABEL_0"
+     },
+     "kwargs": {
+       "_commit_hash": "a4fbc49a61db10ff2db66140ae59c09d96c027f9",
+       "architectures": [
+         "NeoBERTLMHead"
+       ],
+       "attn_implementation": null,
+       "auto_map": {
+         "AutoConfig": "chandar-lab/NeoBERT--model.NeoBERTConfig",
+         "AutoModel": "chandar-lab/NeoBERT--model.NeoBERT",
+         "AutoModelForMaskedLM": "chandar-lab/NeoBERT--model.NeoBERTLMHead",
+         "AutoModelForSequenceClassification": "chandar-lab/NeoBERT--model.NeoBERTForSequenceClassification"
+       },
+       "classifier_init_range": 0.02,
+       "dim_head": 64,
+       "kwargs": {
+         "classifier_init_range": 0.02,
+         "pretrained_model_name_or_path": "google-bert/bert-base-uncased",
+         "trust_remote_code": true
+       },
+       "model_type": "neobert",
+       "pretrained_model_name_or_path": "google-bert/bert-base-uncased",
+       "torch_dtype": "float32",
+       "transformers_version": "4.48.2",
+       "trust_remote_code": true
+     },
+     "label2id": {
+       "LABEL_0": 0
+     },
+     "model_type": "neobert",
+     "pretrained_model_name_or_path": "google-bert/bert-base-uncased",
+     "torch_dtype": "float32",
+     "transformers_version": "4.49.0",
+     "trust_remote_code": true
+   },
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "max_length": 4096,
+   "model_type": "neobert",
+   "norm_eps": 1e-05,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 28,
+   "pad_token_id": 0,
+   "pretrained_model_name_or_path": "google-bert/bert-base-uncased",
+   "torch_dtype": "float32",
+   "transformers_version": "4.49.0",
+   "trust_remote_code": true,
+   "vocab_size": 30522
+ }
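
Note: the auto_map above points the AutoModel* classes at remote NeoBERT code, and id2label contains a single LABEL_0 entry, so the classification head produces one logit. A minimal loading sketch, assuming the files from this commit sit in a local directory ./checkpoint (the published repo id is not shown in this diff):

from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoint")
model = AutoModelForSequenceClassification.from_pretrained("./checkpoint", trust_remote_code=True)

# Sanity-check a few values against config.json above.
print(model.config.num_hidden_layers, model.config.hidden_size, model.config.max_length)  # 28 768 4096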
model.py ADDED
@@ -0,0 +1,421 @@
+ # From https://github.com/facebookresearch/llama/blob/main/llama/model.py
+
+ import torch
+ from torch import nn
+
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+ from torch.nn.functional import scaled_dot_product_attention
+
+ from typing import Optional
+ import numpy as np
+
+ from xformers.ops import SwiGLU
+
+ try:
+     from flash_attn.flash_attn_interface import flash_attn_varlen_func
+
+     FLASH_ATTN_AVAILABLE = True
+ except ImportError:
+     FLASH_ATTN_AVAILABLE = False
+
+ from transformers import (
+     PreTrainedModel,
+     PretrainedConfig,
+     DataCollatorForLanguageModeling,
+ )
+ from transformers.modeling_outputs import (
+     BaseModelOutput,
+     MaskedLMOutput,
+     SequenceClassifierOutput,
+ )
+
+ from .rotary import precompute_freqs_cis, apply_rotary_emb
+
+
+ class DataCollatorWithPacking(DataCollatorForLanguageModeling):
+     def __init__(self, pack_sequences=False, **kwargs):
+         super().__init__(**kwargs)
+         self.pack_sequences = pack_sequences
+
+     def __call__(self, batch):
+         if self.pack_sequences:
+             # Add position_ids if not present
+             if "position_ids" not in batch[0]:
+                 for item in batch:
+                     item["position_ids"] = list(range(len(item["input_ids"])))
+
+             # Pack the sequences into a single list
+             input_ids_list = [item["input_ids"] for item in batch]
+             position_ids_list = [item["position_ids"] for item in batch]
+             seqlens = np.array([0] + [len(ids) for ids in input_ids_list])
+
+             packed_batch = {
+                 "position_ids": np.concatenate(position_ids_list, axis=0),
+                 "input_ids": np.concatenate(input_ids_list, axis=0),
+                 "cu_seqlens": np.cumsum(seqlens),
+                 "max_seqlen": max(seqlens),
+             }
+
+             batch = super().__call__([packed_batch])
+             batch["cu_seqlens"] = batch["cu_seqlens"].to(torch.int32).squeeze()
+         else:
+             batch = super().__call__(batch)
+             batch["attention_mask"] = batch["attention_mask"].to(torch.bool)
+
+         return batch
+
+
+ class NeoBERTConfig(PretrainedConfig):
+     model_type = "neobert"
+
+     # All config parameters must have a default value.
+     def __init__(
+         self,
+         hidden_size: int = 768,
+         num_hidden_layers: int = 28,
+         num_attention_heads: int = 12,
+         intermediate_size: int = 3072,
+         embedding_init_range: float = 0.02,
+         decoder_init_range: float = 0.02,
+         norm_eps: float = 1e-06,
+         vocab_size: int = 30522,
+         pad_token_id: int = 0,
+         max_length: int = 1024,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         if hidden_size % num_attention_heads != 0:
+             raise ValueError("Hidden size must be divisible by the number of heads.")
+         self.dim_head = hidden_size // num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.embedding_init_range = embedding_init_range
+         self.decoder_init_range = decoder_init_range
+         self.norm_eps = norm_eps
+         self.vocab_size = vocab_size
+         self.pad_token_id = pad_token_id
+         self.max_length = max_length
+         self.kwargs = kwargs
+
+
+ class EncoderBlock(nn.Module):
+     """Transformer encoder block."""
+
+     def __init__(self, config: NeoBERTConfig):
+         super().__init__()
+
+         self.config = config
+
+         # Attention
+         self.qkv = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size * 3, bias=False)
+         self.wo = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=False)
+
+         # Feedforward network
+         multiple_of = 8
+         intermediate_size = int(2 * config.intermediate_size / 3)
+         intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
+         self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=False)
+
+         # Layer norms
+         self.attention_norm = nn.RMSNorm(config.hidden_size, config.norm_eps)
+         self.ffn_norm = nn.RMSNorm(config.hidden_size, config.norm_eps)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         attention_mask: torch.Tensor,
+         freqs_cis: torch.Tensor,
+         output_attentions: bool,
+         max_seqlen: int = None,
+         cu_seqlens: torch.Tensor = None,
+     ):
+         # Attention
+         attn_output, attn_weights = self._att_block(
+             self.attention_norm(x), attention_mask, freqs_cis, output_attentions, max_seqlen, cu_seqlens
+         )
+
+         # Residual
+         x = x + attn_output
+
+         # Feed-forward
+         x = x + self.ffn(self.ffn_norm(x))
+
+         return x, attn_weights
+
+     def _att_block(
+         self,
+         x: torch.Tensor,
+         attention_mask: torch.Tensor,
+         freqs_cis: torch.Tensor,
+         output_attentions: bool,
+         max_seqlen: int = None,
+         cu_seqlens: torch.Tensor = None,
+     ):
+         batch_size, seq_len, _ = x.shape
+
+         xq, xk, xv = self.qkv(x).view(batch_size, seq_len, self.config.num_attention_heads, self.config.dim_head * 3).chunk(3, axis=-1)
+
+         xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
+
+         # Attn block
+         attn_weights = None
+
+         # Flash attention if the tensors are packed
+         if cu_seqlens is not None:
+             attn = flash_attn_varlen_func(
+                 q=xq.squeeze(0),
+                 k=xk.squeeze(0),
+                 v=xv.squeeze(0),
+                 cu_seqlens_q=cu_seqlens,
+                 cu_seqlens_k=cu_seqlens,
+                 max_seqlen_q=max_seqlen,
+                 max_seqlen_k=max_seqlen,
+                 dropout_p=0.0,
+                 causal=False,
+             )
+         # Eager attention if attention weights are needed in the output
+         elif output_attentions:
+             attn_weights = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
+             if attention_mask is not None:
+                 attn_weights = attn_weights * attention_mask
+             attn_weights = attn_weights.softmax(-1)
+             attn = attn_weights @ xv.permute(0, 2, 1, 3)
+             attn = attn.transpose(1, 2)
+         # Fall back to SDPA otherwise
+         else:
+             attn = scaled_dot_product_attention(
+                 query=xq.transpose(1, 2),
+                 key=xk.transpose(1, 2),
+                 value=xv.transpose(1, 2),
+                 attn_mask=attention_mask.bool(),
+                 dropout_p=0,
+             ).transpose(1, 2)
+
+         return self.wo(attn.reshape(batch_size, seq_len, self.config.num_attention_heads * self.config.dim_head)), attn_weights
+
+
+ class NeoBERTPreTrainedModel(PreTrainedModel):
+     config_class = NeoBERTConfig
+     base_model_prefix = "model"
+     _supports_cache_class = True
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             module.weight.data.uniform_(-self.config.decoder_init_range, self.config.decoder_init_range)
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.uniform_(-self.config.embedding_init_range, self.config.embedding_init_range)
+
+
+ class NeoBERT(NeoBERTPreTrainedModel):
+     config_class = NeoBERTConfig
+
+     def __init__(self, config: NeoBERTConfig):
+         super().__init__(config)
+
+         self.config = config
+
+         self.encoder = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+
+         # Ensures freqs_cis is moved to the same devices as the model. Non-persistent buffers are not saved in the state_dict.
+         freqs_cis = precompute_freqs_cis(config.hidden_size // config.num_attention_heads, config.max_length)
+         self.register_buffer("freqs_cis", freqs_cis, persistent=False)
+
+         self.transformer_encoder = nn.ModuleList()
+         for _ in range(config.num_hidden_layers):
+             self.transformer_encoder.append(EncoderBlock(config))
+
+         self.layer_norm = nn.RMSNorm(config.hidden_size, config.norm_eps)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         position_ids: torch.Tensor = None,
+         max_seqlen: int = None,
+         cu_seqlens: torch.Tensor = None,
+         attention_mask: torch.Tensor = None,
+         output_hidden_states: bool = False,
+         output_attentions: bool = False,
+         **kwargs,
+     ):
+         # Initialize
+         hidden_states, attentions = [], []
+
+         # Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
+         if attention_mask is not None:
+             attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, attention_mask.size(-1), 1)
+
+         # Checks to be done if inputs are packed sequences
+         if cu_seqlens is not None:
+             assert (
+                 FLASH_ATTN_AVAILABLE
+             ), "Flash-attention is not available. Please ''pip install flash_attn'', or provide un-packed sequences."
+             assert not output_attentions, "Output attentions is not supported when sequences are packed."
+             assert max_seqlen is not None, "Missing max_seqlen. It must be provided when cu_seqlens are not None."
+             assert input_ids.shape[0] == 1, "Cumulative sequence lengths are provided but input_ids are not packed."
+             assert input_ids.is_cuda, "Packing uses an implementation of flash-attention and is only supported on GPU."
+
+         # RoPE
+         freqs_cis = self.freqs_cis[position_ids] if position_ids is not None else self.freqs_cis[: input_ids.shape[1]].unsqueeze(0)
+
+         # Embedding
+         x = self.encoder(input_ids)
+
+         # Transformer encoder
+         for layer in self.transformer_encoder:
+             x, attn = layer(x, attention_mask, freqs_cis, output_attentions, max_seqlen, cu_seqlens)
+             if output_hidden_states:
+                 hidden_states.append(x)
+             if output_attentions:
+                 attentions.append(attn)
+
+         # Final normalization layer
+         x = self.layer_norm(x)
+
+         # Return the output of the last hidden layer
+         return BaseModelOutput(
+             last_hidden_state=x,
+             hidden_states=hidden_states if output_hidden_states else None,
+             attentions=attentions if output_attentions else None,
+         )
+
+
+ class NeoBERTLMHead(NeoBERTPreTrainedModel):
+     config_class = NeoBERTConfig
+
+     def __init__(self, config: NeoBERTConfig):
+         super().__init__(config)
+
+         self.config = config
+
+         self.model = NeoBERT(config)
+         self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         position_ids: torch.Tensor = None,
+         max_seqlen: int = None,
+         cu_seqlens: torch.Tensor = None,
+         attention_mask: torch.Tensor = None,
+         output_hidden_states: bool = False,
+         output_attentions: bool = False,
+         **kwargs,
+     ):
+
+         output = self.model.forward(
+             input_ids,
+             position_ids,
+             max_seqlen,
+             cu_seqlens,
+             attention_mask,
+             output_hidden_states,
+             output_attentions,
+         )
+         logits = self.decoder(output.last_hidden_state)
+
+         return MaskedLMOutput(
+             hidden_states=output.hidden_states if output_hidden_states else None,
+             attentions=output.attentions if output_attentions else None,
+             logits=logits,
+         )
+
+
+ class NeoBERTForSequenceClassification(NeoBERTPreTrainedModel):
+     config_class = NeoBERTConfig
+
+     def __init__(self, config: NeoBERTConfig):
+         super().__init__(config)
+
+         self.config = config
+
+         self.num_labels = getattr(config, "num_labels", 2)
+         self.classifier_dropout = getattr(config, "classifier_dropout", 0.1)
+         self.classifier_init_range = getattr(config, "classifier_init_range", 0.02)
+
+         self.model = NeoBERT(config)
+
+         self.dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
+         self.dropout = nn.Dropout(self.classifier_dropout)
+         self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
+
+         self.post_init()
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=self.classifier_init_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         position_ids: torch.Tensor = None,
+         max_seqlen: int = None,
+         cu_seqlens: torch.Tensor = None,
+         attention_mask: torch.Tensor = None,
+         output_hidden_states: bool = False,
+         output_attentions: bool = False,
+         labels: Optional[torch.Tensor] = None,
+         return_dict: Optional[bool] = None,
+     ):
+
+         output = self.model.forward(
+             input_ids,
+             position_ids,
+             max_seqlen,
+             cu_seqlens,
+             attention_mask,
+             output_hidden_states,
+             output_attentions,
+         )
+         hidden_states = output.last_hidden_state
+
+         x = hidden_states[:, 0, :]
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+
+         logits = self.classifier(x)
+
+         loss = None
+         if labels is not None:
+             if self.config.problem_type is None:
+                 if self.num_labels == 1:
+                     self.config.problem_type = "regression"
+                 elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                     self.config.problem_type = "single_label_classification"
+                 else:
+                     self.config.problem_type = "multi_label_classification"
+
+             if self.config.problem_type == "regression":
+                 loss_fct = MSELoss()
+                 if self.num_labels == 1:
+                     loss = loss_fct(logits.squeeze(), labels.squeeze())
+                 else:
+                     loss = loss_fct(logits, labels)
+             elif self.config.problem_type == "single_label_classification":
+                 loss_fct = CrossEntropyLoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+             elif self.config.problem_type == "multi_label_classification":
+                 loss_fct = BCEWithLogitsLoss()
+                 loss = loss_fct(logits, labels)
+
+         if not return_dict:
+             result = (logits,)
+             return ((loss,) + result) if loss is not None else result
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=output.hidden_states if output_hidden_states else None,
+             attentions=output.attentions if output_attentions else None,
+         )
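
Note: NeoBERTForSequenceClassification takes the [CLS] position of the final hidden state, applies dropout → dense → tanh → dropout, and projects to num_labels logits; with the single-label config above it behaves as a cross-encoder style scoring head. A usage sketch, assuming the checkpoint lives in a local ./checkpoint directory and reading the single logit as a similarity score (note the forward above returns a tuple unless return_dict=True is passed):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoint")
model = AutoModelForSequenceClassification.from_pretrained("./checkpoint", trust_remote_code=True).eval()

# Score one sentence pair; the tokenizer joins the pair with [SEP] for the cross-encoder.
enc = tokenizer("A man is playing a guitar.", "Someone plays an instrument.", return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**enc, return_dict=True).logits
print(logits.squeeze().item())  # raw similarity logit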
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a2c25ade63d4f04cf9e9dcfcc9e8e787ee278b5a365b8bfb10b748b484ff12c8
+ size 889047508
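
Note: model.safetensors is stored through Git LFS; the pointer above records only the sha256 and the ~889 MB size. A small sketch, assuming the safetensors package and a fully downloaded checkpoint, for listing the stored tensors without instantiating the model:

from safetensors import safe_open

with safe_open("./checkpoint/model.safetensors", framework="pt") as f:
    names = list(f.keys())
print(len(names), names[:3])  # tensor count and a few example weight names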
rotary.py ADDED
@@ -0,0 +1,61 @@
+ # From https://github.com/facebookresearch/llama/blob/main/llama/model.py
+
+ import torch
+ from typing import Tuple
+
+
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+     """
+     Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
+
+     This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
+     and the end index 'end'. The 'theta' parameter scales the frequencies.
+     The returned tensor contains complex values in complex64 data type.
+
+     Args:
+         dim (int): Dimension of the frequency tensor.
+         end (int): End index for precomputing frequencies.
+         theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
+
+     Returns:
+         torch.Tensor: Precomputed frequency tensor with complex exponentials.
+     """
+
+     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+     t = torch.arange(end, device=freqs.device)
+     freqs = torch.outer(t, freqs).float()
+     return torch.polar(torch.ones_like(freqs), freqs)
+
+
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+     assert freqs_cis.shape[1:] == (x.shape[1], x.shape[-1])
+     return freqs_cis.contiguous().unsqueeze(2)
+
+
+ def apply_rotary_emb(
+     xq: torch.Tensor,
+     xk: torch.Tensor,
+     freqs_cis: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Apply rotary embeddings to input tensors using the given frequency tensor.
+
+     This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
+     frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
+     is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
+     returned as real tensors.
+
+     Args:
+         xq (torch.Tensor): Query tensor to apply rotary embeddings.
+         xk (torch.Tensor): Key tensor to apply rotary embeddings.
+         freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.
+
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+     """
+     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+     freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
+     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+     return xq_out.type_as(xq), xk_out.type_as(xk)
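
Note: apply_rotary_emb rotates each (even, odd) channel pair of the queries and keys by a position-dependent angle, so it changes direction but not magnitude. A small self-contained check of that property under the shapes NeoBERT uses, (batch, seq_len, heads, dim_head); the sizes below are arbitrary and the import assumes rotary.py is on the path:

import torch
from rotary import precompute_freqs_cis, apply_rotary_emb

batch, seq_len, heads, dim_head = 2, 16, 12, 64
freqs_cis = precompute_freqs_cis(dim_head, seq_len).unsqueeze(0)  # (1, seq_len, dim_head // 2)

xq = torch.randn(batch, seq_len, heads, dim_head)
xk = torch.randn(batch, seq_len, heads, dim_head)
xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis)

# Rotation preserves the norm of every query/key vector and leaves the shapes unchanged.
assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape
assert torch.allclose(xq.norm(dim=-1), xq_rot.norm(dim=-1), atol=1e-4)
print("rotary check passed")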
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,68 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "max_length": 304,
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 4096,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]",
+   "vocab_size": 30522
+ }
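
Note: the tokenizer is a plain BertTokenizer (lowercased, 30522-token vocab) with model_max_length 4096 and model_input_names restricted to input_ids and attention_mask, which matches what NeoBERT.forward consumes. A short sketch, assuming the tokenizer files are in a local ./checkpoint directory:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoint")
enc = tokenizer(
    "A man is playing a guitar.",
    "Someone plays an instrument.",
    truncation=True,
    max_length=304,  # the default max_length recorded above
    return_tensors="pt",
)
print(list(enc.keys()))                       # ['input_ids', 'attention_mask']
print(tokenizer.decode(enc["input_ids"][0]))  # [CLS] ... [SEP] ... [SEP]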
vocab.txt ADDED
The diff for this file is too large to render. See raw diff