Commit be3a39d by imdatta0 (0 parents)

Add Sherlock Pretrained transformer
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
ReadME.md ADDED
@@ -0,0 +1,48 @@
+ # SPT: A Lightweight Language Model
+
+ SPT (Sherlock Pretrained Transformer) is a compact language model trained on Sherlock Holmes stories.
+
+ ## Model Details
+
+ - **Model Type**: SPT (Causal Language Model)
+ - **Number of Layers**: 12
+ - **Hidden Size**: 512
+ - **Number of Attention Heads**: 16
+ - **Number of KV Heads**: 16
+ - **Intermediate Size**: 2048
+ - **Maximum Sequence Length**: 2048
+ - **Vocabulary Size**: 97 (including special tokens)
+
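+ These hyperparameters mirror the defaults in `configuration_spt.py`. As a minimal sketch (assuming the repository has been cloned into an importable package directory, called `spt` here purely for illustration), they map onto the model classes like this:
+
+ ```python
+ from spt import SPTConfig, SPTForCausalLM  # hypothetical local package name
+
+ config = SPTConfig(
+     vocab_size=97,
+     hidden_size=512,
+     n_layers=12,
+     n_attn_heads=16,
+     n_kv_heads=16,
+     intermediate_size=2048,
+     max_len=2048,
+ )
+ model = SPTForCausalLM(config)  # randomly initialised; use from_pretrained() for the trained weights
+ ```
+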
+ ## Usage
+
+ You can use this model with the Hugging Face Transformers library:
+
+ ```python
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ tokenizer = AutoTokenizer.from_pretrained("imdatta0/spt")
+ model = AutoModelForCausalLM.from_pretrained("imdatta0/spt")
+
+ # Generate text
+ input_text = "Sherlock and I were "
+ input_ids = tokenizer(input_text, return_tensors="pt").input_ids
+ output = model.generate(input_ids, max_length=50, num_return_sequences=1)
+
+ generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+ print(generated_text)
+ ```
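+
+ Note that SPT is a custom architecture, so the `Auto*` classes can only resolve it when the custom code is available (for instance by importing this repository's `__init__.py`, which registers `SPTConfig` and `SPTForCausalLM`). If the modeling code is instead meant to be fetched from the Hub, loading may additionally require opting in to remote code (a hedged sketch, not a confirmed requirement of this repo):
+
+ ```python
+ model = AutoModelForCausalLM.from_pretrained("imdatta0/spt", trust_remote_code=True)
+ ```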
+
+ ## Training
+
+ This model was trained on the Sherlock Holmes stories on a single A100, with a batch size of 2 and 32 gradient accumulation steps (an effective batch size of 64). It was trained on character sequences of length 1024 for 10,000 steps.
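+
+ As a rough illustration of that setup (this is not the actual training script, which is not part of this repository; the optimiser, learning rate, and data loading are assumptions), gradient accumulation looks like this:
+
+ ```python
+ import torch
+ from spt import SPTConfig, SPTForCausalLM  # hypothetical local package, as above
+
+ model = SPTForCausalLM(SPTConfig())
+ optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)  # learning rate is an assumption
+ accum_steps = 32  # 2 sequences per micro-batch x 32 accumulation steps = effective batch of 64
+
+ for step in range(2 * accum_steps):             # a couple of optimiser steps; training ran 10,000
+     batch = torch.randint(0, 97, (2, 1024))     # stand-in for real 1024-character sequences
+     loss = model(batch, labels=batch).loss
+     (loss / accum_steps).backward()             # scale so gradients average over the 64 sequences
+     if (step + 1) % accum_steps == 0:
+         optimizer.step()
+         optimizer.zero_grad()
+ ```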
+
+ ## Limitations
+
+ - The model has a limited vocabulary of 97 tokens, which may affect its performance on certain tasks or domains.
+ - The maximum sequence length is 2048 tokens, which may not be sufficient for very long text generation tasks.
+
+ ## Acknowledgements
+
+ - Thanks to Andrej Karpathy for his excellent videos on how to train GPT from scratch
+ - Thanks to Sir Arthur Conan Doyle for the amazing stories :)
__init__.py ADDED
@@ -0,0 +1,6 @@
+ from transformers import AutoConfig, AutoModelForCausalLM
+ from .configuration_spt import SPTConfig
+ from .modeling_spt import SPTForCausalLM
+
+ AutoConfig.register("spt", SPTConfig)
+ AutoModelForCausalLM.register(SPTConfig, SPTForCausalLM)
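+
+ # Usage note (illustrative sketch, not executed here): once this package is imported,
+ # the registrations above let the Auto classes resolve the "spt" model type, e.g.:
+ #
+ #     config = SPTConfig()
+ #     model = AutoModelForCausalLM.from_config(config)  # builds an SPTForCausalLM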
config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "architectures": ["SPTForCausalLM"],
+   "model_type": "spt",
+   "n_layers": 12,
+   "vocab_size": 97,
+   "hidden_size": 512,
+   "n_attn_heads": 16,
+   "n_kv_heads": 16,
+   "intermediate_size": 2048,
+   "max_len": 2048,
+   "residual": true,
+   "normalise": true,
+   "bos_token_id": 95,
+   "eos_token_id": 95,
+   "pad_token_id": 95,
+   "unk_token_id": 96
+ }
configuration_spt.py ADDED
@@ -0,0 +1,39 @@
+ from transformers import PretrainedConfig, AutoConfig
+
+ class SPTConfig(PretrainedConfig):
+     model_type = "spt"
+
+     def __init__(
+         self,
+         vocab_size=97,
+         hidden_size=512,
+         n_layers=12,
+         n_attn_heads=16,
+         n_kv_heads=16,
+         intermediate_size=2048,
+         max_len=2048,
+         residual=True,
+         normalise=True,
+         pad_token_id=95,
+         bos_token_id=95,
+         eos_token_id=95,
+         **kwargs
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.n_layers = n_layers
+         self.n_attn_heads = n_attn_heads
+         self.n_kv_heads = n_kv_heads
+         self.intermediate_size = intermediate_size
+         self.max_len = max_len
+         self.residual = residual
+         self.normalise = normalise
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             **kwargs
+         )
+
+ AutoConfig.register("spt", SPTConfig)
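+
+ if __name__ == "__main__":
+     # Minimal round-trip sketch (illustrative; the output directory name is arbitrary):
+     # serialise the default config and reload it via AutoConfig, which resolves the
+     # "spt" model type through the registration above.
+     cfg = SPTConfig()
+     cfg.save_pretrained("./spt-config-demo")
+     reloaded = AutoConfig.from_pretrained("./spt-config-demo")
+     print(type(reloaded).__name__, reloaded.hidden_size, reloaded.n_layers)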
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c76a9147a43dca2c005df5255d1f075b140020986f3526221f959277751cd47
+ size 390660192
modeling_spt.py ADDED
@@ -0,0 +1,143 @@
+ import math
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel, AutoModelForCausalLM
+ from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
+ from .configuration_spt import SPTConfig
+
+ def repeat_kv(hidden_states, repeat_times):
+     # Repeat each KV head so grouped-query attention lines up with the query heads.
+     if repeat_times == 1:
+         return hidden_states
+     batch, n_kv_heads, seq_len, head_dim = hidden_states.shape
+     hidden_states = hidden_states[:, :, None, :, :].expand(batch, n_kv_heads, repeat_times, seq_len, head_dim)
+     return hidden_states.reshape(batch, n_kv_heads * repeat_times, seq_len, head_dim)
+
+ class RMSNorm(nn.Module):
+     def __init__(self, hidden_size, eps=1e-6):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(hidden_size))
+         self.variance_epsilon = eps
+
+     def forward(self, hidden_states):
+         input_dtype = hidden_states.dtype
+         hidden_states = hidden_states.to(torch.float32)
+         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+         return self.weight * hidden_states.to(input_dtype)
+
+ class Attention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.head_dim = config.hidden_size // config.n_attn_heads
+         kv_size = config.n_kv_heads * self.head_dim
+         self.hidden_size = config.hidden_size
+         self.n_attn_heads = config.n_attn_heads
+         self.n_kv_heads = config.n_kv_heads
+
+         self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+         self.k = nn.Linear(config.hidden_size, kv_size, bias=False)
+         self.v = nn.Linear(config.hidden_size, kv_size, bias=False)
+
+         # Lower-triangular causal mask, pre-built up to the maximum sequence length.
+         self.register_buffer('tril', torch.tril(torch.ones(config.max_len, config.max_len)).view(1, 1, config.max_len, config.max_len))
+
+     def forward(self, x):
+         batch_size, seq_len, hidden_dim = x.shape
+
+         q = self.q(x)
+         k = self.k(x)
+         v = self.v(x)
+
+         q = q.view(batch_size, seq_len, self.n_attn_heads, self.head_dim).transpose(1, 2)
+         k = k.view(batch_size, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
+         v = v.view(batch_size, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
+
+         k = repeat_kv(k, self.n_attn_heads // self.n_kv_heads)
+         v = repeat_kv(v, self.n_attn_heads // self.n_kv_heads)
+
+         # Scaled dot-product attention (note: the scale uses the full hidden size, not the per-head dim).
+         attention = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.hidden_size))
+         attention = attention.masked_fill(self.tril[:, :, :seq_len, :seq_len] == 0, float('-inf'))
+         probs = nn.functional.softmax(attention, dim=-1)
+         y = probs @ v
+         y = y.transpose(1, 2).contiguous().reshape(batch_size, seq_len, -1)
+
+         return y
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.up = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.gate = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.down = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+         self.act_fn = nn.GELU()
+
+     def forward(self, x):
+         # Gated MLP: down(GELU(up(x) * gate(x)))
+         up = self.up(x)
+         gate = self.gate(x)
+         return self.down(self.act_fn(up * gate))
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.attn = Attention(config)
+         self.mlp = MLP(config)
+         self.residual = config.residual
+         # A single norm module is shared by the attention and MLP sub-blocks.
+         self.norm = RMSNorm(config.hidden_size) if config.normalise else nn.Identity()
+
+     def forward(self, x):
+         if self.residual:
+             x = x + self.attn(self.norm(x))
+             x = x + self.mlp(self.norm(x))
+         else:
+             x = self.attn(self.norm(x))
+             x = self.mlp(self.norm(x))
+         return x
+
+ class SPTModel(PreTrainedModel):
+     config_class = SPTConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.layers = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
+         self.norm = RMSNorm(config.hidden_size) if config.normalise else nn.Identity()
+
+     def forward(self, input_ids):
+         x = self.embedding(input_ids)
+         for layer in self.layers:
+             x = layer(x)
+         x = self.norm(x)
+         return x
+
+ class SPTForCausalLM(PreTrainedModel):
+     config_class = SPTConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.model = SPTModel(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+     def forward(self, input_ids, labels=None):
+         x = self.model(input_ids)
+         logits = self.lm_head(x)
+
+         loss = None
+         if labels is not None:
+             # Token-level cross-entropy; labels are used as-is (no shifting inside the model).
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+         return CausalLMOutputWithCrossAttentions(
+             loss=loss,
+             logits=logits,
+             hidden_states=x,
+         )
+
+     def prepare_inputs_for_generation(self, input_ids, **kwargs):
+         return {"input_ids": input_ids}
+
+     @staticmethod
+     def _reorder_cache(past, beam_idx):
+         return past
+
+ # Register the custom model
+ AutoModelForCausalLM.register(SPTConfig, SPTForCausalLM)
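+
+ if __name__ == "__main__":
+     # Smoke-test sketch (illustrative). Because of the relative import above, run it as a
+     # module from the parent directory, e.g. `python -m <package>.modeling_spt`.
+     config = SPTConfig()
+     model = SPTForCausalLM(config)
+     dummy_ids = torch.randint(0, config.vocab_size, (2, 16))  # (batch, seq_len)
+     out = model(dummy_ids, labels=dummy_ids)
+     print(out.logits.shape)  # expected: torch.Size([2, 16, 97])
+     print(out.loss)          # cross-entropy of the randomly initialised model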
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "pad_token": "#",
+   "bos_token": "#",
+   "eos_token": "#",
+   "unk_token": "[UNK]"
+ }
tokenization_spt.py ADDED
@@ -0,0 +1,167 @@
+ import json
+ import os
+ from typing import List, Optional, Tuple
+
+ from transformers import PreTrainedTokenizer
+
+ class SPTTokenizer(PreTrainedTokenizer):
+     def __init__(self, vocab_file=None, **kwargs):
+         # The vocabulary must exist before PreTrainedTokenizer.__init__, which may query it.
+         self.vocab = self.load_vocab(vocab_file)
+         self.inv_vocab = {v: k for k, v in self.vocab.items()}
+         super().__init__(**kwargs)
+         self.pad_token = self.eos_token = "#"
+         self.unk_token = "[UNK]"
+
+     @property
+     def vocab_size(self):
+         return len(self.vocab)
+
+     def get_vocab(self):
+         return dict(self.vocab)
+
+     def _tokenize(self, text):
+         # Character-level tokenisation: one token per character.
+         return list(text)
+
+     def _convert_token_to_id(self, token):
+         return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+     def _convert_id_to_token(self, index):
+         return self.inv_vocab.get(index, self.unk_token)
+
+     def convert_tokens_to_string(self, tokens):
+         return ''.join(tokens)
+
+     def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
+         if token_ids_1 is None:
+             return token_ids_0 + [self.eos_token_id]
+         return token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+     def get_special_tokens_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False) -> List[int]:
+         if already_has_special_tokens:
+             return [1 if token in [self.eos_token_id] else 0 for token in token_ids_0]
+         if token_ids_1 is None:
+             return [0] * len(token_ids_0) + [1]
+         return [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
+
+     def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
+         if token_ids_1 is None:
+             return [0] * (len(token_ids_0) + 1)
+         return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 1)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
+         tokenizer = super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
+         return tokenizer
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             os.mkdir(save_directory)
+
+         vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
+         )
+
+         with open(vocab_file, "w", encoding="utf-8") as f:
+             f.write(json.dumps(self.vocab, ensure_ascii=False))
+
+         return (vocab_file,)
+
+     def load_vocab(self, vocab_file):
+         if vocab_file is None:
+             # Built-in fallback character vocabulary (note: it does not contain the
+             # "#" / "[UNK]" special tokens, which live in the saved vocab/tokenizer files).
+             return {
+                 '\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '(': 6, ')': 7, '*': 8, ',': 9,
+                 '-': 10, '.': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19,
+                 '8': 20, '9': 21, ':': 22, ';': 23, '?': 24, 'A': 25, 'B': 26, 'C': 27, 'D': 28, 'E': 29,
+                 'F': 30, 'G': 31, 'H': 32, 'I': 33, 'J': 34, 'K': 35, 'L': 36, 'M': 37, 'N': 38, 'O': 39,
+                 'P': 40, 'Q': 41, 'R': 42, 'S': 43, 'T': 44, 'U': 45, 'V': 46, 'W': 47, 'X': 48, 'Y': 49,
+                 'Z': 50, '[': 51, ']': 52, '`': 53, 'a': 54, 'b': 55, 'c': 56, 'd': 57, 'e': 58, 'f': 59,
+                 'g': 60, 'h': 61, 'i': 62, 'j': 63, 'k': 64, 'l': 65, 'm': 66, 'n': 67, 'o': 68, 'p': 69,
+                 'q': 70, 'r': 71, 's': 72, 't': 73, 'u': 74, 'v': 75, 'w': 76, 'x': 77, 'y': 78, 'z': 79,
+                 '£': 80, '°': 81, 'ß': 82, 'à': 83, 'â': 84, 'è': 85, 'é': 86, 'ê': 87, 'î': 88, 'ñ': 89,
+                 'ô': 90, 'ö': 91, 'û': 92, 'ü': 93,
+             }
+         else:
+             with open(vocab_file, 'r', encoding='utf-8') as f:
+                 return json.load(f)
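+
+ if __name__ == "__main__":
+     # Round-trip sketch (illustrative): character-level tokenisation using the
+     # built-in fallback vocabulary (i.e. no vocab file passed in).
+     tok = SPTTokenizer()
+     tokens = tok.tokenize("Sherlock Holmes")            # one token per character
+     ids = tok.convert_tokens_to_ids(tokens)
+     print(ids)
+     print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(ids)))  # Sherlock Holmes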
tokenizer.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [
+     {
+       "id": 95,
+       "special": true,
+       "content": "#",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false
+     },
+     {
+       "id": 96,
+       "special": true,
+       "content": "[UNK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false
+     }
+   ],
+   "normalizer": null,
+   "pre_tokenizer": null,
+   "post_processor": null,
+   "decoder": null,
+   "model": {
+     "type": "BPE",
+     "vocab": {
+       "\n": 0, " ": 1, "!": 2, "\"": 3, "&": 4, "'": 5, "(": 6, ")": 7, "*": 8, ",": 9, "-": 10, ".": 11, "0": 12, "1": 13, "2": 14, "3": 15, "4": 16, "5": 17, "6": 18, "7": 19, "8": 20, "9": 21, ":": 22, ";": 23, "?": 24, "A": 25, "B": 26, "C": 27, "D": 28, "E": 29, "F": 30, "G": 31, "H": 32, "I": 33, "J": 34, "K": 35, "L": 36, "M": 37, "N": 38, "O": 39, "P": 40, "Q": 41, "R": 42, "S": 43, "T": 44, "U": 45, "V": 46, "W": 47, "X": 48, "Y": 49, "Z": 50, "[": 51, "]": 52, "`": 53, "a": 54, "b": 55, "c": 56, "d": 57, "e": 58, "f": 59, "g": 60, "h": 61, "i": 62, "j": 63, "k": 64, "l": 65, "m": 66, "n": 67, "o": 68, "p": 69, "q": 70, "r": 71, "s": 72, "t": 73, "u": 74, "v": 75, "w": 76, "x": 77, "y": 78, "z": 79, "£": 80, "°": 81, "ß": 82, "à": 83, "â": 84, "è": 85, "é": 86, "ê": 87, "î": 88, "ñ": 89, "ô": 90, "ö": 91, "û": 92, "ü": 93, "'": 94, "#": 95, "[UNK]": 96
+     },
+     "merges": []
+   }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "model_max_length": 2048,
+   "pad_token": "#",
+   "bos_token": "#",
+   "eos_token": "#",
+   "unk_token": "[UNK]",
+   "tokenizer_class": "SPTTokenizer"
+ }