alisrbdni committed (verified)
Commit 92bae51 · 1 Parent(s): 20edbc6

Update app.py

Files changed (1)
  1. app.py +223 -223
app.py CHANGED
@@ -14,233 +14,233 @@ NUM_ROUNDS = 3
- ########################TinyLLM####################################
-
- import torch
- import torch.nn as nn
- from torch.nn import functional as F
-
- # hyperparameters
- batch_size = 64 # how many independent sequences will we process in parallel?
- block_size = 256 # what is the maximum context length for predictions?
- max_iters = 5000
- eval_interval = 500
- learning_rate = 3e-4
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
- eval_iters = 200
- n_embd = 384
- n_head = 6
- n_layer = 6
- dropout = 0.2
- # ------------
-
- torch.manual_seed(1337)
-
- # wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
- with open('input.txt', 'r', encoding='utf-8') as f:
-     text = f.read()
-
- # here are all the unique characters that occur in this text
- chars = sorted(list(set(text)))
- vocab_size = len(chars)
- # create a mapping from characters to integers
- stoi = { ch:i for i,ch in enumerate(chars) }
- itos = { i:ch for i,ch in enumerate(chars) }
- encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
- decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
-
- # Train and test splits
- data = torch.tensor(encode(text), dtype=torch.long)
- n = int(0.9*len(data)) # first 90% will be train, rest val
- train_data = data[:n]
- val_data = data[n:]
-
- # data loading
- def get_batch(split):
-     # generate a small batch of data of inputs x and targets y
-     data = train_data if split == 'train' else val_data
-     ix = torch.randint(len(data) - block_size, (batch_size,))
-     x = torch.stack([data[i:i+block_size] for i in ix])
-     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
-     x, y = x.to(device), y.to(device)
-     return x, y
-
- @torch.no_grad()
- def estimate_loss():
-     out = {}
-     model.eval()
-     for split in ['train', 'val']:
-         losses = torch.zeros(eval_iters)
-         for k in range(eval_iters):
-             X, Y = get_batch(split)
-             logits, loss = model(X, Y)
-             losses[k] = loss.item()
-         out[split] = losses.mean()
-     model.train()
-     return out
-
- class Head(nn.Module):
-     """ one head of self-attention """
-
-     def __init__(self, head_size):
-         super().__init__()
-         self.key = nn.Linear(n_embd, head_size, bias=False)
-         self.query = nn.Linear(n_embd, head_size, bias=False)
-         self.value = nn.Linear(n_embd, head_size, bias=False)
-         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
-
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         # input of size (batch, time-step, channels)
-         # output of size (batch, time-step, head size)
-         B,T,C = x.shape
-         k = self.key(x) # (B,T,hs)
-         q = self.query(x) # (B,T,hs)
-         # compute attention scores ("affinities")
-         wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
-         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
-         wei = F.softmax(wei, dim=-1) # (B, T, T)
-         wei = self.dropout(wei)
-         # perform the weighted aggregation of the values
-         v = self.value(x) # (B,T,hs)
-         out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
-         return out
-
- class MultiHeadAttention(nn.Module):
-     """ multiple heads of self-attention in parallel """
-
-     def __init__(self, num_heads, head_size):
-         super().__init__()
-         self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
-         self.proj = nn.Linear(head_size * num_heads, n_embd)
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x):
-         out = torch.cat([h(x) for h in self.heads], dim=-1)
-         out = self.dropout(self.proj(out))
-         return out
-
- class FeedFoward(nn.Module):
-     """ a simple linear layer followed by a non-linearity """
-
-     def __init__(self, n_embd):
-         super().__init__()
-         self.net = nn.Sequential(
-             nn.Linear(n_embd, 4 * n_embd),
-             nn.ReLU(),
-             nn.Linear(4 * n_embd, n_embd),
-             nn.Dropout(dropout),
-         )
-
-     def forward(self, x):
-         return self.net(x)
-
- class Block(nn.Module):
-     """ Transformer block: communication followed by computation """
-
-     def __init__(self, n_embd, n_head):
-         # n_embd: embedding dimension, n_head: the number of heads we'd like
-         super().__init__()
-         head_size = n_embd // n_head
-         self.sa = MultiHeadAttention(n_head, head_size)
-         self.ffwd = FeedFoward(n_embd)
-         self.ln1 = nn.LayerNorm(n_embd)
-         self.ln2 = nn.LayerNorm(n_embd)
-
-     def forward(self, x):
-         x = x + self.sa(self.ln1(x))
-         x = x + self.ffwd(self.ln2(x))
-         return x
-
- class GPTLanguageModel(nn.Module):
-
-     def __init__(self):
-         super().__init__()
-         # each token directly reads off the logits for the next token from a lookup table
-         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
-         self.position_embedding_table = nn.Embedding(block_size, n_embd)
-         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
-         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
-         self.lm_head = nn.Linear(n_embd, vocab_size)
-
-         # better init, not covered in the original GPT video, but important, will cover in followup video
-         self.apply(self._init_weights)
-
-     def _init_weights(self, module):
-         if isinstance(module, nn.Linear):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-             if module.bias is not None:
-                 torch.nn.init.zeros_(module.bias)
-         elif isinstance(module, nn.Embedding):
-             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-     def forward(self, idx, targets=None):
-         B, T = idx.shape
-
-         # idx and targets are both (B,T) tensor of integers
-         tok_emb = self.token_embedding_table(idx) # (B,T,C)
-         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
-         x = tok_emb + pos_emb # (B,T,C)
-         x = self.blocks(x) # (B,T,C)
-         x = self.ln_f(x) # (B,T,C)
-         logits = self.lm_head(x) # (B,T,vocab_size)
-
-         if targets is None:
-             loss = None
-         else:
-             B, T, C = logits.shape
-             logits = logits.view(B*T, C)
-             targets = targets.view(B*T)
-             loss = F.cross_entropy(logits, targets)
-
-         return logits, loss
-
-     def generate(self, idx, max_new_tokens):
-         # idx is (B, T) array of indices in the current context
-         for _ in range(max_new_tokens):
-             # crop idx to the last block_size tokens
-             idx_cond = idx[:, -block_size:]
-             # get the predictions
-             logits, loss = self(idx_cond)
-             # focus only on the last time step
-             logits = logits[:, -1, :] # becomes (B, C)
-             # apply softmax to get probabilities
-             probs = F.softmax(logits, dim=-1) # (B, C)
-             # sample from the distribution
-             idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
-             # append sampled index to the running sequence
-             idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
-         return idx
-
- model = GPTLanguageModel()
- m = model.to(device)
- # print the number of parameters in the model
- print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
-
- # create a PyTorch optimizer
- optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-
- for iter in range(max_iters):

-     # every once in a while evaluate the loss on train and val sets
-     if iter % eval_interval == 0 or iter == max_iters - 1:
-         losses = estimate_loss()
-         print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

-     # sample a batch of data
-     xb, yb = get_batch('train')

-     # evaluate the loss
-     logits, loss = model(xb, yb)
-     optimizer.zero_grad(set_to_none=True)
-     loss.backward()
-     optimizer.step()

- # generate from the model
- context = torch.zeros((1, 1), dtype=torch.long, device=device)
- print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
- #open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))

@@ -252,7 +252,7 @@ print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
- ########################TinyLLM##################################

  def load_data(dataset_name):
      raw_datasets = load_dataset(dataset_name)
 
+ # ########################TinyLLM####################################
+
+ # import torch
+ # import torch.nn as nn
+ # from torch.nn import functional as F
+
+ # # hyperparameters
+ # batch_size = 64 # how many independent sequences will we process in parallel?
+ # block_size = 256 # what is the maximum context length for predictions?
+ # max_iters = 5000
+ # eval_interval = 500
+ # learning_rate = 3e-4
+ # device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ # eval_iters = 200
+ # n_embd = 384
+ # n_head = 6
+ # n_layer = 6
+ # dropout = 0.2
+ # # ------------
+
+ # torch.manual_seed(1337)
+
+ # # wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
+ # with open('input.txt', 'r', encoding='utf-8') as f:
+ #     text = f.read()
+
+ # # here are all the unique characters that occur in this text
+ # chars = sorted(list(set(text)))
+ # vocab_size = len(chars)
+ # # create a mapping from characters to integers
+ # stoi = { ch:i for i,ch in enumerate(chars) }
+ # itos = { i:ch for i,ch in enumerate(chars) }
+ # encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
+ # decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
+
+ # # Train and test splits
+ # data = torch.tensor(encode(text), dtype=torch.long)
+ # n = int(0.9*len(data)) # first 90% will be train, rest val
+ # train_data = data[:n]
+ # val_data = data[n:]
+
+ # # data loading
+ # def get_batch(split):
+ #     # generate a small batch of data of inputs x and targets y
+ #     data = train_data if split == 'train' else val_data
+ #     ix = torch.randint(len(data) - block_size, (batch_size,))
+ #     x = torch.stack([data[i:i+block_size] for i in ix])
+ #     y = torch.stack([data[i+1:i+block_size+1] for i in ix])
+ #     x, y = x.to(device), y.to(device)
+ #     return x, y
+
+ # @torch.no_grad()
+ # def estimate_loss():
+ #     out = {}
+ #     model.eval()
+ #     for split in ['train', 'val']:
+ #         losses = torch.zeros(eval_iters)
+ #         for k in range(eval_iters):
+ #             X, Y = get_batch(split)
+ #             logits, loss = model(X, Y)
+ #             losses[k] = loss.item()
+ #         out[split] = losses.mean()
+ #     model.train()
+ #     return out
+
+ # class Head(nn.Module):
+ #     """ one head of self-attention """
+
+ #     def __init__(self, head_size):
+ #         super().__init__()
+ #         self.key = nn.Linear(n_embd, head_size, bias=False)
+ #         self.query = nn.Linear(n_embd, head_size, bias=False)
+ #         self.value = nn.Linear(n_embd, head_size, bias=False)
+ #         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+
+ #         self.dropout = nn.Dropout(dropout)
+
+ #     def forward(self, x):
+ #         # input of size (batch, time-step, channels)
+ #         # output of size (batch, time-step, head size)
+ #         B,T,C = x.shape
+ #         k = self.key(x) # (B,T,hs)
+ #         q = self.query(x) # (B,T,hs)
+ #         # compute attention scores ("affinities")
+ #         wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
+ #         wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
+ #         wei = F.softmax(wei, dim=-1) # (B, T, T)
+ #         wei = self.dropout(wei)
+ #         # perform the weighted aggregation of the values
+ #         v = self.value(x) # (B,T,hs)
+ #         out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
+ #         return out
+
+ # class MultiHeadAttention(nn.Module):
+ #     """ multiple heads of self-attention in parallel """
+
+ #     def __init__(self, num_heads, head_size):
+ #         super().__init__()
+ #         self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
+ #         self.proj = nn.Linear(head_size * num_heads, n_embd)
+ #         self.dropout = nn.Dropout(dropout)
+
+ #     def forward(self, x):
+ #         out = torch.cat([h(x) for h in self.heads], dim=-1)
+ #         out = self.dropout(self.proj(out))
+ #         return out
+
+ # class FeedFoward(nn.Module):
+ #     """ a simple linear layer followed by a non-linearity """
+
+ #     def __init__(self, n_embd):
+ #         super().__init__()
+ #         self.net = nn.Sequential(
+ #             nn.Linear(n_embd, 4 * n_embd),
+ #             nn.ReLU(),
+ #             nn.Linear(4 * n_embd, n_embd),
+ #             nn.Dropout(dropout),
+ #         )
+
+ #     def forward(self, x):
+ #         return self.net(x)
+
+ # class Block(nn.Module):
+ #     """ Transformer block: communication followed by computation """
+
+ #     def __init__(self, n_embd, n_head):
+ #         # n_embd: embedding dimension, n_head: the number of heads we'd like
+ #         super().__init__()
+ #         head_size = n_embd // n_head
+ #         self.sa = MultiHeadAttention(n_head, head_size)
+ #         self.ffwd = FeedFoward(n_embd)
+ #         self.ln1 = nn.LayerNorm(n_embd)
+ #         self.ln2 = nn.LayerNorm(n_embd)
+
+ #     def forward(self, x):
+ #         x = x + self.sa(self.ln1(x))
+ #         x = x + self.ffwd(self.ln2(x))
+ #         return x
+
+ # class GPTLanguageModel(nn.Module):
+
+ #     def __init__(self):
+ #         super().__init__()
+ #         # each token directly reads off the logits for the next token from a lookup table
+ #         self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
+ #         self.position_embedding_table = nn.Embedding(block_size, n_embd)
+ #         self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
+ #         self.ln_f = nn.LayerNorm(n_embd) # final layer norm
+ #         self.lm_head = nn.Linear(n_embd, vocab_size)
+
+ #         # better init, not covered in the original GPT video, but important, will cover in followup video
+ #         self.apply(self._init_weights)
+
+ #     def _init_weights(self, module):
+ #         if isinstance(module, nn.Linear):
+ #             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+ #             if module.bias is not None:
+ #                 torch.nn.init.zeros_(module.bias)
+ #         elif isinstance(module, nn.Embedding):
+ #             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+ #     def forward(self, idx, targets=None):
+ #         B, T = idx.shape
+
+ #         # idx and targets are both (B,T) tensor of integers
+ #         tok_emb = self.token_embedding_table(idx) # (B,T,C)
+ #         pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
+ #         x = tok_emb + pos_emb # (B,T,C)
+ #         x = self.blocks(x) # (B,T,C)
+ #         x = self.ln_f(x) # (B,T,C)
+ #         logits = self.lm_head(x) # (B,T,vocab_size)
+
+ #         if targets is None:
+ #             loss = None
+ #         else:
+ #             B, T, C = logits.shape
+ #             logits = logits.view(B*T, C)
+ #             targets = targets.view(B*T)
+ #             loss = F.cross_entropy(logits, targets)
+
+ #         return logits, loss
+
+ #     def generate(self, idx, max_new_tokens):
+ #         # idx is (B, T) array of indices in the current context
+ #         for _ in range(max_new_tokens):
+ #             # crop idx to the last block_size tokens
+ #             idx_cond = idx[:, -block_size:]
+ #             # get the predictions
+ #             logits, loss = self(idx_cond)
+ #             # focus only on the last time step
+ #             logits = logits[:, -1, :] # becomes (B, C)
+ #             # apply softmax to get probabilities
+ #             probs = F.softmax(logits, dim=-1) # (B, C)
+ #             # sample from the distribution
+ #             idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
+ #             # append sampled index to the running sequence
+ #             idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
+ #         return idx
+
+ # model = GPTLanguageModel()
+ # m = model.to(device)
+ # # print the number of parameters in the model
+ # print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
+
+ # # create a PyTorch optimizer
+ # optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+
+ # for iter in range(max_iters):

+ #     # every once in a while evaluate the loss on train and val sets
+ #     if iter % eval_interval == 0 or iter == max_iters - 1:
+ #         losses = estimate_loss()
+ #         print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

+ #     # sample a batch of data
+ #     xb, yb = get_batch('train')

+ #     # evaluate the loss
+ #     logits, loss = model(xb, yb)
+ #     optimizer.zero_grad(set_to_none=True)
+ #     loss.backward()
+ #     optimizer.step()

+ # # generate from the model
+ # context = torch.zeros((1, 1), dtype=torch.long, device=device)
+ # print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
+ # #open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))

+ # ########################TinyLLM##################################

  def load_data(dataset_name):
      raw_datasets = load_dataset(dataset_name)
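
For reference, the only code this commit leaves active at the end of the hunk is `load_data`, which wraps the Hugging Face `datasets` API. Below is a minimal usage sketch, not taken from the commit: it assumes `load_data` simply returns the object produced by `load_dataset`, and the dataset name is an illustrative placeholder.

# Minimal sketch (assumption: app.py's load_data returns the loaded DatasetDict).
from datasets import load_dataset

def load_data(dataset_name):
    raw_datasets = load_dataset(dataset_name)
    return raw_datasets  # assumed return value; the diff only shows the first two lines

if __name__ == "__main__":
    raw = load_data("ag_news")  # "ag_news" is an example dataset name, not from this commit
    print(raw)  # typically a DatasetDict with train/test splits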