Shilpaj committed
Commit 0c94512 · 1 Parent(s): 5bdbe00

Feat: Huggingface app

Files changed (2):
  1. README.md +12 -1
  2. app.py +161 -0
README.md CHANGED

```diff
@@ -1,4 +1,15 @@
-# Pre-Training
+title: ShakespeareGPT
+emoji: 🐠
+colorFrom: gray
+colorTo: red
+sdk: gradio
+sdk_version: 5.12.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: 'GPT model pre-training step on Shakespeare dataset '
+
+# ShakespeareGPT
 
 This section focuses on Embeddings and Pre-training.
 
```
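The frontmatter above is what tells Hugging Face Spaces to serve `app.py` with the Gradio SDK (5.12.0). For a quick local smoke test of that same entry point, a minimal sketch, assuming `torch`, `gradio`, and `tiktoken` are installed and the checkpoint `nano_gpt_model.pt` sits next to the script:

```python
# Sketch: importing app executes load_model() at module scope, so this
# loads the checkpoint and builds the interface without starting a server.
import app

app.iface.launch()  # serves the same UI the Space runs, on localhost
```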
app.py ADDED (new file, 161 lines)

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import gradio as gr
import tiktoken
from dataclasses import dataclass


# Configuration class (same as in training)
@dataclass
class GPTConfig:
    block_size: int = 512    # maximum context length
    vocab_size: int = 50304  # GPT-2 vocab, padded up to a multiple of 64
    n_layer: int = 8
    n_head: int = 8
    n_embd: int = 384


# Model architecture classes (copied from the training notebook)
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # Combined projection for query, key, and value
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Causal mask; unused by scaled_dot_product_attention below, but kept
        # so the buffer layout matches the training checkpoint.
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()  # batch, sequence length, embedding dim
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        # Pre-norm transformer block with residual connections
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gradient_checkpointing = True  # carried over from training; unused at inference

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: token embedding and output head share one matrix
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                # Residual projections are scaled down by 1/sqrt(2 * n_layer)
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"

        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)  # position embeddings (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # token embeddings (B, T, n_embd)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        loss = None if targets is None else F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


# Initialize model and load weights
def load_model():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPT(GPTConfig())
    model.load_state_dict(torch.load('nano_gpt_model.pt', map_location=device))
    model.to(device)
    model.eval()
    return model, device


# Text generation function
def generate_text(prompt, num_tokens, model, device, temperature=0.8):
    enc = tiktoken.get_encoding('gpt2')
    x = torch.tensor([enc.encode(prompt)], dtype=torch.long, device=device)
    max_length = x.size(1) + num_tokens  # generate num_tokens beyond the prompt

    with torch.no_grad():
        while x.size(1) < max_length:
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits = model(x)[0]
            logits = logits[:, -1, :] / temperature  # temperature-scale the last step
            probs = F.softmax(logits.float(), dim=-1)  # sample in fp32
            next_token = torch.multinomial(probs, num_samples=1)
            x = torch.cat([x, next_token], dim=1)

    decoded = enc.decode(x[0].tolist())
    return decoded


# Load the model globally
model, device = load_model()


# Gradio wrapper
def gradio_interface(prompt, num_tokens, temperature):
    return generate_text(prompt, num_tokens, model, device, temperature)


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Enter your prompt", value="Once upon a time"),
        gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Number of tokens to generate"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature (higher = more random)"),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="NanoGPT Text Generator",
    description="Generate Shakespeare-style text using a trained NanoGPT model",
)

if __name__ == "__main__":
    iface.launch()
```
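Since `generate_text` samples with `torch.multinomial`, outputs differ between runs. A minimal sketch for a reproducible spot-check outside the UI, under the same working-directory assumptions as above:

```python
# Sketch: drive the generation path directly with a fixed RNG seed so
# the sampled continuation is reproducible from run to run.
import torch
from app import generate_text, model, device

torch.manual_seed(1337)  # hypothetical seed choice; any fixed value works
print(generate_text("Once upon a time", num_tokens=50,
                    model=model, device=device, temperature=0.8))
```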