import gradio as gr
import torch
import tiktoken
import math

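# LayerNorm with an optional bias term, matching the GPT-2 parameterization.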
class LayerNorm(torch.nn.Module):
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(ndim))
        self.bias = torch.nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return torch.nn.functional.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

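# Multi-head causal self-attention: one linear layer produces Q, K, and V,
# and a lower-triangular mask keeps each position from attending to the future.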
class CausalSelfAttention(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config["emb_dim"] % config["n_heads"] == 0
        self.c_attn = torch.nn.Linear(config["emb_dim"], 3 * config["emb_dim"], bias=config["qkv_bias"])
        self.c_proj = torch.nn.Linear(config["emb_dim"], config["emb_dim"], bias=True)
        self.attn_dropout = torch.nn.Dropout(config["drop_rate"])
        self.resid_dropout = torch.nn.Dropout(config["drop_rate"])
        self.n_heads = config["n_heads"]
        self.n_embd = config["emb_dim"]
        self.dropout = config["drop_rate"]
        self.register_buffer("bias", torch.tril(torch.ones(config["context_length"], config["context_length"]))
                                     .view(1, 1, config["context_length"], config["context_length"]))

    def forward(self, x):
        B, T, C = x.size()
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
        q = q.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
        v = v.view(B, T, self.n_heads, C // self.n_heads).transpose(1, 2)
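        # Scaled dot-product attention with the causal mask applied before softmax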
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = torch.nn.functional.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.resid_dropout(self.c_proj(y))
        return y

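# Position-wise feed-forward network: expand to 4x the embedding size, apply GELU, project back.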
class MLP(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc    = torch.nn.Linear(config["emb_dim"], 4 * config["emb_dim"], bias=True)
        self.gelu    = torch.nn.GELU()
        self.c_proj  = torch.nn.Linear(4 * config["emb_dim"], config["emb_dim"], bias=True)
        self.dropout = torch.nn.Dropout(config["drop_rate"])

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

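# Transformer block with pre-layer-norm residual connections (attention, then MLP).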
class Block(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config["emb_dim"], bias=True)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config["emb_dim"], bias=True)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

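# Full GPT model: token + positional embeddings, a stack of transformer blocks,
# a final layer norm, and a linear head mapping back to vocabulary logits.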
class GPTModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = torch.nn.ModuleDict(dict(
            wte = torch.nn.Embedding(config["vocab_size"], config["emb_dim"]),
            wpe = torch.nn.Embedding(config["context_length"], config["emb_dim"]),
            drop = torch.nn.Dropout(config["drop_rate"]),
            h = torch.nn.ModuleList([Block(config) for _ in range(config["n_layers"])]),
            ln_f = LayerNorm(config["emb_dim"], bias=True)
        ))
        self.lm_head = torch.nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)
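        # Weight tying: the token embedding and the output projection share parameters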
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, torch.nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, torch.nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        
        loss = None
        if targets is not None:
            loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        
        return logits, loss

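# Autoregressive generation: repeatedly crop the context to the model's window,
# sample the next token from the softmax distribution, and append it to the sequence.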
@torch.no_grad()
def generate_text_simple(model, idx, max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :]
        probs = torch.nn.functional.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

# Model configuration (GPT-2 124M hyperparameters)
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

# Initialize model
model = GPTModel(GPT_CONFIG_124M)

# Load the trained weights
model.load_state_dict(torch.load("my_gpt_model.pth", map_location=torch.device('cpu')))
model.eval()

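# GPT-2 byte-pair-encoding tokenizer from tiktoken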
tokenizer = tiktoken.get_encoding("gpt2")

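# Gradio callback: encode the prompt, run the sampling loop, and decode the result.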
def generate(prompt, max_new_tokens):
    token_ids = tokenizer.encode(prompt)
    input_ids = torch.tensor(token_ids).unsqueeze(0)
    output_ids = generate_text_simple(
        model=model,
        idx=input_ids,
        max_new_tokens=int(max_new_tokens),  # cast defensively in case the slider delivers a float
        context_size=GPT_CONFIG_124M["context_length"]
    )
    return tokenizer.decode(output_ids.squeeze(0).tolist())

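# Simple Gradio UI: a prompt textbox plus a slider for the number of new tokens.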
iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Max New Tokens")
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="SamGPT Text Generation",
    description="Enter a prompt to generate text with the custom language model."
)

iface.launch()