jefsnacker committed
Commit 3f362c0
1 Parent(s): 3027a6c

add gpt nano model

Files changed (1)
app.py +207 -3
app.py CHANGED
@@ -1,3 +1,6 @@
+import math
+import yaml
+
 import gradio as gr
 
 import huggingface_hub
@@ -6,8 +9,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-import yaml
-
 
 mlp_config_path = huggingface_hub.hf_hub_download(
     "jefsnacker/surname_generator",
@@ -25,12 +26,27 @@ wavenet_weights_path = huggingface_hub.hf_hub_download(
     "jefsnacker/surname_generator",
     "wavenet_weights.pt")
 
+gpt_nano_config_path = huggingface_hub.hf_hub_download(
+    "jefsnacker/surname_generator",
+    "gpt_config.yaml")
+
+gpt_nano_weights_path = huggingface_hub.hf_hub_download(
+    "jefsnacker/surname_generator",
+    "gpt_weights.pt")
+
 with open(mlp_config_path, 'r') as file:
     mlp_config = yaml.safe_load(file)
 
 with open(wavenet_config_path, 'r') as file:
     wavenet_config = yaml.safe_load(file)
+
+with open(gpt_nano_config_path, 'r') as file:
+    gpt_nano_config = yaml.safe_load(file)
 
+##################################################################################
+## MLP
+##################################################################################
+
 class MLP(nn.Module):
     def __init__(self, num_char, hidden_nodes, embeddings, window, num_layers):
         super(MLP, self).__init__()
@@ -75,6 +91,10 @@ mlp = MLP(mlp_config['num_char'],
 mlp.load_state_dict(torch.load(mlp_weights_path))
 mlp.eval()
 
+##################################################################################
+## WaveNet
+##################################################################################
+
 class WaveNet(nn.Module):
     def __init__(self, num_char, hidden_nodes, embeddings, window, num_layers):
         super(WaveNet, self).__init__()
@@ -119,6 +139,185 @@ wavenet = WaveNet(wavenet_config['num_char'],
 wavenet.load_state_dict(torch.load(wavenet_weights_path))
 wavenet.eval()
 
+##################################################################################
+## Transformer
+##################################################################################
+
+class NewGELU(nn.Module):
+    """
+    Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
+    """
+    def forward(self, x):
+        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
+
+class GptAttention(nn.Module):
+    """
+    Self-attention: q, k, and v are all projections of the same input.
+    The causal mask below makes it suitable for decoder-only transformers.
+    """
+    def __init__(self, config):
+        super(GptAttention, self).__init__()
+        self.config = config
+
+        assert self.config["d_model"] % self.config["heads"] == 0
+        self.heads = self.config["heads"]
+
+        self.w_attn = nn.Linear(self.config["d_model"], 3*self.config["d_model"])
+        self.head = nn.Linear(self.config["d_model"], self.config["d_model"])
+
+        self.attn_dropout = nn.Dropout(config["attn_pdrop"])
+        self.resid_dropout = nn.Dropout(config["resid_pdrop"])
+
+        # causal mask to ensure that attention is only applied to the left in the input sequence
+        self.register_buffer(
+            "bias",
+            torch.tril(
+                torch.ones(
+                    self.config["window"],
+                    self.config["window"])
+            ).view(1, 1, self.config["window"], self.config["window"])
+        )
+
+    def forward(self, x):
+        B, window, embs = x.shape
+
+        q, v, k = self.w_attn(x).split(self.config["d_model"], dim=2)  # one projection, split into q, v, k
+
+        # reshape each to (B, heads, window, embs // heads)
+        q = q.view(
+            B,
+            window,
+            self.config["heads"],
+            embs // self.config["heads"]
+        ).transpose(1, 2)
+        k = k.view(
+            B,
+            window,
+            self.config["heads"],
+            embs // self.config["heads"]
+        ).transpose(1, 2)
+        v = v.view(
+            B,
+            window,
+            self.config["heads"],
+            embs // self.config["heads"]
+        ).transpose(1, 2)
+
+        # Self-attend: (B, heads, window, embs) x (B, heads, embs, window) -> (B, heads, window, window)
+        scores = q @ k.transpose(-2, -1) / math.sqrt(k.size(-1))
+        masked = scores.masked_fill(self.bias[:,:,:window,:window] == 0, float('-inf'))
+        probs = F.softmax(masked, dim=-1)
+        probs = self.attn_dropout(probs)  # dropout on the attention weights
+        attn = probs @ v
+        attn = attn.transpose(1, 2).contiguous().view(B, window, embs)
+
+        return self.resid_dropout(self.head(attn))
+
+class FeedForward(nn.Module):
+    def __init__(self, config):
+        super(FeedForward, self).__init__()
+        self.l1 = nn.Linear(config["d_model"], 4*config["d_model"])
+        self.l2 = nn.Linear(4*config["d_model"], config["d_model"])
+        self.dropout = nn.Dropout(config["resid_pdrop"])
+
+    def forward(self, x):
+        x = NewGELU()(self.l1(x))
+        return self.dropout(self.l2(x))
+
+class Block(nn.Module):
+    def __init__(self, config):
+        super(Block, self).__init__()
+        self.attn = GptAttention(config)
+        self.norm1 = nn.LayerNorm(config["d_model"])
+        self.ff = FeedForward(config)
+        self.norm2 = nn.LayerNorm(config["d_model"])
+
+    def forward(self, x):
+        x = self.norm1(x + self.attn(x))
+        x = self.norm2(x + self.ff(x))
+        return x
+
+class GPT(nn.Module):
+    def __init__(self, config):
+        super(GPT, self).__init__()
+        self.config = config
+
+        self.vocab_emb = nn.Embedding(self.config["vocab"], self.config["d_model"])
+        self.pos_emb = nn.Embedding(self.config["window"], self.config["d_model"])
+        self.emb_dropout = nn.Dropout(config["embd_pdrop"])
+
+        self.blocks = nn.ModuleList([Block(self.config) for _ in range(self.config["blocks"])])
+        self.head_layer_norm = nn.LayerNorm(config["d_model"])
+        self.head = nn.Linear(self.config["d_model"], self.config["vocab"])
+
+    def forward(self, x):
+        vocab_emb = self.vocab_emb(x)
+        pos_emb = self.pos_emb(torch.arange(0, x.shape[1], dtype=torch.long, device=x.device))
+
+        x = self.emb_dropout(vocab_emb + pos_emb)
+
+        for b in self.blocks:
+            x = b(x)
+
+        x = self.head_layer_norm(x)
+        x = self.head(x)
+
+        return x
+
+    def configure_opt(self):
+        p_decay = set()
+        p_no_decay = set()
+        whitelist_weight_modules = (torch.nn.Linear, )
+        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
+        for mn, m in self.named_modules():
+            for pn, p in m.named_parameters():
+                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
+                # because named_modules and named_parameters are recursive,
+                # the same tensor p is visited many times; iterating this way
+                # lets us record which parent module each tensor belongs to
+                if pn.endswith('bias'):
+                    # all biases will not be decayed
+                    p_no_decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                    # weights of whitelist modules will be weight decayed
+                    p_decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                    # weights of blacklist modules will NOT be weight decayed
+                    p_no_decay.add(fpn)
+
+        # validate that we considered every parameter
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        inter_params = p_decay & p_no_decay
+        union_params = p_decay | p_no_decay
+        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
+            % (str(param_dict.keys() - union_params), )
+
+        # create the pytorch optimizer object
+        optim_groups = [
+            {"params": [param_dict[pn] for pn in sorted(list(p_decay))], "weight_decay": self.config["weight_decay"]},
+            {"params": [param_dict[pn] for pn in sorted(list(p_no_decay))], "weight_decay": 0.0},
+        ]
+        optimizer = torch.optim.AdamW(
+            optim_groups,
+            lr=self.config["lr"],
+            betas=(self.config["b1"], self.config["b2"])
+        )
+        return optimizer
+
+    def sample_char(self, x):
+        logits = self(x)
+        probs = F.softmax(logits[:,-1,:], dim=1)
+        return torch.multinomial(probs, num_samples=1).item()
+
+gpt_nano = GPT(gpt_nano_config)
+gpt_nano.load_state_dict(torch.load(gpt_nano_weights_path))
+gpt_nano.eval()
+
+##################################################################################
+## Gradio App
+##################################################################################
+
 def generate_names(name_start, number_of_names, model):
     if model == "MLP":
         stoi = mlp_config['stoi']
@@ -126,6 +325,9 @@ def generate_names(name_start, number_of_names, model):
     elif model == "WaveNet":
         stoi = wavenet_config['stoi']
         window = wavenet_config['window']
+    elif model == "GPT Nano":
+        stoi = gpt_nano_config['stoi']
+        window = gpt_nano_config['window']
     else:
         raise Exception("Model not selected")
 
@@ -148,6 +350,8 @@ def generate_names(name_start, number_of_names, model):
             ix = mlp.sample_char(x)
         elif model == "WaveNet":
            ix = wavenet.sample_char(x)
+        elif model == "GPT Nano":
+            ix = gpt_nano.sample_char(x)
         else:
             raise Exception("Model not selected")
 
@@ -166,7 +370,7 @@ demo = gr.Interface(
     inputs=[
         gr.Textbox(placeholder="Start name with..."),
         gr.Number(value=5),
-        gr.Dropdown(["MLP", "WaveNet"], value="WaveNet"),
+        gr.Dropdown(["MLP", "WaveNet", "GPT Nano"], value="GPT Nano"),
     ],
     outputs="text",
 )
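
For reference, the new model can be exercised outside the Gradio UI along the following lines. This is a minimal sketch, not part of the commit: sample_gpt_name is a hypothetical helper, and it assumes that gpt_nano_config['stoi'] maps characters to integer token ids and that id 0 marks the end of a name, mirroring how generate_names drives sample_char for the other two models.

import torch

def sample_gpt_name(start="a", max_len=30):
    stoi = gpt_nano_config['stoi']
    itos = {i: c for c, i in stoi.items()}  # invert the character-to-id map
    window = gpt_nano_config['window']

    ids = [stoi[c] for c in start]
    name = start
    with torch.no_grad():
        for _ in range(max_len):
            # feed at most the last `window` token ids, shape (1, T)
            x = torch.tensor([ids[-window:]], dtype=torch.long)
            ix = gpt_nano.sample_char(x)
            if ix == 0:  # assumed end-of-name token id
                break
            ids.append(ix)
            name += itos[ix]
    return name

print(sample_gpt_name("jo"))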