# **GEM-1o**

---

## Model Architecture Design + Model Training

---



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd /content/drive/MyDrive/GEM_Project/

/content/drive/MyDrive/GEM_Project


In [None]:
!touch models/__init__.py models/gem_model.py
!touch utils/__init__.py utils/data_preprocessing.py utils/text_generation.py
!touch configs/config.py
!touch train.py generate.py requirements.txt

In [None]:
%%writefile requirements.txt
torch
transformers
datasets
tensorboard
tokenizers
tqdm
wandb

Overwriting requirements.txt


In [7]:
!pip install -r requirements.txt



In [None]:
%%writefile configs/config.py

import torch

MODEL_CONFIG = {
    'VOCAB_SIZE': 50000,
    'D_MODEL': 1024,
    'N_HEADS': 32,
    'D_FF': 4096,
    'N_LAYERS': 32,
    'MAX_SEQ_LEN': 512,
    'BATCH_SIZE': 32,
    'LEARNING_RATE': 1e-4,
    'NUM_EPOCHS': 20,
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    'WARMUP_STEPS': 4000,
    'ADAM_EPSILON': 1e-8,
    'WEIGHT_DECAY': 0.01,
    'GRADIENT_ACCUMULATION_STEPS': 2,
    'MAX_GRAD_NORM': 1.0,
    'DROPOUT': 0.1,
}

TRAINING_CONFIG = {
    'CHECKPOINT_SAVE_STEPS': 5000,
    'LOGGING_STEPS': 100,
    'EVAL_STEPS': 1000,
    'SAVE_TOTAL_LIMIT': 5
}

Overwriting configs/config.py


In [None]:
%%writefile configs/config.py

import torch

MODEL_CONFIG = {
    'VOCAB_SIZE': 10000,
    'D_MODEL': 768,
    'N_HEADS': 6,
    'D_FF': 1028,
    'N_LAYERS': 6,
    'MAX_SEQ_LEN': 128,
    'BATCH_SIZE': 32,
    'LEARNING_RATE': 1e-4,
    'NUM_EPOCHS': 10,
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    'WARMUP_STEPS': 4000,
    'ADAM_EPSILON': 1e-8,
    'WEIGHT_DECAY': 0.01,
    'GRADIENT_ACCUMULATION_STEPS': 1,
    'MAX_GRAD_NORM': 1.0,
    'DROPOUT': 0.1,
}

TRAINING_CONFIG = {
    'CHECKPOINT_SAVE_STEPS': 5000,
    'LOGGING_STEPS': 100,
    'EVAL_STEPS': 1000,
    'SAVE_TOTAL_LIMIT': 5
}

Overwriting configs/config.py


In [None]:
%%writefile models/gem_model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class GEM(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, dropout=0.1):
        super(GEM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids) * math.sqrt(self.d_model)
        x = self.positional_encoding(x)

        if attention_mask is not None:
            # Ensure attention_mask is in the shape (batch_size, sequence_length)
            # Convert attention_mask to (batch_size, sequence_length) format
            attention_mask = attention_mask.bool()  # Ensure it's a boolean tensor
            x = self.transformer_encoder(x, src_key_padding_mask=attention_mask)
        else:
            x = self.transformer_encoder(x)

        x = self.fc_out(x)
        return x

    def generate(self, input_ids, max_length, temperature=1.0):
        self.eval()
        with torch.no_grad():
            for _ in range(max_length - input_ids.size(1)):
                outputs = self(input_ids)
                next_token_logits = outputs[:, -1, :] / temperature
                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids


Overwriting models/gem_model.py


In [None]:
%%writefile utils/text_generation.py

import torch

def generate_text(model, tokenizer, prompt, max_length=50, device='cuda'):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)

Overwriting utils/text_generation.py


In [None]:
%%writefile generate.py

import torch
from models.gem_model import GEM
from utils.data_preprocessing import load_tokenizer
from configs.config import MODEL_CONFIG

def generate_text(model, tokenizer, prompt, max_length=100, temperature=0.7):
    device = torch.device(MODEL_CONFIG['DEVICE'])
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    generated = model.generate(input_ids, max_length=max_length, temperature=temperature)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

def main():
    device = torch.device(MODEL_CONFIG['DEVICE'])

    tokenizer = load_tokenizer()

    model = GEM(
        vocab_size=MODEL_CONFIG['VOCAB_SIZE'],
        d_model=MODEL_CONFIG['D_MODEL'],
        n_heads=MODEL_CONFIG['N_HEADS'],
        d_ff=MODEL_CONFIG['D_FF'],
        n_layers=MODEL_CONFIG['N_LAYERS'],
        max_seq_len=MODEL_CONFIG['MAX_SEQ_LEN'],
        dropout=MODEL_CONFIG['DROPOUT']
    ).to(device)

    checkpoint = torch.load('final_model/model.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    prompt = "Once upon a time"
    generated_text = generate_text(model, tokenizer, prompt, max_length=100)
    print(f"Generated text:\n{generated_text}")

if __name__ == "__main__":
    main()

Overwriting generate.py


In [None]:
%%writefile utils/data_preprocessing.py

import torch
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer

def train_tokenizer(texts, vocab_size=50000, min_frequency=2):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer = tokenizer.train_new_from_iterator(texts, vocab_size=vocab_size, min_frequency=min_frequency)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.save_pretrained("./tokenizer")
    return tokenizer

def load_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("./tokenizer")
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    return tokenizer

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encodings = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)
        return torch.tensor(encodings['input_ids'])

def get_dataloader(dataset_name, config_name, tokenizer, max_length, batch_size):
    dataset = load_dataset(dataset_name, config_name)
    texts = dataset['train']['text'][:50] #delete [:500 for actual training set w/ full voxabsize]
    dataset = TextDataset(texts, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

Overwriting utils/data_preprocessing.py


In [None]:
%%writefile train.py

import torch
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import wandb
from transformers import get_linear_schedule_with_warmup
from utils.data_preprocessing import get_dataloader, load_tokenizer
from models.gem_model import GEM
from configs.config import MODEL_CONFIG, TRAINING_CONFIG

def train():
    wandb.init(project="GEM_Project", config=MODEL_CONFIG, mode="offline")
    print("WandB initialized in offline mode.")

    tokenizer = load_tokenizer()
    print("Tokenizer loaded.")

    dataloader = get_dataloader('wikitext', 'wikitext-2-raw-v1', tokenizer, MODEL_CONFIG['MAX_SEQ_LEN'], MODEL_CONFIG['BATCH_SIZE'])
    print("Dataloader created.")

    model = GEM(
        vocab_size=len(tokenizer),
        d_model=MODEL_CONFIG['D_MODEL'],
        n_heads=MODEL_CONFIG['N_HEADS'],
        d_ff=MODEL_CONFIG['D_FF'],
        n_layers=MODEL_CONFIG['N_LAYERS'],
        dropout=MODEL_CONFIG['DROPOUT']
    ).to(MODEL_CONFIG['DEVICE'])
    print("Model initialized.")

    optimizer = optim.AdamW(model.parameters(), lr=MODEL_CONFIG['LEARNING_RATE'], eps=MODEL_CONFIG['ADAM_EPSILON'])
    total_steps = len(dataloader) * MODEL_CONFIG['NUM_EPOCHS'] // MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=MODEL_CONFIG['WARMUP_STEPS'],
        num_training_steps=total_steps
    )
    print("Optimizer and scheduler set up.")

    # Mixed precision setup
    scaler = torch.cuda.amp.GradScaler()

    model.train()
    print("Starting training loop.")
    for epoch in range(MODEL_CONFIG['NUM_EPOCHS']):
        print(f"Epoch {epoch + 1}/{MODEL_CONFIG['NUM_EPOCHS']} started.")
        for step, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}")):
            batch = batch.to(MODEL_CONFIG['DEVICE'])

            # Mixed precision training
            with torch.cuda.amp.autocast():
                outputs = model(batch)
                loss = F.cross_entropy(outputs.view(-1, outputs.size(-1)), batch.view(-1))

            # Gradient accumulation
            loss = loss / MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']
            scaler.scale(loss).backward()

            if (step + 1) % MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS'] == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MODEL_CONFIG['MAX_GRAD_NORM'])
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

                if step % TRAINING_CONFIG['LOGGING_STEPS'] == 0:
                    wandb.log({"loss": loss.item() * MODEL_CONFIG['GRADIENT_ACCUMULATION_STEPS']})

                if step % TRAINING_CONFIG['EVAL_STEPS'] == 0:
                    model.eval()
                    with torch.no_grad():
                        val_loss = sum(F.cross_entropy(model(batch).view(-1, outputs.size(-1)), batch.view(-1)).item() for batch in dataloader)
                    wandb.log({"val_loss": val_loss / len(dataloader)})
                    model.train()

                if step % TRAINING_CONFIG['CHECKPOINT_SAVE_STEPS'] == 0:
                    torch.save(model.state_dict(), f"checkpoint_{epoch}_{step}.pt")

    torch.save(model.state_dict(), "GEM_1o_Aug_15.pt")
    print("Training complete. Final model saved.")

if __name__ == "__main__":
    train()

Overwriting train.py


In [None]:
!python train.py

[34m[1mwandb[0m: Tracking run with wandb version 0.17.7
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0m in this directory.  
[34m[1mwandb[0m: Run [1m`wandb online`[0m or set [1mWANDB_MODE=online[0m to enable cloud syncing.
WandB initialized in offline mode.
Tokenizer loaded.
Dataloader created.
Model initialized.
Optimizer and scheduler set up.
Starting training loop.
Epoch 1/20 started.
Epoch 1: 100% 2/2 [02:31<00:00, 75.53s/it]
Epoch 2/20 started.
Epoch 2: 100% 2/2 [02:25<00:00, 72.90s/it]
Epoch 3/20 started.
Epoch 3: 100% 2/2 [02:25<00:00, 72.73s/it]
Epoch 4/20 started.
Epoch 4: 100% 2/2 [02:23<00:00, 71.87s/it]
Epoch 5/20 started.
Epoch 5: 100% 2/2 [02:22<00:00, 71.46s/it]
Epoch 6/20 started.
Epoch 6: 100% 2/2 [02:24<00:00, 72.17s/it]
Epoch 7/20 started.
Epoch 7: 100% 2/2 [02:26<00:00, 73.02s/it]
Epoch 8/20 started.
Epoch 8: 100% 2/2 [02:25<00:00, 72.98s/it]
Epoch 9/20 started.
Epoch 9: 100% 2/2 [02:24<00:00, 72.41s/it]
Epoch 10/20 started.
Epoch 10: 100% 2/2 

---

## Model Testing

In [None]:
%%writefile Testings/testing.py

import torch
import sys
import os

# Add the parent directory of the model folder to the system path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../models')))

from gem_model import GEM

# Configuration parameters for GEM
vocab_size = 50001  # Example vocab size, adjust if necessary
d_model = 1024      # Dimension of the model
n_heads = 32        # Number of attention heads
d_ff = 4096         # Dimension of the feedforward network
n_layers = 32       # Number of transformer layers
dropout = 0.1       # Dropout rate

# Initialize the model
model = GEM(vocab_size, d_model, n_heads, d_ff, n_layers, dropout)

# Load pre-trained weights
model_path = '/content/drive/MyDrive/GEM_Project/GEM_1o_Aug_15.pt'
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Set the model to evaluation mode
model.eval()

# Define a function to convert text to token IDs (example)
def text_to_ids(tokenizer, text):
    # Implement this function based on your tokenizer's method
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

# Define a tokenizer or token conversion function (example placeholder)
class DummyTokenizer:
    def tokenize(self, text):
        # Simple tokenization example, replace with actual tokenizer
        return text.split()

    def convert_tokens_to_ids(self, tokens):
        # Simple mapping example, replace with actual ID mapping
        return [ord(token[0]) % 50000 for token in tokens]

# Initialize tokenizer
tokenizer = DummyTokenizer()

# Test input
test_prompt = "This is a test."
test_input_ids = torch.tensor(text_to_ids(tokenizer, test_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
attention_mask = torch.ones(test_input_ids.shape, dtype=torch.bool)

# Perform a forward pass
with torch.no_grad():
    outputs = model(test_input_ids, attention_mask)
    print("Model outputs:")
    print(outputs)

# Test the generate method
generation_prompt = "Once upon a time"
input_ids = torch.tensor(text_to_ids(tokenizer, generation_prompt), dtype=torch.long).unsqueeze(0)  # Add batch dimension
generated_ids = model.generate(input_ids, max_length=10, temperature=1.0)
print("Generated IDs:")
print(generated_ids)


Overwriting Testings/testing.py


In [None]:
!python Testings/testing.py

Traceback (most recent call last):
  File "/content/drive/MyDrive/GEM_Project/Testings/testing.py", line 54, in <module>
    outputs = model(test_input_ids, attention_mask)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/content/drive/MyDrive/GEM_Project/models/gem_model.py", line 39, in forward
    x = self.transformer_encoder(x, src_key_padding_mask=attention_mask)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/transforme