""" |
|
Quantumaurora: Advanced Transformer-based Language Model |
|
Version: 1.0.0 |
|
Created: 2025 |
|
""" |
|
|
|
import numpy as np |
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from torch.utils.data import Dataset, DataLoader |
|
from transformers import PreTrainedTokenizerFast |
|
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders |
|
import math |
|
from typing import Optional, Dict, List, Tuple |
|
from torch.cuda.amp import autocast, GradScaler |
|
from torch.nn.parallel import DistributedDataParallel |
|
import torch.distributed as dist |
|
import torch.multiprocessing as mp |
|
from torch.utils.checkpoint import checkpoint |
|
import json |
|
import os |
|
from datetime import datetime |
|
|
|
class QuantumauroraConfig:
    """Configuration class for Quantumaurora model"""

    def __init__(self,
                 vocab_size: int = 50000,
                 d_model: int = 512,
                 num_heads: int = 8,
                 num_layers: int = 6,
                 d_ff: int = 2048,
                 dropout: float = 0.1,
                 attention_type: str = "full",
                 use_checkpointing: bool = True,
                 max_sequence_length: int = 2048,
                 model_version: str = "1.0.0"):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.attention_type = attention_type
        self.use_checkpointing = use_checkpointing
        self.max_sequence_length = max_sequence_length
        self.model_version = model_version
        self.model_type = "quantumaurora"

    def save(self, path: str):
        """Save configuration to a JSON file"""
        # Copy the attribute dict so the timestamp is not attached to the live config object.
        config_dict = dict(self.__dict__)
        config_dict['timestamp'] = datetime.now().isoformat()

        with open(path, 'w') as f:
            json.dump(config_dict, f, indent=2)

    @classmethod
    def load(cls, path: str) -> 'QuantumauroraConfig':
        """Load configuration from a JSON file"""
        with open(path, 'r') as f:
            config_dict = json.load(f)

        # Drop keys written by save() that are not __init__ arguments.
        config_dict.pop('timestamp', None)
        config_dict.pop('model_type', None)

        return cls(**config_dict)
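

# A minimal usage sketch of the configuration round trip above. It relies only on
# QuantumauroraConfig.save()/load(); the temporary directory and the 'config.json'
# file name are illustrative choices, not part of the original code.
def _example_config_roundtrip():
    import tempfile

    config = QuantumauroraConfig(d_model=256, num_heads=4)
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'config.json')
        config.save(path)                          # JSON with an added timestamp
        restored = QuantumauroraConfig.load(path)  # non-__init__ keys stripped on load
    assert restored.d_model == config.d_model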


class Quantumaurora(nn.Module):
    """
    Quantumaurora: Advanced Transformer-based Language Model

    A transformer language model featuring:
    - Multi-head attention with sparse/local patterns
    - Multiple pre-training objectives
    - Gradient checkpointing
    - Mixed precision training
    - Distributed training support
    """

    def __init__(self, config: QuantumauroraConfig):
        super().__init__()
        self.config = config

        # Token embedding and positional encoding.
        self.token_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.positional_encoding = PositionalEncoding(config.d_model)

        # Stack of transformer blocks.
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                config.d_model,
                config.num_heads,
                config.d_ff,
                config.dropout,
                config.attention_type
            ) for _ in range(config.num_layers)
        ])

        # Heads for the pre-training objectives.
        self.pretraining_objectives = PreTrainingObjectives(
            config.d_model,
            config.vocab_size
        )

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        x = self.token_embedding(x)
        x = self.positional_encoding(x)
        x = self.dropout(x)

        for transformer_block in self.transformer_blocks:
            if self.config.use_checkpointing and self.training:
                # Trade compute for memory: recompute block activations in the backward pass.
                x = checkpoint(transformer_block, x, mask, use_reentrant=False)
            else:
                x = transformer_block(x, mask)

        return self.pretraining_objectives(x)

    def save_pretrained(self, path: str):
        """Save model, configuration, and tokenizer (if attached)"""
        os.makedirs(path, exist_ok=True)

        # Configuration.
        config_path = os.path.join(path, 'config.json')
        self.config.save(config_path)

        # Model weights.
        model_path = os.path.join(path, 'model.pt')
        torch.save(self.state_dict(), model_path)

        # Tokenizer, if one has been attached to the model.
        if hasattr(self, 'tokenizer'):
            tokenizer_path = os.path.join(path, 'tokenizer.json')
            if isinstance(self.tokenizer, PreTrainedTokenizerFast):
                # PreTrainedTokenizerFast wraps a tokenizers.Tokenizer; save its tokenizer.json.
                self.tokenizer.backend_tokenizer.save(tokenizer_path)
            else:
                # Assume a raw tokenizers.Tokenizer, which exposes save() directly.
                self.tokenizer.save(tokenizer_path)

    @classmethod
    def from_pretrained(cls, path: str) -> 'Quantumaurora':
        """Load pretrained model and configuration"""
        config = QuantumauroraConfig.load(os.path.join(path, 'config.json'))
        model = cls(config)

        # Weights are loaded onto the CPU; move the model to a device afterwards.
        model_path = os.path.join(path, 'model.pt')
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

        # Rebuild the fast tokenizer from the saved tokenizer.json, if present.
        tokenizer_path = os.path.join(path, 'tokenizer.json')
        if os.path.exists(tokenizer_path):
            model.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)

        return model
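

# A minimal forward-pass sketch. PositionalEncoding, TransformerBlock, and
# PreTrainingObjectives are referenced above but not defined in this file, so this
# assumes they are provided elsewhere; the exact keys of the returned dict depend on
# PreTrainingObjectives. It only illustrates the calling convention: token ids in,
# dict of tensors out.
def _example_forward_pass():
    config = QuantumauroraConfig(vocab_size=1000, d_model=128, num_heads=4,
                                 num_layers=2, d_ff=256)
    model = Quantumaurora(config)
    model.eval()

    input_ids = torch.randint(0, config.vocab_size, (2, 16))  # (batch, seq_len)
    with torch.no_grad():
        outputs = model(input_ids)  # mask defaults to None
    for name, tensor in outputs.items():
        print(name, tuple(tensor.shape))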


class QuantumauroraTrainer:
    """Training manager for Quantumaurora model"""

    def __init__(self,
                 model: Quantumaurora,
                 train_dataloader: DataLoader,
                 optimizer: torch.optim.Optimizer,
                 device: str = "cuda",
                 use_mixed_precision: bool = True,
                 distributed: bool = True):
        self.model = model.to(device)
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.device = device
        self.use_mixed_precision = use_mixed_precision
        self.distributed = distributed

        if use_mixed_precision:
            self.scaler = GradScaler()

        if distributed:
            # Assumes the process group has already been initialised (e.g. by the
            # distributed launcher) and that `device` is this rank's device.
            self.model = DistributedDataParallel(self.model)

    def train(self, num_epochs: int, save_dir: Optional[str] = None):
        """Main training loop"""
        best_loss = float('inf')

        for epoch in range(num_epochs):
            # train_epoch() is expected to return a dict of average losses,
            # including a 'total' entry.
            losses = self.train_epoch(epoch)

            # Checkpoint whenever the total loss improves.
            if save_dir and losses['total'] < best_loss:
                best_loss = losses['total']
                # Unwrap DistributedDataParallel before saving, if necessary.
                model_to_save = self.model.module if self.distributed else self.model
                model_to_save.save_pretrained(os.path.join(save_dir, f'checkpoint-{epoch}'))

            print(f"Epoch {epoch + 1}/{num_epochs}")
            for loss_name, loss_value in losses.items():
                print(f"{loss_name}: {loss_value:.4f}")


def main():
    """Example usage of Quantumaurora"""

    # Model configuration.
    config = QuantumauroraConfig(
        vocab_size=50000,
        d_model=768,
        num_heads=12,
        num_layers=12,
        attention_type="sparse"
    )

    # Instantiate the model.
    model = Quantumaurora(config)

    # NOTE: `dataset`, `train_dataloader`, and `train_distributed` are assumed to be
    # provided elsewhere (a tokenised Dataset, its DataLoader, and a per-process
    # distributed training entry point); they are not defined in this file.
    world_size = torch.cuda.device_count()
    if world_size > 1:
        # Launch one training process per GPU.
        mp.spawn(
            train_distributed,
            args=(world_size, model, dataset),
            nprocs=world_size,
            join=True
        )
    else:
        # Single-GPU (or CPU) training.
        trainer = QuantumauroraTrainer(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=torch.optim.Adam(model.parameters()),
            use_mixed_precision=True,
            distributed=False
        )

        trainer.train(
            num_epochs=10,
            save_dir='quantumaurora_checkpoints'
        )


if __name__ == "__main__":
    main()