# app.py (second app by claude)

import gradio as gr
import torch

# ==================================================================================-
# inference code part-1
# -------------------------------------------------
# 1. Download the model weights (from huggingface)
# -------------------------------------------------
from huggingface_hub import hf_hub_download

hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")

# ----------------------
# 2. Load the tokenizer
# ----------------------
from transformers import PreTrainedTokenizerFast

# Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
tokenizer.save_pretrained("NepaliBPE")

# Llama 3.2 ~300M scaled-down version
LLAMA32_CONFIG = {
    "vocab_size": 50006,      # 128_256 -> reduced vocabulary size
    "context_length": 512,    # 131_072 -> reduced context length (unrelated to model size, but a higher context length consumes more RAM)
    "emb_dim": 1320,          # 2048 -> reduced embedding dimension
    "n_heads": 20,            # 32 -> reduced number of attention heads
    "n_layers": 10,           # 16 -> reduced number of layers
    "hidden_dim": 5280,       # 8192 -> size of the intermediate dimension in FeedForward
    "n_kv_groups": 5,         # 8 -> key-value groups for grouped-query attention
    "rope_base": 500_000.0,   # 500_000 -> the base in RoPE's "theta"
    "dtype": torch.bfloat16,  # lower-precision dtype to reduce memory usage
    "rope_freq": {            # RoPE frequency scaling
        "factor": 32.0,
        "low_freq_factor": 1.0,
        "high_freq_factor": 4.0,
        "original_context_length": 8192,
    }
}
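# Back-of-the-envelope parameter count implied by this config (derived from the architecture
# code below; not used anywhere by the code, just a sanity check for the "318M" name used later):
#   token embedding:  50_006 * 1320                                  ≈  66.0M
#   per block:        attention ≈ 4.36M + feed-forward ≈ 20.91M  ->  ≈ 25.27M, * 10 layers ≈ 252.7M
#   output head:      1320 * 50_006                                  ≈  66.0M
#   total ≈ 384.7M, or ≈ 318.7M once the token-embedding count is subtracted
#   (the "unique parameters" figure printed during model setup below).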
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# Could not make importing from previous_chapters.py work, so I copied all the code
# from previous_chapters.py here.

# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-4.
# This file can be run as a standalone script.

import json

# modified: replaces `import tiktoken`
from transformers import PreTrainedTokenizerFast

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# modified: added for create_dataloader_v2
from datasets import load_dataset


#####################################
# 1. Dataloader
#####################################

def create_dataloader_v3(batch_size, shuffle=True, drop_last=True, num_workers=0):
    '''
    modified:
    * parameter `text` removed
    * parameters `max_length` and `stride` removed: they were set while preparing tokenized_datasets
    * parameter `context_length` removed (the dataset is pre-tokenized)
    '''
    print('downloading dataset...')

    # Download the whole dataset
    base_url = "https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets/resolve/main/pre_tokenized/"
    # data_files = {"train": base_url + "nepberta_" + str(context_length) + ".parquet"}

    # previous version: stride = .75*512, context_len=512
    # data_files = {
    #     "train": base_url + "iriisnepal_u_nepberta_train_512.parquet",
    #     "test": base_url + "iriisnepal_u_nepberta_test_512.parquet"
    # }

    # context_len=512, stride=512
    data_files = {
        "train": base_url + "iriis_u_nepbert_512_512_train.parquet",
        "validation": base_url + "iriis_u_nepbert_512_512_test.parquet"
    }

    dataset = load_dataset("parquet", data_files=data_files, cache_dir='hf_cache', streaming=True)
    print(dataset)

    # and split it later
    # dataset = dataset.train_test_split(train_size=train_ratio, seed=42)

    # Convert Hugging Face Dataset to PyTorch tensors
    # (we can directly use the dataset as it is, since it is already in the correct format)
    # dataset.set_format(type="torch", columns=['input_ids,target_ids'])  # Directly set columns to torch tensors

    # Define the custom collate_fn function
    def collate_fn(batch):
        # Extract 'input_ids' and 'target_ids' from the batch and return them as a list of tensors
        input_ids = []
        target_ids = []
        for data_item in batch:
            splitted_data_item = data_item['input_ids,target_ids'].split("\",")
            input_ids.append(torch.tensor(json.loads(splitted_data_item[0].replace('\"', ''))))
            # print(f'input_ids: {type(input_ids)} {input_ids}')
            target_ids.append(torch.tensor(json.loads(splitted_data_item[1].replace('\"', ''))))
            # print(f'target_ids: {type(target_ids)} {target_ids}')

        # Convert to tensors (if not already)
        input_ids_tensor = torch.stack(input_ids)
        target_ids_tensor = torch.stack(target_ids)
        return [input_ids_tensor, target_ids_tensor]

    # Create the DataLoaders for the 'train' and 'validation' splits with the custom collate_fn
    train_loader = DataLoader(
        dataset['train'],
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        collate_fn=collate_fn
    )

    val_loader = DataLoader(
        dataset['validation'],
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        collate_fn=collate_fn
    )

    return train_loader, val_loader
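# Minimal usage sketch of create_dataloader_v3 (kept commented out because it streams the
# pre-tokenized parquet files from the Hugging Face Hub; the batch size of 4 is an arbitrary example):
# train_loader, val_loader = create_dataloader_v3(batch_size=4)
# for input_ids, target_ids in train_loader:
#     print(input_ids.shape, target_ids.shape)  # expected: (4, 512) each, since context_len=512
#     break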
#####################################
# 2. Architecture Code
#####################################

import torch
import torch.nn as nn


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg["emb_dim"], cfg["hidden_dim"], dtype=cfg["dtype"], bias=False)
        self.fc2 = nn.Linear(cfg["emb_dim"], cfg["hidden_dim"], dtype=cfg["dtype"], bias=False)
        self.fc3 = nn.Linear(cfg["hidden_dim"], cfg["emb_dim"], dtype=cfg["dtype"], bias=False)

    def forward(self, x):
        x_fc1 = self.fc1(x)
        x_fc2 = self.fc2(x)
        x = nn.functional.silu(x_fc1) * x_fc2
        return self.fc3(x)


def precompute_rope_params(head_dim, theta_base=10_000, context_length=4096, freq_config=None):
    assert head_dim % 2 == 0, "Embedding dimension must be even"

    # Compute the inverse frequencies
    inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))

    # Frequency adjustments
    if freq_config is not None:
        low_freq_wavelen = freq_config["original_context_length"] / freq_config["low_freq_factor"]
        high_freq_wavelen = freq_config["original_context_length"] / freq_config["high_freq_factor"]

        wavelen = 2 * torch.pi / inv_freq

        inv_freq_llama = torch.where(
            wavelen > low_freq_wavelen, inv_freq / freq_config["factor"], inv_freq
        )

        smooth_factor = (freq_config["original_context_length"] / wavelen - freq_config["low_freq_factor"]) / (
            freq_config["high_freq_factor"] - freq_config["low_freq_factor"]
        )

        smoothed_inv_freq = (
            (1 - smooth_factor) * (inv_freq / freq_config["factor"]) + smooth_factor * inv_freq
        )

        is_medium_freq = (wavelen <= low_freq_wavelen) & (wavelen >= high_freq_wavelen)
        inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
        inv_freq = inv_freq_llama

    # Generate position indices
    positions = torch.arange(context_length)

    # Compute the angles
    angles = positions[:, None] * inv_freq[None, :]  # Shape: (context_length, head_dim // 2)

    # Expand angles to match the head_dim
    angles = torch.cat([angles, angles], dim=1)  # Shape: (context_length, head_dim)

    # Precompute sine and cosine
    cos = torch.cos(angles)
    sin = torch.sin(angles)

    return cos, sin


def compute_rope(x, cos, sin):
    # x: (batch_size, num_heads, seq_len, head_dim)
    batch_size, num_heads, seq_len, head_dim = x.shape
    assert head_dim % 2 == 0, "Head dimension must be even"

    # Split x into first half and second half
    x1 = x[..., : head_dim // 2]  # First half
    x2 = x[..., head_dim // 2:]   # Second half

    # Adjust sin and cos shapes
    cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0)  # Shape: (1, 1, seq_len, head_dim)
    sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)

    # Apply the rotary transformation
    rotated = torch.cat((-x2, x1), dim=-1)
    x_rotated = (x * cos) + (rotated * sin)

    return x_rotated.to(dtype=x.dtype)
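# Quick self-contained sanity check of the RoPE helpers above (tiny made-up sizes, independent
# of the model config; it only verifies that compute_rope preserves the input shape):
_cos, _sin = precompute_rope_params(head_dim=8, context_length=16)
_x = torch.randn(1, 2, 16, 8)  # (batch, heads, seq_len, head_dim)
assert compute_rope(_x, _cos, _sin).shape == _x.shape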
class SharedBuffers:
    _buffers = {}

    @staticmethod
    def get_buffers(context_length, head_dim, rope_base, freq_config, dtype=torch.float32):
        key = (context_length, head_dim, rope_base, tuple(freq_config.values()) if freq_config else freq_config, dtype)

        if key not in SharedBuffers._buffers:
            # Create or fetch the buffers
            mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
            cos, sin = precompute_rope_params(head_dim, rope_base, context_length, freq_config)
            if dtype is not None:
                cos = cos.to(dtype)
                sin = sin.to(dtype)
            SharedBuffers._buffers[key] = (mask, cos, sin)

        return SharedBuffers._buffers[key]


class GroupedQueryAttention(nn.Module):
    def __init__(
            self, d_in, d_out, context_length, num_heads,
            num_kv_groups,
            rope_base=10_000,
            rope_config=None,
            dtype=None
    ):
        super().__init__()
        assert d_out % num_heads == 0, f"d_out:{d_out} must be divisible by num_heads:{num_heads}"
        assert num_heads % num_kv_groups == 0, "num_heads must be divisible by num_kv_groups"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_key = nn.Linear(d_in, num_kv_groups * self.head_dim, bias=False, dtype=dtype)
        self.W_value = nn.Linear(d_in, num_kv_groups * self.head_dim, bias=False, dtype=dtype)
        self.num_kv_groups = num_kv_groups
        self.group_size = num_heads // num_kv_groups

        self.W_query = nn.Linear(d_in, d_out, bias=False, dtype=dtype)
        self.out_proj = nn.Linear(d_out, d_out, bias=False, dtype=dtype)

        # Fetch buffers using SharedBuffers
        mask, cos, sin = SharedBuffers.get_buffers(context_length, self.head_dim, rope_base, rope_config, dtype)
        self.register_buffer("mask", mask)
        self.register_buffer("cos", cos)
        self.register_buffer("sin", sin)

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        queries = self.W_query(x)  # Shape: (b, num_tokens, d_out)
        keys = self.W_key(x)       # Shape: (b, num_tokens, num_kv_groups * head_dim)
        values = self.W_value(x)   # Shape: (b, num_tokens, num_kv_groups * head_dim)

        # Reshape queries, keys, and values
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.view(b, num_tokens, self.num_kv_groups, self.head_dim)
        values = values.view(b, num_tokens, self.num_kv_groups, self.head_dim)

        # Transpose keys, values, and queries
        keys = keys.transpose(1, 2)        # Shape: (b, num_kv_groups, num_tokens, head_dim)
        values = values.transpose(1, 2)    # Shape: (b, num_kv_groups, num_tokens, head_dim)
        queries = queries.transpose(1, 2)  # Shape: (b, num_heads, num_tokens, head_dim)

        # Apply RoPE
        keys = compute_rope(keys, self.cos, self.sin)
        queries = compute_rope(queries, self.cos, self.sin)

        # Expand keys and values to match the number of heads
        keys = keys.repeat_interleave(self.group_size, dim=1)      # Shape: (b, num_heads, num_tokens, head_dim)
        values = values.repeat_interleave(self.group_size, dim=1)  # Shape: (b, num_heads, num_tokens, head_dim)
        # For example, before repeat_interleave along dim=1 (query groups):
        #   [K1, K2]
        # After repeat_interleave (each query group is repeated group_size times):
        #   [K1, K1, K2, K2]
        # If we used regular repeat instead of repeat_interleave, we'd get:
        #   [K1, K2, K1, K2]

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        # Shape: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        assert keys.shape[-1] == self.head_dim

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.reshape(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec
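# Small illustration of the repeat_interleave comment in GroupedQueryAttention.forward above
# (toy tensor standing in for the KV groups [K1, K2]; unrelated to the model weights):
_groups = torch.tensor([1, 2])
print('repeat_interleave(2):', _groups.repeat_interleave(2).tolist())  # [1, 1, 2, 2]
print('repeat(2):           ', _groups.repeat(2).tolist())             # [1, 2, 1, 2]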
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = GroupedQueryAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            num_kv_groups=cfg["n_kv_groups"],
            rope_base=cfg["rope_base"],
            rope_config=cfg["rope_freq"],
            dtype=cfg["dtype"]
        )
        self.ff = FeedForward(cfg)
        self.norm1 = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
        self.norm2 = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x.to(torch.bfloat16))  # Shape [batch_size, num_tokens, emb_size]
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x.to(torch.bfloat16))
        x = x + shortcut  # Add the original input back

        return x


class Llama3Model(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"], dtype=cfg["dtype"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])

    def forward(self, in_idx):
        tok_embeds = self.tok_emb(in_idx)
        x = tok_embeds
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x.to(torch.bfloat16))
        return logits


#####################################
# 3. Load Tokenizer
#####################################

import os
from transformers import PreTrainedTokenizerFast


class Tokenizer:
    def __init__(self, tokenizer_model_path):
        assert os.path.isfile(tokenizer_model_path), f"Tokenizer model file {tokenizer_model_path} not found"

        # Load the tokenizer here
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_model_path)

        # previously added
        # self.special_tokens = {
        #     "<|begin_of_text|>": 128000,
        #     "<|end_of_text|>": 128001,
        #     "<|start_header_id|>": 128006,
        #     "<|end_header_id|>": 128007,
        #     "<|eot_id|>": 128009,
        # }

    def encode(self, text, bos=False, eos=False):
        '''
        parameter `allowed_special` removed
        parameter `disallowed_special` removed
        '''
        if bos:
            # tokens = [self.special_tokens["<|begin_of_text|>"]]
            tokens = self.tokenizer.encode('<|begin_of_text|>')  # [50000]
        else:
            tokens = []

        tokens += self.tokenizer.encode(text)

        if eos:
            # tokens.append(self.special_tokens["<|end_of_text|>"])
            tokens.append(self.tokenizer.encode('<|end_of_text|>')[0])  # [50001]
        return tokens

    def decode(self, tokens):
        # return self.model.decode(tokens)
        return self.tokenizer.decode(tokens)


class ChatFormat:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def encode_header(self, message):
        tokens = []
        tokens.append(self.tokenizer.tokenizer.encode('<|start_header_id|>')[0])  # 50002
        tokens.extend(self.tokenizer.encode(message["भूमिका"], bos=False, eos=False))  # "भूमिका" = role
        tokens.append(self.tokenizer.tokenizer.encode('<|end_header_id|>')[0])
        tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))

        # tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
        # tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
        # tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
        # tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
        return tokens

    def encode(self, text):
        message = {
            "भूमिका": "प्रयोगकर्ता",  # role: user
            "सन्दर्भ": text           # content
        }

        tokens = self.encode_header(message)
        tokens.extend(
            self.tokenizer.encode(message["सन्दर्भ"].strip(), bos=False, eos=False)
        )
        # tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
        tokens.append(self.tokenizer.tokenizer.encode('<|eot_id|>')[0])
        return tokens

    def decode(self, token_ids):
        return self.tokenizer.decode(token_ids)


_tokenizer = Tokenizer("NepaliBPE/tokenizer.json")
chat_tokenizer = ChatFormat(_tokenizer)

# text = "नेपाल विद्युत प्राधिकरणका कार्यकारी निर्देशक कुलमान घिसिङले माथिल्लो अरुण जलविद्युत आयोजना विश्व बैंक र एडीबीबाट वित्तीय व्यवस्थापन नभए नेपाली जनताको लगानीमा बनाउने तयारी रहेको बताएका छन् ।"

# # normal tokenizer
# print([tokenizer.tokenizer.decode([token]) for token in tokenizer.encode(text)])

# # formatted tokenizer
# print([tokenizer.tokenizer.decode([token]) for token in chat_tokenizer.encode(text)])
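# Layout of the prompt produced by ChatFormat.encode(text) above (the special-token ids are the
# ones noted in the comments of this file; shown here only for orientation):
#   <|start_header_id|> प्रयोगकर्ता (user) <|end_header_id|> \n\n <prompt tokens> <|eot_id|>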
#####################################
# 4. Generate Text
#####################################

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor

    # '''
    # We have modified Sebastian's return statement because there are no tokens like
    # 'start_header_id' and 'end_header_id', so the tokenizer returns None, which in turn raises an error.
    # TODO: add the special tokens 'start_header_id' and 'end_header_id' and restore the original return statement.
    # '''
    # print(encoded_tensor)
    # return torch.tensor([token for token in encoded_tensor])

# TODO: use additional vocab like encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())


def generate(model, idx, max_new_tokens, context_length, temperature=0.0, top_k=None, eos_id=None):
    # For-loop is the same as before: get logits and only focus on the last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx


def generate_and_print_sample(PROMPT, tokenizer, chat_tokenizer, model, device, context_length):
    # PROMPT = "What do llamas eat?"
    # PROMPT = "रामले भात"

    torch.manual_seed(123)

    # token_ids = generate(
    #     model=model,
    #     idx=text_to_token_ids(PROMPT, chat_tokenizer).to(device),
    #     max_new_tokens=150,
    #     context_length=context_length,
    #     temperature=0.5,
    #     top_k=1,
    #     eos_id=tokenizer.eos_token_id
    # )
    # output_text = token_ids_to_text(token_ids, tokenizer)

    # We have re-defined the generate function below.
    output_text = generate(
        model=model,
        prompt=PROMPT,
        tokenizer=tokenizer,
        max_new_tokens=150,
    )

    print("Output text:\n", clean_text(output_text))

    # -------------------------------------------------------------
    # Generate sample text
    # PROMPT = "लामा हरु ले के खान्छन् ?"  # "What do llamas eat?"
    # torch.manual_seed(123)
    # token_ids = generate(
    #     model=model,
    #     idx=text_to_token_ids(PROMPT, chat_tokenizer).to(device),
    #     max_new_tokens=150,
    #     context_size=LLAMA32_CONFIG["context_length"],
    #     top_k=1,
    #     temperature=0.
    # )
    # output_text = token_ids_to_text(token_ids, tokenizer)
    # -------------------------------------------------------------
def clean_text(text, header_end="प्रयोगकर्ता <|end_header_id|>\n\n"):
    # Find the index of the first occurrence of "<|end_header_id|>"
    index = text.find(header_end)

    if index != -1:
        # Return the substring starting after "<|end_header_id|>"
        return text[index + len(header_end):].strip()  # strip() removes leading/trailing whitespace
    else:
        # If the token is not found, return the original text
        return text

# print("Output text:\n", clean_text(output_text))


##########################################################################
# Chapter 5 (keep everything as it is, except the `generate_and_print_sample` function)
##########################################################################

def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None, len_data_loader=0):
    '''
    - parameter added: len_data_loader
    '''
    total_loss = 0.
    if len_data_loader == 0:  # len(data_loader)
        return float("nan")
    elif num_batches is None:
        num_batches = len_data_loader
    else:
        num_batches = min(num_batches, len_data_loader)
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


def evaluate_model(model, train_loader, val_loader, device, eval_iter, len_train_loader=0, len_val_loader=0):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter, len_data_loader=len_train_loader)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter, len_data_loader=len_val_loader)
    model.train()
    return train_loss, val_loss


def generate(
        model,
        prompt,
        tokenizer,
        max_new_tokens,
        temperature=0.7,
        top_k=50,
        top_p=None,  # New parameter for nucleus sampling
        eos_id=None,
        repetition_penalty=1.2,
        penalize_len_below=50,
        context_size=512
):
    # context_size = GPT_CONFIG_124M['context_length']
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    idx = text_to_token_ids(prompt, tokenizer).to(device)

    if not eos_id:
        encoded_endoftext = tokenizer.encode("<|endoftext|>")
        eos_id = encoded_endoftext[0] if encoded_endoftext else None

    token_freq = {}

    for step in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # Apply repetition penalty
        for token_id in idx[0].tolist():
            if token_id in token_freq:
                logits[0, token_id] /= repetition_penalty
            else:
                token_freq[token_id] = 1

        # Penalize the EOT token for shorter sequences
        if eos_id is not None and step < penalize_len_below:
            logits[0, eos_id] /= (penalize_len_below - step) / penalize_len_below

        # Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

        # Convert logits to probabilities
        probs = torch.softmax(logits, dim=-1)

        # Apply top-p (nucleus) sampling if specified
        if top_p:
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            # Create a mask for indices to remove
            indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
            probs = probs.masked_fill(indices_to_remove, 0.0)

            # Renormalize probabilities
            probs = probs / probs.sum(dim=-1, keepdim=True)

        # If top_p is None, apply top-k sampling
        elif top_k:
            top_probs, top_indices = torch.topk(probs, top_k)
            probs = torch.zeros_like(probs).scatter_(-1, top_indices, top_probs)

            # Renormalize probabilities
            probs = probs / probs.sum(dim=-1, keepdim=True)

        # Sample from the filtered distribution
        if temperature > 0.0:
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(probs, dim=-1, keepdim=True)

        if idx_next == eos_id:
            break

        idx = torch.cat((idx, idx_next), dim=1)

    text = token_ids_to_text(idx, tokenizer)
    return text
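# Tiny stand-alone check of the nucleus (top-p) filtering used in generate() above, with made-up
# probabilities: top_p=0.9 keeps the smallest prefix of sorted tokens whose cumulative probability
# reaches 0.9 (the first three tokens here, cumulative 0.5, 0.8, 0.95):
_probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
_sorted_probs, _sorted_idx = torch.sort(_probs, descending=True)
_cum = torch.cumsum(_sorted_probs, dim=-1)
_remove = _cum > 0.9
_remove[..., 1:] = _remove[..., :-1].clone()  # shift right to keep the first token crossing the threshold
_remove[..., 0] = 0
assert (~_remove).sum().item() == 3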
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir):
    fig, ax1 = plt.subplots()

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    plt.savefig(output_dir / "losses.pdf")


# --------------------------------------------------------------------------------
# -------------------------- New Chat function -----------------------------------
# --------------------------------------------------------------------------------

def generate_chat_optimized(
        model,
        prompt,
        tokenizer,
        chat_tokenizer,
        max_new_tokens,
        context_size,
        temperature=0.7,
        top_k=50,
        top_p=None,
        eos_id=None,
        repetition_penalty=1.2,
        penalize_len_below=50,
        device=None,
        batch_size=1,  # Added parameter
        clean_the_text=True
):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    idx = text_to_token_ids(prompt, chat_tokenizer).to(device)

    # Find the EOS token once instead of checking every time
    if not eos_id:
        if "<|endoftext|>" in tokenizer.get_vocab():
            encoded_endoftext = tokenizer.encode("<|endoftext|>")
            eos_id = encoded_endoftext[0] if encoded_endoftext else None
        elif "<|eot_id|>" in tokenizer.get_vocab():
            encoded_endoftext = tokenizer.encode("<|eot_id|>")
            eos_id = encoded_endoftext[0] if encoded_endoftext else None

    # Pre-compute token frequencies for the initial context
    token_freq = {}
    for token_id in idx[0].tolist():
        if token_id in token_freq:
            token_freq[token_id] += 1
        else:
            token_freq[token_id] = 1

    # Process tokens in batches for efficiency
    with torch.no_grad():  # Move this outside the loop
        for step in range(0, max_new_tokens, batch_size):
            batch_end = min(step + batch_size, max_new_tokens)
            current_batch_size = batch_end - step

            idx_cond = idx[:, -context_size:]
            logits = model(idx_cond)
            logits = logits[:, -1, :]

            # Apply repetition penalty once for the batch
            for token_id in idx[0].tolist()[-current_batch_size:]:
                if token_id in token_freq:
                    token_freq[token_id] += 1
                    logits[0, token_id] /= repetition_penalty
                else:
                    token_freq[token_id] = 1
            # Process each token in the batch
            for i in range(current_batch_size):
                current_step = step + i

                # Penalize the EOT token for shorter sequences
                current_logits = logits.clone()  # Work with a copy
                if eos_id is not None and current_step < penalize_len_below:
                    penalty_factor = 1.0 + (penalize_len_below - current_step) / penalize_len_below
                    current_logits[0, eos_id] /= penalty_factor

                # Apply temperature scaling
                if temperature > 0.0:
                    current_logits = current_logits / temperature

                # Convert logits to probabilities
                probs = torch.softmax(current_logits, dim=-1)

                # Apply sampling strategies
                if top_p and top_p > 0.0:
                    # Nucleus sampling implementation
                    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
                    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
                    probs = probs.masked_fill(indices_to_remove, 0.0)
                    probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
                elif top_k and top_k > 0:
                    # Top-k sampling implementation
                    top_probs, top_indices = torch.topk(probs, min(top_k, probs.size(-1)))
                    probs = torch.zeros_like(probs).scatter_(-1, top_indices, top_probs)
                    probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)

                # Sample from the filtered distribution
                if temperature > 0.0:
                    idx_next = torch.multinomial(probs, num_samples=1)
                else:
                    idx_next = torch.argmax(probs, dim=-1, keepdim=True)

                # Add the next token to the sequence
                idx = torch.cat((idx, idx_next), dim=1)

                # Check for the end-of-sequence token and stop if it was generated
                if idx_next.item() == eos_id:
                    output_text = token_ids_to_text(idx, tokenizer)
                    if clean_the_text:
                        # Clean the output
                        # cleaned_text = clean_chat_output(output_text)
                        cleaned_text = clean_text(output_text)
                        if '<|eot_id|>' in cleaned_text:
                            cleaned_text = cleaned_text.replace('<|eot_id|>', '')
                        # print("Generated text:\n", cleaned_text)
                        return cleaned_text
                    return output_text

    # No end-of-text token was generated; stop here because max_new_tokens has been reached
    output_text = token_ids_to_text(idx, tokenizer)
    if clean_the_text:
        # Clean the output
        # cleaned_text = clean_chat_output(output_text)
        cleaned_text = clean_text(output_text)
        if '<|eot_id|>' in cleaned_text:
            cleaned_text = cleaned_text.replace('<|eot_id|>', '')
        # print("Generated text:\n", cleaned_text)
        return cleaned_text
    return output_text
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-

# =============================================
# =============================================
# =============================================
# Below is the code to initialize the model
# =============================================
# =============================================
# =============================================

# import torch  # already imported
# from previous_chapters4 import (
#     Llama3Model,
#     ChatFormat,
#     Tokenizer,
#     generate_and_print_sample
# )

old_context_length = 131_072  # original context length of the llama3.2 model
new_context_length = LLAMA32_CONFIG["context_length"]  # 512, our new context length


def rescale_theta(theta_old, context_length_old, context_length_new):
    # original linear scaling
    scaling_factor = context_length_new / context_length_old
    theta_new = theta_old * scaling_factor
    return theta_new


LLAMA32_CONFIG["rope_base"] = rescale_theta(
    LLAMA32_CONFIG["rope_base"],
    old_context_length,
    new_context_length
)

print("New RoPE theta (i.e. LLAMA32_CONFIG[\"rope_base\"]):", LLAMA32_CONFIG["rope_base"])
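# Worked example of the rescaling above, assuming the values configured in this file:
#   theta_new = 500_000 * (512 / 131_072) = 1953.125
# so the print statement above should report 1953.125.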
model = Llama3Model(LLAMA32_CONFIG)

# TODO: don't compile? (claude sonnet 3.7 said compiling would speed up inference)
# compile the model
if True:
    print("compiling the model... (takes about a minute)")
    unoptimized_model = model
    model = torch.compile(model)  # requires PyTorch 2.0

model.eval()  # eval mode

# Check buffers
# --------------
print('The following is expected to print True to confirm buffers are reused instead of being (wastefully) recreated:')
print(model.trf_blocks[0].att.mask is model.trf_blocks[-1].att.mask)
print(model.trf_blocks[0].att.cos is model.trf_blocks[-1].att.cos)
print(model.trf_blocks[0].att.sin is model.trf_blocks[-1].att.sin)

# Display number of parameters
# -----------------------------
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

# Account for weight tying
total_params_normalized = total_params - model.tok_emb.weight.numel()
print(f"\nTotal number of unique parameters: {total_params_normalized:,}")

# Display model_memory_size
# -----------------------------------------------------------------------
def model_memory_size(model, input_dtype=torch.float32):
    total_params = 0
    total_grads = 0
    for param in model.parameters():
        # Calculate total number of elements per parameter
        param_size = param.numel()
        total_params += param_size
        # Check if gradients are stored for this parameter
        if param.requires_grad:
            total_grads += param_size

    # Calculate buffer size (non-parameters that require memory)
    total_buffers = sum(buf.numel() for buf in model.buffers())

    # Size in bytes = (Number of elements) * (Size of each element in bytes)
    # We assume parameters and gradients are stored in the same type as the input dtype
    element_size = torch.tensor(0, dtype=input_dtype).element_size()
    total_memory_bytes = (total_params + total_grads + total_buffers) * element_size

    # Convert bytes to gigabytes
    total_memory_gb = total_memory_bytes / (1024**3)

    return total_memory_gb


print(f"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB")
print(f"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB")
# -----------------------------------------------------------------------

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model.to(device)
print(f'device: {device}')

latest_model_checkpoint = "parameters_300m/model_pg_398000_steps.pth"
checkpoint = torch.load(latest_model_checkpoint, map_location=device, weights_only=False)

# modified (added model loading code)
model.load_state_dict(checkpoint["model_state_dict"])

# generate_and_print_sample(PROMPT="रामले भात", tokenizer=_tokenizer, chat_tokenizer=chat_tokenizer, model=model, device=device, context_length=LLAMA32_CONFIG["context_length"])

# from previous_chapters import generate_and_print_chat
# generated_text = generate_and_print_chat(
#     prompt="रामले भात",
#     tokenizer=tokenizer,
#     chat_tokenizer=chat_tokenizer,
#     model=model,
#     device=None,
#     max_new_tokens=150,
#     context_length=None,
#     temperature=0.1,
#     top_k=50,
#     top_p=0.9,
#     repetition_penalty=1.2,
#     clean_the_text=True
# )
# print(generated_text)

# =============================================
# =============================================
# =============================================
def generate_text(prompt, max_new_tokens, top_k, top_p, temperature, repetition_penalty, penalize_len_below):
    return generate_chat_optimized(
        model=model,
        prompt=prompt,
        tokenizer=tokenizer,
        chat_tokenizer=chat_tokenizer,
        max_new_tokens=max_new_tokens,
        context_size=LLAMA32_CONFIG['context_length'],
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        eos_id=None,
        repetition_penalty=repetition_penalty,
        penalize_len_below=penalize_len_below,
        device=device,
        batch_size=1
    )


css = """
#bright-textbox {
    background-color: #ffeb3b;   /* Bright yellow */
    color: #000000;              /* Black text for contrast */
    border: 2px solid #fbc02d;   /* Slightly darker yellow for the border */
    font-size: 16px;
    padding: 10px;
    border-radius: 5px;
}
"""

# Create Gradio interface
with gr.Blocks(title="LLAMA3_Nepali_318M Text Generator", css=css) as interface:
    gr.Markdown("# LLAMA3_Nepali_318M Text Generator")
    gr.Markdown("Enter Nepali (नेपाली) text to generate content using the custom LLAMA3_Nepali_318M model.")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="यहाँ नेपाली मा इन्पुट दिनु होस् ... (please enter Nepali text here...)"
                # , value="रामले भात"
            )
            max_tokens = gr.Slider(minimum=1, maximum=512, value=25, step=1, label="Max New Tokens")

            with gr.Row():
                with gr.Column():
                    temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.3, step=0.1, label="Temperature")
                    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition Penalty")
                with gr.Column():
                    top_k = gr.Slider(minimum=0, maximum=100, value=5, step=1, label="Top K (set to 0 to use Top P)")
                    top_p = gr.Slider(minimum=0, maximum=1.0, value=0.9, step=0.05, label="Top P (set above 0 to use instead of Top K)")
                    min_length = gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Minimum Length Penalty")

            generate_btn = gr.Button("Generate Text")

        with gr.Column():
            output = gr.Textbox(label="Generated Text", lines=10)

    # Add examples if you have any
    gr.Examples(
        examples=[
            # ["रामले भात", 25, 10, 0, 0.7, 1.2, 15],
            # ["नेपाल एउटा", 25, 10, 0.9, 0.5, 1.2, 10],
            ["नेपाल का वर्तमान प्रधानमन्त्री ", 25, 10, 0.4, 0.8, 1.2, 10],
            ["भारतीय प्रधानमन्त्री ", 25, 10, 0.9, 0.5, 1.2, 15],
            ["अमिरिकी रास्ट्रपति डोनाल्ड", 25, 10, 0.9, 0.6, 1.2, 15],
        ],
        inputs=[prompt, max_tokens, top_k, top_p, temperature, repetition_penalty, min_length],
        outputs=output,
        fn=generate_text,
        cache_examples=True,
    )

    generate_btn.click(
        fn=generate_text,
        # Note: the order here must match generate_text's signature (top_k before top_p)
        inputs=[prompt, max_tokens, top_k, top_p, temperature, repetition_penalty, min_length],
        outputs=output
    )

interface.launch()