# app.py (second app, written with help from Claude)
import gradio as gr
import torch
# ==================================================================================-
# inference code part-1
# -------------------------------------------------
# 1. Download the model weights (from huggingface)
# -------------------------------------------------
from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="Aananda-giri/LLAMA3-Nepali", filename="parameters_300m/model_pg_398000_steps.pth", local_dir="./")
# ----------------------
# 2. Load The tokenizer
# ----------------------
from transformers import PreTrainedTokenizerFast
# Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/LLAMA3-Nepali")
tokenizer.save_pretrained("NepaliBPE")
# Llama 3.2 ~300M Scaled Version
LLAMA32_CONFIG = {
"vocab_size": 50006, # <len(tokenizer.tokenizer)=50006> 128_256 reduced vocabulary size
"context_length": 512, # 131_072 reduced Context length (unrelated to model size but higheer context length consumes more RAM)
"emb_dim": 1320, # 2048 reduced Embedding dimension
"n_heads": 20, # 32 reduced Number of attention heads
"n_layers": 10, # 16 reduced Number of layers
"hidden_dim": 5280, # 8192 Size of the intermediate dimension in FeedForward
"n_kv_groups": 5, # 8 Key-Value groups for grouped-query attention
"rope_base": 500_000.0, # 500_000 The base in RoPE's "theta"
"dtype": torch.bfloat16, # Lower-precision dtype to reduce memory usage
"rope_freq": { # RoPE frequency scaling
"factor": 32.0,
"low_freq_factor": 1.0,
"high_freq_factor": 4.0,
"original_context_length": 8192,
}
}
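# A small sanity check added for illustration: the derived per-head dimension and the grouped-query
# group size follow from the config above; these asserts mirror the checks inside GroupedQueryAttention below.
assert LLAMA32_CONFIG["emb_dim"] % LLAMA32_CONFIG["n_heads"] == 0          # head_dim = 1320 // 20 = 66
assert (LLAMA32_CONFIG["emb_dim"] // LLAMA32_CONFIG["n_heads"]) % 2 == 0   # RoPE requires an even head_dim
assert LLAMA32_CONFIG["n_heads"] % LLAMA32_CONFIG["n_kv_groups"] == 0      # group_size = 20 // 5 = 4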
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# Could not get importing from previous_chapters.py to work, so all the code
# from previous_chapters.py is copied here.
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-4.
# This file can be run as a standalone script.
import json
# modified from `import tiktoken`
from transformers import PreTrainedTokenizerFast
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
# modified: added for create_dataloader_v3
from datasets import load_dataset
#####################################
# 1. Dataloader
#####################################
def create_dataloader_v3(batch_size, shuffle=True, drop_last=True, num_workers=0):
    '''
    Modified:
    * parameter `text` removed
    * parameters `max_length` and `stride` removed: they were set while preparing the tokenized datasets
    * parameter `context_length` removed (the dataset is pre-tokenized)
    '''
print('downloading dataset...')
# Download the whole dataset
base_url = "https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets/resolve/main/pre_tokenized/"
# data_files = {"train": base_url + "nepberta_" + str(context_length) + ".parquet"}
# previous version: stride = .75*512, context_len.512
# data_files = {
# "train": base_url + "iriisnepal_u_nepberta_train_512.parquet",
# "test": base_url + "iriisnepal_u_nepberta_test_512.parquet"
# }
# context_len.512, stride=512
data_files={"train": base_url + "iriis_u_nepbert_512_512_train.parquet", "validation": base_url + "iriis_u_nepbert_512_512_test.parquet"}
dataset = load_dataset("parquet", data_files=data_files, cache_dir='hf_cache', streaming=True)
print(dataset)
# and split it later
# dataset = dataset.train_test_split(train_size=train_ratio, seed=42)
# Convert Hugging Face Dataset to PyTorch tensors (we can directly use the dataset as it is already in the correct format)
# dataset.set_format(type="torch", columns=['input_ids,target_ids']) # Directly set columns to torch tensors
# Define the custom collate_fn function
def collate_fn(batch):
# Extract the 'input_ids' and 'target_ids' from the batch and return them as a list of tensors
input_ids = []
target_ids = []
for data_item in batch:
splitted_data_item = data_item['input_ids,target_ids'].split("\",")
input_ids.append(torch.tensor(json.loads(splitted_data_item[0].replace('\"',''))))
# print(f'input_ids: {type(input_ids)} {input_ids}')
target_ids.append(torch.tensor(json.loads(splitted_data_item[1].replace('\"',''))))
# print(f'target_ids: {type(target_ids)} {target_ids}')
# Convert to tensors (if not already)
input_ids_tensor = torch.stack(input_ids)
target_ids_tensor = torch.stack(target_ids)
return [input_ids_tensor, target_ids_tensor]
# Creating the DataLoader for the 'train' split of the dataset with the custom collate_fn
train_loader = DataLoader(
dataset['train'],
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last,
num_workers=num_workers,
collate_fn=collate_fn
)
val_loader = DataLoader(
dataset['validation'],
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last,
num_workers=num_workers,
collate_fn=collate_fn
)
return train_loader, val_loader
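# Usage sketch for create_dataloader_v3 (kept commented out so this inference-only app does not
# download the training parquet files; with a streaming/iterable dataset, shuffling should be done
# by the dataset itself rather than by the DataLoader):
# train_loader, val_loader = create_dataloader_v3(batch_size=4, shuffle=False, drop_last=True, num_workers=0)
# input_ids, target_ids = next(iter(train_loader))
# print(input_ids.shape, target_ids.shape)  # expected: torch.Size([4, 512]) each, matching context_length=512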
#####################################
# 2. Architecture Code
#####################################
import torch
import torch.nn as nn
class FeedForward(nn.Module):
def __init__(self, cfg):
super().__init__()
self.fc1 = nn.Linear(cfg["emb_dim"], cfg["hidden_dim"], dtype=cfg["dtype"], bias=False)
self.fc2 = nn.Linear(cfg["emb_dim"], cfg["hidden_dim"], dtype=cfg["dtype"], bias=False)
self.fc3 = nn.Linear(cfg["hidden_dim"], cfg["emb_dim"], dtype=cfg["dtype"], bias=False)
def forward(self, x):
x_fc1 = self.fc1(x)
x_fc2 = self.fc2(x)
x = nn.functional.silu(x_fc1) * x_fc2
return self.fc3(x)
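# Note (added comment): this is the SwiGLU-style feed-forward used in Llama models;
# fc1 acts as the gate projection, fc2 as the up projection, and fc3 projects back down to emb_dim.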
def precompute_rope_params(head_dim, theta_base=10_000, context_length=4096, freq_config=None):
assert head_dim % 2 == 0, "Embedding dimension must be even"
# Compute the inverse frequencies
inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim))
# Frequency adjustments
if freq_config is not None:
low_freq_wavelen = freq_config["original_context_length"] / freq_config["low_freq_factor"]
high_freq_wavelen = freq_config["original_context_length"] / freq_config["high_freq_factor"]
wavelen = 2 * torch.pi / inv_freq
inv_freq_llama = torch.where(
wavelen > low_freq_wavelen, inv_freq / freq_config["factor"], inv_freq
)
smooth_factor = (freq_config["original_context_length"] / wavelen - freq_config["low_freq_factor"]) / (
freq_config["high_freq_factor"] - freq_config["low_freq_factor"]
)
smoothed_inv_freq = (
(1 - smooth_factor) * (inv_freq / freq_config["factor"]) + smooth_factor * inv_freq
)
is_medium_freq = (wavelen <= low_freq_wavelen) & (wavelen >= high_freq_wavelen)
inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
inv_freq = inv_freq_llama
# Generate position indices
positions = torch.arange(context_length)
# Compute the angles
angles = positions[:, None] * inv_freq[None, :] # Shape: (context_length, head_dim // 2)
# Expand angles to match the head_dim
angles = torch.cat([angles, angles], dim=1) # Shape: (context_length, head_dim)
# Precompute sine and cosine
cos = torch.cos(angles)
sin = torch.sin(angles)
return cos, sin
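# Quick illustration of the precomputed RoPE tables (a sketch, commented out to keep startup fast):
# cos, sin = precompute_rope_params(head_dim=66, theta_base=500_000.0, context_length=512,
#                                   freq_config=LLAMA32_CONFIG["rope_freq"])
# print(cos.shape, sin.shape)  # both (512, 66): one row of angles per position, spanning head_dim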
def compute_rope(x, cos, sin):
# x: (batch_size, num_heads, seq_len, head_dim)
batch_size, num_heads, seq_len, head_dim = x.shape
assert head_dim % 2 == 0, "Head dimension must be even"
# Split x into first half and second half
x1 = x[..., : head_dim // 2] # First half
x2 = x[..., head_dim // 2 :] # Second half
# Adjust sin and cos shapes
cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, head_dim)
sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)
# Apply the rotary transformation
rotated = torch.cat((-x2, x1), dim=-1)
x_rotated = (x * cos) + (rotated * sin)
return x_rotated.to(dtype=x.dtype)
class SharedBuffers:
_buffers = {}
@staticmethod
def get_buffers(context_length, head_dim, rope_base, freq_config, dtype=torch.float32):
key = (context_length, head_dim, rope_base, tuple(freq_config.values()) if freq_config else freq_config, dtype)
if key not in SharedBuffers._buffers:
# Create or fetch the buffers
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
cos, sin = precompute_rope_params(head_dim, rope_base, context_length, freq_config)
if dtype is not None:
cos = cos.to(dtype)
sin = sin.to(dtype)
SharedBuffers._buffers[key] = (mask, cos, sin)
return SharedBuffers._buffers[key]
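# Note (added comment): SharedBuffers caches the causal mask and the RoPE cos/sin tables per
# (context_length, head_dim, rope_base, freq_config, dtype) key, so every TransformerBlock reuses
# the same tensors; this is what the `is` checks after model creation below verify.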
class GroupedQueryAttention(nn.Module):
def __init__(
self, d_in, d_out, context_length, num_heads,
num_kv_groups,
rope_base=10_000,
rope_config=None,
dtype=None
):
super().__init__()
assert d_out % num_heads == 0, f"d_out:{d_out} must be divisible by num_heads:{num_heads}"
assert num_heads % num_kv_groups == 0, "num_heads must be divisible by num_kv_groups"
self.d_out = d_out
self.num_heads = num_heads
self.head_dim = d_out // num_heads
self.W_key = nn.Linear(d_in, num_kv_groups * self.head_dim, bias=False, dtype=dtype)
self.W_value = nn.Linear(d_in, num_kv_groups * self.head_dim, bias=False, dtype=dtype)
self.num_kv_groups = num_kv_groups
self.group_size = num_heads // num_kv_groups
self.W_query = nn.Linear(d_in, d_out, bias=False, dtype=dtype)
self.out_proj = nn.Linear(d_out, d_out, bias=False, dtype=dtype)
# Fetch buffers using SharedBuffers
mask, cos, sin = SharedBuffers.get_buffers(context_length, self.head_dim, rope_base, rope_config, dtype)
self.register_buffer("mask", mask)
self.register_buffer("cos", cos)
self.register_buffer("sin", sin)
def forward(self, x):
b, num_tokens, d_in = x.shape
queries = self.W_query(x) # Shape: (b, num_tokens, d_out)
keys = self.W_key(x) # Shape: (b, num_tokens, num_kv_groups * head_dim)
values = self.W_value(x) # Shape: (b, num_tokens, num_kv_groups * head_dim)
# Reshape queries, keys, and values
queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
keys = keys.view(b, num_tokens, self.num_kv_groups, self.head_dim)
values = values.view(b, num_tokens, self.num_kv_groups, self.head_dim)
# Transpose keys, values, and queries
        keys = keys.transpose(1, 2)  # Shape: (b, num_kv_groups, num_tokens, head_dim)
        values = values.transpose(1, 2)  # Shape: (b, num_kv_groups, num_tokens, head_dim)
        queries = queries.transpose(1, 2)  # Shape: (b, num_heads, num_tokens, head_dim)
# Apply RoPE
keys = compute_rope(keys, self.cos, self.sin)
queries = compute_rope(queries, self.cos, self.sin)
# Expand keys and values to match the number of heads
# Shape: (b, num_heads, num_tokens, head_dim)
keys = keys.repeat_interleave(self.group_size, dim=1) # Shape: (b, num_heads, num_tokens, head_dim)
values = values.repeat_interleave(self.group_size, dim=1) # Shape: (b, num_heads, num_tokens, head_dim)
# For example, before repeat_interleave along dim=1 (query groups):
# [K1, K2]
# After repeat_interleave (each query group is repeated group_size times):
# [K1, K1, K2, K2]
# If we used regular repeat instead of repeat_interleave, we'd get:
# [K1, K2, K1, K2]
# Compute scaled dot-product attention (aka self-attention) with a causal mask
# Shape: (b, num_heads, num_tokens, num_tokens)
attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head
# Original mask truncated to the number of tokens and converted to boolean
mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
# Use the mask to fill attention scores
attn_scores.masked_fill_(mask_bool, -torch.inf)
attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
assert keys.shape[-1] == self.head_dim
# Shape: (b, num_tokens, num_heads, head_dim)
context_vec = (attn_weights @ values).transpose(1, 2)
# Combine heads, where self.d_out = self.num_heads * self.head_dim
context_vec = context_vec.reshape(b, num_tokens, self.d_out)
context_vec = self.out_proj(context_vec) # optional projection
return context_vec
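# Illustrative (commented-out) shape check for GroupedQueryAttention with the config values above;
# a sketch of how the module is used inside TransformerBlock below:
# gqa = GroupedQueryAttention(d_in=1320, d_out=1320, context_length=512, num_heads=20,
#                             num_kv_groups=5, rope_base=500_000.0,
#                             rope_config=LLAMA32_CONFIG["rope_freq"], dtype=torch.bfloat16)
# out = gqa(torch.randn(1, 16, 1320, dtype=torch.bfloat16))
# print(out.shape)  # expected: torch.Size([1, 16, 1320])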
class TransformerBlock(nn.Module):
def __init__(self, cfg):
super().__init__()
self.att = GroupedQueryAttention(
d_in=cfg["emb_dim"],
d_out=cfg["emb_dim"],
context_length=cfg["context_length"],
num_heads=cfg["n_heads"],
num_kv_groups=cfg["n_kv_groups"],
rope_base=cfg["rope_base"],
rope_config=cfg["rope_freq"],
dtype=cfg["dtype"]
)
self.ff = FeedForward(cfg)
self.norm1 = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
self.norm2 = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
def forward(self, x):
# Shortcut connection for attention block
shortcut = x
x = self.norm1(x)
x = self.att(x.to(torch.bfloat16)) # Shape [batch_size, num_tokens, emb_size]
x = x + shortcut # Add the original input back
# Shortcut connection for feed-forward block
shortcut = x
x = self.norm2(x)
x = self.ff(x.to(torch.bfloat16))
x = x + shortcut # Add the original input back
return x
class Llama3Model(nn.Module):
def __init__(self, cfg):
super().__init__()
self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"], dtype=cfg["dtype"])
self.trf_blocks = nn.Sequential(
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
self.final_norm = nn.RMSNorm(cfg["emb_dim"], eps=1e-5)
self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False, dtype=cfg["dtype"])
def forward(self, in_idx):
tok_embeds = self.tok_emb(in_idx)
x = tok_embeds
x = self.trf_blocks(x)
x = self.final_norm(x)
logits = self.out_head(x.to(torch.bfloat16))
return logits
#####################################
# 3. Load Tokenizer
#####################################
import os
from transformers import PreTrainedTokenizerFast
class Tokenizer:
def __init__(self, tokenizer_model_path):
assert os.path.isfile(tokenizer_model_path), f"Tokenizer Model file {tokenizer_model_path} not found"
        # load the tokenizer here
self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_model_path)
# previously added
# self.special_tokens = {
# "<|begin_of_text|>": 128000,
# "<|end_of_text|>": 128001,
# "<|start_header_id|>": 128006,
# "<|end_header_id|>": 128007,
# "<|eot_id|>": 128009,
# }
def encode(self, text, bos=False, eos=False):
'''
parameter: allowed_special removed
parameter: disallowed_special removed
'''
if bos:
# tokens = [self.special_tokens["<|begin_of_text|>"]]
tokens = self.tokenizer.encode('<|begin_of_text|>') # [50000]
else:
tokens = []
tokens += self.tokenizer.encode(text)
if eos:
# tokens.append(self.special_tokens["<|end_of_text|>"])
tokens.append(self.tokenizer.encode('<|end_of_text|>')[0]) # [50001]
return tokens
def decode(self, tokens):
# return self.model.decode(tokens)
return self.tokenizer.decode(tokens)
class ChatFormat:
def __init__(self, tokenizer):
self.tokenizer = tokenizer
def encode_header(self, message):
tokens = []
tokens.append(self.tokenizer.tokenizer.encode('<|start_header_id|>')[0]) # 50002
tokens.extend(self.tokenizer.encode(message["भूमिका"], bos=False, eos=False))
tokens.append(self.tokenizer.tokenizer.encode('<|end_header_id|>')[0])
tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
# tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"])
# tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False))
# tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"])
# tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False))
return tokens
def encode(self, text):
message = {
"भूमिका": "प्रयोगकर्ता",
"सन्दर्भ": text
}
tokens = self.encode_header(message)
tokens.extend(
self.tokenizer.encode(message["सन्दर्भ"].strip(), bos=False, eos=False)
)
# tokens.append(self.tokenizer.special_tokens["<|eot_id|>"])
tokens.append(self.tokenizer.tokenizer.encode('<|eot_id|>')[0])
return tokens
def decode(self, token_ids):
return self.tokenizer.decode(token_ids)
_tokenizer = Tokenizer("NepaliBPE/tokenizer.json")
chat_tokenizer = ChatFormat(_tokenizer)
# text = "नेपाल विद्युत प्राधिकरणका कार्यकारी निर्देशक कुलमान घिसिङले माथिल्लो अरुण जलविद्युत आयोजना विश्व बैंक र एडीबीबाट वित्तीय व्यवस्थापन नभए नेपाली जनताको लगानीमा बनाउने तयारी रहेको बताएका छन् ।"
# # normal tokenizer
# print([tokenizer.tokenizer.decode([token]) for token in tokenizer.encode(text)])
# # formatted tokenizer
# print([tokenizer.tokenizer.decode([token]) for token in chat_tokenizer.encode(text)])
#####################################
# 4. Generate Text
####################################
def text_to_token_ids(text, tokenizer):
encoded = tokenizer.encode(text)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
return encoded_tensor
# '''
# We modified Sebastian's original return statement above because the tokenizer has no tokens like
# 'start_header_id' and 'end_header_id' and returns None for them, which in turn raises an error.
# TODO: add the special tokens 'start_header_id' and 'end_header_id' and restore the original return statement.
# '''
# print(encoded_tensor)
# return torch.tensor([token for token in encoded_tensor]) # TODO: use additional vocab like encoded_tensor
def token_ids_to_text(token_ids, tokenizer):
flat = token_ids.squeeze(0) # remove batch dimension
return tokenizer.decode(flat.tolist())
def generate(model, idx, max_new_tokens, context_length, temperature=0.0, top_k=None, eos_id=None):
# For-loop is the same as before: Get logits, and only focus on last time step
for _ in range(max_new_tokens):
idx_cond = idx[:, -context_length:]
with torch.no_grad():
logits = model(idx_cond)
logits = logits[:, -1, :]
# New: Filter logits with top_k sampling
if top_k is not None:
# Keep only top_k values
top_logits, _ = torch.topk(logits, top_k)
min_val = top_logits[:, -1]
logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
# New: Apply temperature scaling
if temperature > 0.0:
logits = logits / temperature
# Apply softmax to get probabilities
probs = torch.softmax(logits, dim=-1) # (batch_size, context_len)
# Sample from the distribution
idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1)
# Otherwise same as before: get idx of the vocab entry with the highest logits value
else:
idx_next = torch.argmax(logits, dim=-1, keepdim=True) # (batch_size, 1)
if idx_next == eos_id: # Stop generating early if end-of-sequence token is encountered and eos_id is specified
break
# Same as before: append sampled index to the running sequence
idx = torch.cat((idx, idx_next), dim=1) # (batch_size, num_tokens+1)
return idx
def generate_and_print_sample(PROMPT, tokenizer, chat_tokenizer, model, device, context_length):
# PROMPT = "What do llamas eat?"
# PROMPT="रामले भात"
torch.manual_seed(123)
# token_ids = generate(
# model=model,
# idx=text_to_token_ids(PROMPT, chat_tokenizer).to(device),
# max_new_tokens=150,
# context_length=context_length,
# temperature=0.5,
# top_k=1,
# eos_id=tokenizer.eos_token_id
# )
# output_text = token_ids_to_text(token_ids, tokenizer)
# We have re-defined generate function below.
output_text = generate(
model=model,
prompt=PROMPT,
tokenizer=tokenizer,
max_new_tokens=150,
)
print("Output text:\n", clean_text(output_text))
# -------------------------------------------------------------
    # Generate sample text
# PROMPT = "लामा हरु ले के खान्छन् ?"
# torch.manual_seed(123)
# token_ids = generate(
# model=model,
# idx=text_to_token_ids(PROMPT, chat_tokenizer).to(device),
# max_new_tokens=150,
# context_size=LLAMA32_CONFIG["context_length"],
# top_k=1,
# temperature=0.
# )
# output_text = token_ids_to_text(token_ids, tokenizer)
# -------------------------------------------------------------
def clean_text(text, header_end="प्रयोगकर्ता <|end_header_id|>\n\n"):
# Find the index of the first occurrence of "<|end_header_id|>"
index = text.find(header_end)
if index != -1:
# Return the substring starting after "<|end_header_id|>"
return text[index + len(header_end):].strip() # Strip removes leading/trailing whitespace
else:
# If the token is not found, return the original text
return text
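# Example of what clean_text does (a sketch with a made-up string): everything up to and including
# the chat header "प्रयोगकर्ता <|end_header_id|>\n\n" is stripped, leaving only the generated reply:
# print(clean_text("<|start_header_id|> प्रयोगकर्ता <|end_header_id|>\n\n रामले भात खायो"))  # -> "रामले भात खायो"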
# print("Output text:\n", clean_text(output_text))
##########################################################################
# Chapter 5 (keep everything as it is except `generate_and_print_sample` function)
########################################################################
def calc_loss_batch(input_batch, target_batch, model, device):
input_batch, target_batch = input_batch.to(device), target_batch.to(device)
logits = model(input_batch)
loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
return loss
def calc_loss_loader(data_loader, model, device, num_batches=None, len_data_loader=0):
    '''
    - parameter `len_data_loader=0` added so the number of batches can be passed in explicitly,
      since len(data_loader) cannot be computed for an iterable (streaming) dataset
    '''
total_loss = 0.
if len_data_loader == 0: # len(data_loader)
return float("nan")
elif num_batches is None:
num_batches = len_data_loader
else:
num_batches = min(num_batches, len_data_loader)
for i, (input_batch, target_batch) in enumerate(data_loader):
if i < num_batches:
loss = calc_loss_batch(input_batch, target_batch, model, device)
total_loss += loss.item()
else:
break
return total_loss / num_batches
def evaluate_model(model, train_loader, val_loader, device, eval_iter, len_train_loader=0, len_val_loader=0):
model.eval()
with torch.no_grad():
train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter, len_data_loader = len_train_loader)
val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter, len_data_loader = len_val_loader)
model.train()
return train_loss, val_loss
def generate(
model,
prompt,
tokenizer,
max_new_tokens,
temperature=0.7,
top_k=50,
top_p=None, # New parameter for nucleus sampling
eos_id=None,
repetition_penalty=1.2,
penalize_len_below=50,
context_size = 512
):
# context_size = GPT_CONFIG_124M['context_length']
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
idx = text_to_token_ids(prompt, tokenizer).to(device)
if not eos_id:
encoded_endoftext = tokenizer.encode("<|endoftext|>")
eos_id = encoded_endoftext[0] if encoded_endoftext else None
token_freq = {}
for step in range(max_new_tokens):
idx_cond = idx[:, -context_size:]
with torch.no_grad():
logits = model(idx_cond)
logits = logits[:, -1, :]
# Apply repetition penalty
for token_id in idx[0].tolist():
if token_id in token_freq:
logits[0, token_id] /= repetition_penalty
else:
token_freq[token_id] = 1
# Penalize EOT token for shorter sequences
if eos_id is not None and step < penalize_len_below:
logits[0, eos_id] /= (penalize_len_below - step) / penalize_len_below
# Apply temperature scaling
if temperature > 0.0:
logits = logits / temperature
# Convert logits to probabilities
probs = torch.softmax(logits, dim=-1)
# Apply top-p (nucleus) sampling if specified
if top_p:
sorted_probs, sorted_indices = torch.sort(probs, descending=True)
cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right so that the first token above the threshold is also kept
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# Create a mask for indices to remove
indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
probs = probs.masked_fill(indices_to_remove, 0.0)
# Renormalize probabilities
probs = probs / probs.sum(dim=-1, keepdim=True)
# If top_p is None, apply top-k sampling
elif top_k:
top_probs, top_indices = torch.topk(probs, top_k)
probs = torch.zeros_like(probs).scatter_(-1, top_indices, top_probs)
# Renormalize probabilities
probs = probs / probs.sum(dim=-1, keepdim=True)
# Sample from the filtered distribution
if temperature > 0.0:
idx_next = torch.multinomial(probs, num_samples=1)
else:
idx_next = torch.argmax(probs, dim=-1, keepdim=True)
if idx_next == eos_id:
break
idx = torch.cat((idx, idx_next), dim=1)
text = token_ids_to_text(idx, tokenizer)
return text
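# Commented-out usage sketch of the generate() defined above (this is the version that
# generate_and_print_sample calls); the prompt is only an example:
# sample_text = generate(model=model, prompt="रामले भात", tokenizer=_tokenizer,
#                        max_new_tokens=50, temperature=0.7, top_k=50)
# print(clean_text(sample_text))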
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, output_dir):
fig, ax1 = plt.subplots()
# Plot training and validation loss against epochs
ax1.plot(epochs_seen, train_losses, label="Training loss")
ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Loss")
ax1.legend(loc="upper right")
ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
# Create a second x-axis for tokens seen
ax2 = ax1.twiny() # Create a second x-axis that shares the same y-axis
ax2.plot(tokens_seen, train_losses, alpha=0) # Invisible plot for aligning ticks
ax2.set_xlabel("Tokens seen")
fig.tight_layout() # Adjust layout to make room
plt.savefig(output_dir / "losses.pdf")
# --------------------------------------------------------------------------------
# -------------------------- New Chat function ---------------------
# --------------------------------------------------------------------------------
def generate_chat_optimized(
model,
prompt,
tokenizer,
chat_tokenizer,
max_new_tokens,
context_size,
temperature=0.7,
top_k=50,
top_p=None,
eos_id=None,
repetition_penalty=1.2,
penalize_len_below=50,
device=None,
batch_size=1, # Added parameter
clean_the_text=True
):
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
idx = text_to_token_ids(prompt, chat_tokenizer).to(device)
# Find EOS token once instead of checking every time
if not eos_id:
if "<|endoftext|>" in tokenizer.get_vocab():
encoded_endoftext = tokenizer.encode("<|endoftext|>")
eos_id = encoded_endoftext[0] if encoded_endoftext else None
elif "<|eot_id|>" in tokenizer.get_vocab():
encoded_endoftext = tokenizer.encode("<|eot_id|>")
eos_id = encoded_endoftext[0] if encoded_endoftext else None
# Pre-compute token frequencies for the initial context
token_freq = {}
for token_id in idx[0].tolist():
if token_id in token_freq:
token_freq[token_id] += 1
else:
token_freq[token_id] = 1
# Process tokens in batches for efficiency
with torch.no_grad(): # Move this outside the loop
for step in range(0, max_new_tokens, batch_size):
batch_end = min(step + batch_size, max_new_tokens)
current_batch_size = batch_end - step
idx_cond = idx[:, -context_size:]
logits = model(idx_cond)
logits = logits[:, -1, :]
# Apply repetition penalty once for the batch
for token_id in idx[0].tolist()[-current_batch_size:]:
if token_id in token_freq:
token_freq[token_id] += 1
logits[0, token_id] /= repetition_penalty
else:
token_freq[token_id] = 1
# Process each token in the batch
for i in range(current_batch_size):
current_step = step + i
# Penalize EOT token for shorter sequences
current_logits = logits.clone() # Work with a copy
if eos_id is not None and current_step < penalize_len_below:
penalty_factor = 1.0 + (penalize_len_below - current_step) / penalize_len_below
current_logits[0, eos_id] /= penalty_factor
# Apply temperature scaling
if temperature > 0.0:
current_logits = current_logits / temperature
# Convert logits to probabilities
probs = torch.softmax(current_logits, dim=-1)
# Apply sampling strategies
if top_p and top_p > 0.0:
# Nucleus sampling implementation
sorted_probs, sorted_indices = torch.sort(probs, descending=True)
cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
probs = probs.masked_fill(indices_to_remove, 0.0)
probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
elif top_k and top_k > 0:
# Top-k sampling implementation
top_probs, top_indices = torch.topk(probs, min(top_k, probs.size(-1)))
probs = torch.zeros_like(probs).scatter_(-1, top_indices, top_probs)
probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-8)
# Sample from the filtered distribution
if temperature > 0.0:
idx_next = torch.multinomial(probs, num_samples=1)
else:
idx_next = torch.argmax(probs, dim=-1, keepdim=True)
# Add the next token to the sequence
idx = torch.cat((idx, idx_next), dim=1)
# Check for end of sequence token and break if needed
if idx_next.item() == eos_id:
output_text = token_ids_to_text(idx, tokenizer)
if clean_the_text:
# Clean the output
# cleaned_text = clean_chat_output(output_text)
cleaned_text = clean_text(output_text)
if '<|eot_id|>' in cleaned_text:
cleaned_text = cleaned_text.replace('<|eot_id|>','')
# print("Generated text:\n", cleaned_text)
return cleaned_text
return output_text
    # No end-of-text token was generated; stop because max_new_tokens has been reached.
output_text = token_ids_to_text(idx, tokenizer)
if clean_the_text:
# Clean the output
# cleaned_text = clean_chat_output(output_text)
cleaned_text = clean_text(output_text)
if '<|eot_id|>' in cleaned_text:
cleaned_text = cleaned_text.replace('<|eot_id|>','')
# print("Generated text:\n", cleaned_text)
return cleaned_text
return output_text
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# ==================================================================================-
# =============================================
# =============================================
# =============================================
# Below is the code to initialize the model
# =============================================
# =============================================
# =============================================
# import torch # already imported
# from previous_chapters4 import (
# Llama3Model,
# ChatFormat,
# Tokenizer,
# generate_and_print_sample
# )
old_context_length = 131_072 # original context length of llama3.2 model
new_context_length = LLAMA32_CONFIG["context_length"] # 512 our new context length
def rescale_theta(theta_old, context_length_old, context_length_new):
    # linear scaling of the RoPE base with the new context length
scaling_factor = context_length_new / context_length_old
theta_new = theta_old * scaling_factor
return theta_new
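# Worked example of the rescaling with the values above:
# theta_new = 500_000 * 512 / 131_072 = 1953.125,
# so the printout below should show LLAMA32_CONFIG["rope_base"] == 1953.125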
LLAMA32_CONFIG["rope_base"] = rescale_theta(
LLAMA32_CONFIG["rope_base"],
old_context_length,
new_context_length
)
print("New RoPE theta (i.e. LLAMA32_CONFIG[\"rope_base\"]):", LLAMA32_CONFIG["rope_base"])
model = Llama3Model(LLAMA32_CONFIG)
# TODO: reconsider whether to compile (Claude Sonnet 3.7 suggested torch.compile would speed up inference)
# compile the model
compile_model = True  # set to False to skip torch.compile
if compile_model:
    print("compiling the model... (takes about a minute)")
unoptimized_model = model
model = torch.compile(model) # requires PyTorch 2.0
model.eval() # eval mode
# Check buffers
# --------------
print('The following is expected to print True to confirm buffers are reused instead of being (wastefully) recreated:')
print(model.trf_blocks[0].att.mask is model.trf_blocks[-1].att.mask)
print(model.trf_blocks[0].att.cos is model.trf_blocks[-1].att.cos)
print(model.trf_blocks[0].att.sin is model.trf_blocks[-1].att.sin)
# Display number of parameters
# -----------------------------
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")
# Account for weight tying
total_params_normalized = total_params - model.tok_emb.weight.numel()
print(f"\nTotal number of unique parameters: {total_params_normalized:,}")
# Display model_memory_size
# -----------------------------------------------------------------------
def model_memory_size(model, input_dtype=torch.float32):
total_params = 0
total_grads = 0
for param in model.parameters():
# Calculate total number of elements per parameter
param_size = param.numel()
total_params += param_size
# Check if gradients are stored for this parameter
if param.requires_grad:
total_grads += param_size
# Calculate buffer size (non-parameters that require memory)
total_buffers = sum(buf.numel() for buf in model.buffers())
# Size in bytes = (Number of elements) * (Size of each element in bytes)
# We assume parameters and gradients are stored in the same type as input dtype
element_size = torch.tensor(0, dtype=input_dtype).element_size()
total_memory_bytes = (total_params + total_grads + total_buffers) * element_size
# Convert bytes to gigabytes
total_memory_gb = total_memory_bytes / (1024**3)
return total_memory_gb
print(f"float32 (PyTorch default): {model_memory_size(model, input_dtype=torch.float32):.2f} GB")
print(f"bfloat16: {model_memory_size(model, input_dtype=torch.bfloat16):.2f} GB")
# -----------------------------------------------------------------------
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
model.to(device)
print(f'device: {device}')
latest_model_checkpoint = "parameters_300m/model_pg_398000_steps.pth"
checkpoint = torch.load(latest_model_checkpoint, map_location=device, weights_only=False)
# modified (added model loading code)
model.load_state_dict(checkpoint["model_state_dict"])
# generate_and_print_sample(PROMPT="रामले भात", tokenizer=_tokenizer, chat_tokenizer=chat_tokenizer, model=model, device=device, context_length = LLAMA32_CONFIG["context_length"])
# from previous_chapters import generate_and_print_chat
# generated_text = generate_and_print_chat(
# prompt="रामले भात",
# tokenizer=tokenizer,
# chat_tokenizer=chat_tokenizer,
# model=model,
# device=None,
# max_new_tokens=150,
# context_length=None,
# temperature=0.1,
# top_k=50,
# top_p=0.9,
# repetition_penalty=1.2,
# clean_the_text=True
# )
# print(generated_text)
# =============================================
# =============================================
# =============================================
def generate_text(prompt, max_new_tokens, top_k, top_p, temperature, repetition_penalty, penalize_len_below):
return generate_chat_optimized(
model=model,
prompt=prompt,
tokenizer=tokenizer,
chat_tokenizer=chat_tokenizer,
max_new_tokens=max_new_tokens,
context_size=LLAMA32_CONFIG['context_length'],
temperature=temperature,
top_k=top_k,
top_p=top_p,
eos_id=None,
repetition_penalty=repetition_penalty,
penalize_len_below=penalize_len_below,
device=device,
batch_size=1
)
css = """
#bright-textbox {
background-color: #ffeb3b; /* Bright yellow */
color: #000000; /* Black text for contrast */
border: 2px solid #fbc02d; /* Slightly darker yellow for the border */
font-size: 16px;
padding: 10px;
border-radius: 5px;
}
"""
# Create Gradio interface
with gr.Blocks(title="LLAMA3_Nepali_318M Text Generator", css=css) as interface:
gr.Markdown("# LLAMA3_Nepali_318M Text Generator")
gr.Markdown("Enter Nepali (नेपाली) text to generate content using the custom LLAMA3_Nepali_318M model.")
with gr.Row():
with gr.Column():
prompt = gr.Textbox(
label="Prompt",
placeholder="यहाँ नेपाली मा इन्पुट दिनु होस् ... (please Enter Nepali text here...)" #,
# value="रामले भात"
)
max_tokens = gr.Slider(minimum=1, maximum=512, value=25, step=1, label="Max New Tokens")
with gr.Row():
with gr.Column():
temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.3, step=0.1, label="Temperature")
repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition Penalty")
with gr.Column():
top_k = gr.Slider(minimum=0, maximum=100, value=5, step=1, label="Top K (set to 0 to use Top P)")
top_p = gr.Slider(minimum=0, maximum=1.0, value=0.9, step=0.05, label="Top P (set above 0 to use instead of Top K)")
min_length = gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Minimum Length Penalty")
generate_btn = gr.Button("Generate Text")
with gr.Column():
output = gr.Textbox(label="Generated Text", lines=10)
# Add examples if you have any
gr.Examples(
examples=[
# ["रामले भात", 25, 10, 0, 0.7, 1.2, 15],
# ["नेपाल एउटा", 25, 10, 0.9, 0.5, 1.2, 10],
["नेपाल का वर्तमान प्रधानमन्त्री ", 25, 10, 0.4, 0.8, 1.2, 10],
["भारतीय प्रधानमन्त्री ", 25, 10, 0.9, 0.5, 1.2, 15],
["अमिरिकी रास्ट्रपति डोनाल्ड", 25, 10, 0.9, 0.6, 1.2, 15],
],
inputs=[prompt, max_tokens, top_k, top_p, temperature, repetition_penalty, min_length],
outputs=output,
fn=generate_text,
cache_examples=True,
)
generate_btn.click(
fn=generate_text,
        inputs=[prompt, max_tokens, top_k, top_p, temperature, repetition_penalty, min_length],
outputs=output
)
interface.launch()