"""Evaluate checkpointed revisions of the distributed GPT-2 models.

For each epoch tag on the Hugging Face Hub, load the corresponding revision,
compute the average loss over a random slice of evaluation rows, store the
result in results.json, and prune older revisions from the local Hub cache
to limit disk usage.
"""

import json
import os
import random
import time

import torch
from distributed_training.data.dataset import DataLoader
from huggingface_hub import create_tag, list_repo_refs, scan_cache_dir
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
test_indices_length = 1000
AUTOMATE = True

models = [
    "distributed/optimized-gpt2-1b",
    "distributed/optimized-gpt2-500m",
    "distributed/optimized-gpt2-250m",
    "distributed/optimized-gpt2-250m-v0.1.3",
    "distributed/optimized-gpt2-250m-v0.1.1",
    "distributed/gpt2-94m",
]

# Resume from previous results if they exist
if os.path.exists("results.json"):
    with open("results.json", "r") as file:
        results = json.load(file)
else:
    results = {}

while True:
    for model_name in [models[0]]:
        if model_name not in results:
            results[model_name] = {}
        if "main-net" not in results[model_name]:
            results[model_name]["main-net"] = {}

        tokenizer = AutoTokenizer.from_pretrained(
            "distributed/optimized-gpt2-250m", trust_remote_code=True
        )

        # Each completed epoch is published as a numeric tag on the model repo
        refs = list_repo_refs(model_name, repo_type="model")
        global_epoch = max([int(tag.name) for tag in refs.tags]) if refs.tags else None

        if global_epoch is None:
            print(f"No epoch tags found for {model_name}")
            time.sleep(30 * 60)
            continue

        if str(global_epoch) in results[model_name]["main-net"]:
            print(f"Results for epoch {global_epoch} already calculated")
            time.sleep(30 * 60)
            continue

        for epoch in range(global_epoch):
            if str(epoch) in results[model_name]["main-net"]:
                continue

            model = AutoModelForCausalLM.from_pretrained(
                model_name, revision=str(epoch), trust_remote_code=True
            )
            model = model.to(device)

            # Sample a contiguous block of evaluation rows
            search_start = random.choice(
                range(DataLoader.max_pages - test_indices_length + 1)
            )
            group = list(range(search_start, search_start + test_indices_length))
            dataloader = DataLoader(
                batch_size=1,
                sequence_length=1024,
                rows=group,
            )

            total_loss = 0
            index = 0

            # Evaluate this revision over the sampled rows (no weight updates)
            model.eval()
            with torch.no_grad():
                for index, batch in enumerate(dataloader):
                    inputs = batch[0].to(device)
                    labels = batch[1].to(device)

                    if len(inputs[0]) != len(labels[0]):
                        breakpoint()

                    # The "optimized" models return the loss as the second element
                    # of a tuple; the others expose it as outputs.loss
                    if "optimized" in model_name:
                        outputs = model(input_ids=inputs, labels=labels)
                        loss = outputs[1]
                    else:
                        outputs = model(input_ids=inputs, labels=inputs)
                        loss = outputs.loss

                    # Accumulate total loss
                    total_loss += loss.detach().item()

            average_loss = total_loss / (index + 1)
            results[model_name]["main-net"][str(epoch)] = [average_loss]
            print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")

            with open("results.json", "w") as outfile:
                json.dump(results, outfile, indent=4)

            # Prune older revisions of this model from the local Hub cache,
            # keeping the revision that was just evaluated
            current_revision = model.config._commit_hash
            keep_recent = 1
            try:
                cache_info = scan_cache_dir()
                for repo in cache_info.repos:
                    if repo.repo_id == model_name:
                        revisions = sorted(
                            repo.revisions, key=lambda r: r.last_modified, reverse=True
                        )
                        current_index = next(
                            (
                                i
                                for i, r in enumerate(revisions)
                                if r.commit_hash == current_revision
                            ),
                            None,
                        )
                        if current_index is not None:
                            for revision in revisions[max(current_index + 1, keep_recent):]:
                                cache_info.delete_revisions(revision.commit_hash).execute()
                        break
            except Exception:
                print(
                    "Failed to delete previous model version from cache. "
                    "This might lead to 100% disk space utilisation in the future."
                )
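
# Usage note (assumptions: the script filename below is hypothetical, a
# CUDA-capable GPU is available, and the working directory is writable so
# results.json can be created/updated):
#
#   python evaluate_checkpoints.py
#
# The script loops indefinitely, sleeping 30 minutes between checks for new
# epoch tags, so it is intended to be left running (e.g. under tmux or a
# process supervisor).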