# model_convergence/evaluate.py
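"""Continuously evaluate checkpoints of the distributed GPT-2 models.

For each tagged epoch of the target Hugging Face repo, this script loads the
corresponding revision, computes the average language-modelling loss on a
random contiguous slice of the evaluation dataset, appends the result to
results.json, and prunes older cached revisions to keep disk usage bounded.
"""
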
import json
import os
import random
import time

import torch
from distributed_training.data.dataset import DataLoader
from huggingface_hub import create_tag, list_repo_refs, scan_cache_dir
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
test_indices_length = 1000
AUTOMATE = True

models = [
    "distributed/optimized-gpt2-1b",
    "distributed/optimized-gpt2-500m",
    "distributed/optimized-gpt2-250m",
    "distributed/optimized-gpt2-250m-v0.1.3",
    "distributed/optimized-gpt2-250m-v0.1.1",
    "distributed/gpt2-94m",
]

if os.path.exists("results.json"):
    with open("results.json", "r") as file:
        results = json.load(file)
else:
    results = {}

while True:
    # Only the first model in the list is currently evaluated.
    for model_name in [models[0]]:
        if model_name not in results:
            results[model_name] = {"main-net": {}}

        tokenizer = AutoTokenizer.from_pretrained(
            "distributed/optimized-gpt2-250m", trust_remote_code=True
        )

        refs = list_repo_refs(model_name, repo_type="model")
        global_epoch = max(int(tag.name) for tag in refs.tags) if refs.tags else None

        # Epoch results are keyed by string, so compare against str(global_epoch).
        if str(global_epoch) in results[model_name]["main-net"]:
            print(f"Results for epoch {global_epoch} already calculated")
            time.sleep(30 * 60)
            continue

        for epoch in range(0, global_epoch, 1):
            if str(epoch) in results[model_name]["main-net"]:
                continue
            model = AutoModelForCausalLM.from_pretrained(
                model_name, revision=str(epoch), trust_remote_code=True
            )
            model = model.to(device)

            # Sample a random contiguous block of dataset rows to evaluate on
            search_start = random.choice(
                range(DataLoader.max_pages - test_indices_length + 1)
            )
            group = list(range(search_start, search_start + test_indices_length))

            dataloader = DataLoader(
                batch_size=1,
                sequence_length=1024,
                rows=group,
            )
            total_loss = 0
            index = 0

            # Evaluate the checkpoint on the sampled rows
            for index, batch in enumerate(dataloader):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)

                # Inputs and labels are expected to have matching lengths
                if len(inputs[0]) != len(labels[0]):
                    breakpoint()

                if "optimized" in model_name:
                    outputs = model(input_ids=inputs, labels=labels)
                    loss = outputs[1]
                else:
                    outputs = model(input_ids=inputs, labels=inputs)
                    loss = outputs.loss

                # Accumulate the total loss; no optimiser step is taken during evaluation
                total_loss += loss.detach().item()
                model.zero_grad()
            average_loss = total_loss / (index + 1)
            results[model_name]["main-net"][str(epoch)] = [average_loss]
            print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")

            with open("results.json", "w") as outfile:
                json.dump(results, outfile, indent=4)

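            # Prune older cached revisions of this repo so that downloading a
            # checkpoint per epoch does not fill the disk. Revisions newer than
            # the one just evaluated, plus the `keep_recent` most recent ones,
            # are retained.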
            current_revision = model.config._commit_hash
            keep_recent = 1
            try:
                cache_info = scan_cache_dir()
                for repo in cache_info.repos:
                    if repo.repo_id == model_name:
                        revisions = sorted(
                            repo.revisions, key=lambda r: r.last_modified, reverse=True
                        )
                        current_index = next(
                            (
                                i
                                for i, r in enumerate(revisions)
                                if r.commit_hash == current_revision
                            ),
                            None,
                        )
                        if current_index is not None:
                            for revision in revisions[max(current_index + 1, keep_recent):]:
                                cache_info.delete_revisions(revision.commit_hash).execute()
                        break
            except Exception:
                print(
                    "Failed to delete previous model version from cache. "
                    "This might lead to 100% disk space utilisation in the future."
                )