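# Evaluation loop: for each tagged epoch of a distributed-training checkpoint on
# the Hugging Face Hub, compute the average language-modelling loss over a
# randomly sampled window of dataset pages and append the result to results.json.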
import json
import os
import random
import time

import torch
from distributed_training.data.dataset import DataLoader
from huggingface_hub import list_repo_refs, scan_cache_dir
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" | |
test_indices_length = 1000 | |
AUTOMATE = True | |
models = ["distributed/optimized-gpt2-1b", "distributed/optimized-gpt2-500m", "distributed/optimized-gpt2-250m", "distributed/optimized-gpt2-250m-v0.1.3", "distributed/optimized-gpt2-250m-v0.1.1", "distributed/gpt2-94m"] | |
if os.path.exists("results.json"):
    with open("results.json", "r") as file:
        results = json.load(file)
else:
    results = {}
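
# Only models[0] is evaluated below; widen the slice in the for-loop to score the rest.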
while True:
    for model_name in [models[0]]:
        if model_name not in results:
            results[model_name] = {"main-net": {}}
        elif "main-net" not in results[model_name]:
            results[model_name]["main-net"] = {}

        # Shared tokenizer; not referenced in the evaluation loop below
        tokenizer = AutoTokenizer.from_pretrained("distributed/optimized-gpt2-250m", trust_remote_code=True)

        refs = list_repo_refs(model_name, repo_type="model")
        global_epoch = max(int(tag.name) for tag in refs.tags) if refs.tags else None
        if global_epoch is None:
            print(f"No epoch tags found for {model_name}")
            time.sleep(30 * 60)
            continue

        # Skip if every epoch up to the latest tag has already been scored
        if all(str(e) in results[model_name]["main-net"] for e in range(global_epoch)):
            print(f"Results for epoch {global_epoch} already calculated")
            time.sleep(30 * 60)
            continue
        for epoch in range(0, global_epoch, 1):
            if str(epoch) in results[model_name]["main-net"]:
                continue

            model = AutoModelForCausalLM.from_pretrained(model_name, revision=str(epoch), trust_remote_code=True)
            model = model.to(device)
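
            # Sample a random contiguous window of dataset pages so each epoch
            # is scored on a fresh slice rather than a fixed validation set.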
            search_start = random.choice(range(DataLoader.max_pages - test_indices_length + 1))
            group = list(range(search_start, search_start + test_indices_length))
            dataloader = DataLoader(
                batch_size=1,
                sequence_length=1024,
                rows=group,
            )
            total_loss = 0
            index = 0
            # Evaluate (not train) the checkpoint over the sampled pages
            for index, batch in enumerate(dataloader):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)

                # Guard against misaligned input/label pairs from the loader
                if len(inputs[0]) != len(labels[0]):
                    breakpoint()
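
                # Assumption: the custom "optimized" checkpoints (loaded via
                # trust_remote_code) return a tuple with the loss at index 1,
                # while stock transformers models expose it as outputs.loss.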
if "optimized" in model_name: | |
outputs = model(input_ids=inputs, labels=labels) | |
loss = outputs[1] | |
else: | |
outputs = model(input_ids=inputs, labels=inputs) | |
loss = outputs.loss | |
                # Accumulate total loss
                total_loss += loss.detach().item()

                # No backward pass is needed for evaluation; just drop any stray gradients
                model.zero_grad()

            average_loss = total_loss / (index + 1)
            results[model_name]["main-net"][str(epoch)] = [average_loss]
            print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")
with open("results.json", "w") as outfile: | |
json.dump(results, outfile, indent = 4) | |
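
            # Each epoch revision is a full checkpoint download, so prune older
            # revisions from the local Hugging Face cache to keep disk usage bounded.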
            current_revision = model.config._commit_hash
            keep_recent = 1
            try:
                cache_info = scan_cache_dir()
                for repo in cache_info.repos:
                    if repo.repo_id == model_name:
                        revisions = sorted(repo.revisions, key=lambda r: r.last_modified, reverse=True)
                        current_index = next(
                            (i for i, r in enumerate(revisions) if r.commit_hash == current_revision),
                            None,
                        )
                        if current_index is not None:
                            for revision in revisions[max(current_index + 1, keep_recent):]:
                                cache_info.delete_revisions(revision.commit_hash).execute()
                        break
            except Exception:
                print(
                    "Failed to delete previous model version from cache. "
                    "This might lead to 100% disk space utilisation in the future."
                )