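"""Continuously evaluate tagged checkpoints of a model repo.

For every epoch tag on the Hugging Face repo, load that revision, measure the
average causal-LM loss on a random contiguous slice of the dataset, record it
in results.json, and prune older cached revisions to keep disk usage bounded.
"""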
import json
import os
import random
import time

import torch
from distributed_training.data.dataset import DataLoader
from huggingface_hub import create_tag, list_repo_refs, scan_cache_dir
from transformers import AutoModelForCausalLM, AutoTokenizer
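# Note: `DataLoader` here is the project's own loader (it yields
# (input_ids, labels) batches over dataset rows), not torch.utils.data.DataLoader.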
device = "cuda"
test_indices_length = 1000
AUTOMATE = True
models = ["distributed/optimized-gpt2-1b", "distributed/optimized-gpt2-500m", "distributed/optimized-gpt2-250m", "distributed/optimized-gpt2-250m-v0.1.3", "distributed/optimized-gpt2-250m-v0.1.1", "distributed/gpt2-94m"]
if os.path.exists("results.json"):
    with open("results.json", "r") as file:
        results = json.load(file)
else:
    results = {}
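# results.json schema, as written further down:
#   {model_name: {"main-net": {str(epoch): [average_loss]}}}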
while True:
    # Only the first model in the list is evaluated on each pass
    for model_name in [models[0]]:
        if model_name not in results:
            results[model_name] = {}
        results[model_name].setdefault("main-net", {})

        tokenizer = AutoTokenizer.from_pretrained(
            "distributed/optimized-gpt2-250m", trust_remote_code=True
        )

        # Epoch tags on the repo mark checkpoint revisions; the highest tag
        # is the latest finished epoch
        refs = list_repo_refs(model_name, repo_type="model")
        global_epoch = max(int(tag.name) for tag in refs.tags) if refs.tags else None
        if global_epoch is None:
            time.sleep(30 * 60)
            continue
        if str(global_epoch) in results[model_name]["main-net"]:
            print(f"Results for epoch {global_epoch} already calculated")
            time.sleep(30 * 60)

        for epoch in range(0, global_epoch, 1):
            if str(epoch) in results[model_name]["main-net"]:
                continue
            model = AutoModelForCausalLM.from_pretrained(
                model_name, revision=str(epoch), trust_remote_code=True
            )
            model = model.to(device)
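            # Score this revision on a random contiguous window of
            # `test_indices_length` dataset rows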
            search_start = random.choice(
                range(DataLoader.max_pages - test_indices_length + 1)
            )
            group = list(range(search_start, search_start + test_indices_length))
            dataloader = DataLoader(
                batch_size=1,
                sequence_length=1024,
                rows=group,
            )
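            # Evaluation only: no optimizer step is taken, so run the forward
            # passes under no_grad and just accumulate the per-batch loss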
            total_loss = 0
            index = 0
            model.eval()
            with torch.no_grad():
                for index, batch in enumerate(dataloader):
                    inputs = batch[0].to(device)
                    labels = batch[1].to(device)
                    if len(inputs[0]) != len(labels[0]):
                        breakpoint()
                    # The custom "optimized" models return the loss at index 1
                    # of the output tuple; standard HF models expose .loss
                    if "optimized" in model_name:
                        outputs = model(input_ids=inputs, labels=labels)
                        loss = outputs[1]
                    else:
                        outputs = model(input_ids=inputs, labels=inputs)
                        loss = outputs.loss
                    # Accumulate total loss
                    total_loss += loss.detach().item()
            average_loss = total_loss / (index + 1)
            results[model_name]["main-net"][str(epoch)] = [average_loss]
            print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")
with open("results.json", "w") as outfile:
json.dump(results, outfile, indent = 4)
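            # Free disk space: drop cached revisions older than the one just
            # scored, keeping at least `keep_recent` of the newest ones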
            current_revision = model.config._commit_hash
            keep_recent = 1
            try:
                cache_info = scan_cache_dir()
                for repo in cache_info.repos:
                    if repo.repo_id == model_name:
                        revisions = sorted(
                            repo.revisions, key=lambda r: r.last_modified, reverse=True
                        )
                        current_index = next(
                            (
                                i
                                for i, r in enumerate(revisions)
                                if r.commit_hash == current_revision
                            ),
                            None,
                        )
                        if current_index is not None:
                            for revision in revisions[max(current_index + 1, keep_recent):]:
                                cache_info.delete_revisions(revision.commit_hash).execute()
                        break
            except Exception:
                print(
                    "Failed to delete previous model version from cache. "
                    "This might lead to 100% disk space utilisation in the future."
                )
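# A minimal sketch for inspecting the recorded losses afterwards (assumes the
# results.json schema noted above; the model key is illustrative):
#
#   import json
#   with open("results.json") as f:
#       results = json.load(f)
#   per_epoch = results["distributed/optimized-gpt2-1b"]["main-net"]
#   for epoch in sorted(per_epoch, key=int):
#       print(epoch, per_epoch[epoch][0])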