Spaces:

distributed
/

model_convergence

Sleeping

File size: 4,328 Bytes

0f87068
 
 
2467ab2
a38028c
d5bfcd1
2467ab2
 
0f87068
a38028c
2467ab2
 
3358017
a38028c
2467ab2
a38028c
2467ab2
d5bfcd1
 
 
 
 
2467ab2
a38028c
 
2467ab2
a38028c
 
2467ab2
a38028c
2467ab2
a38028c
 
2467ab2
a38028c
 
 
2467ab2
a38028c
2467ab2
a38028c
 
2467ab2
a38028c
 
 
 
 
 
 
 
 
2467ab2
a38028c
 
 
 
 
 
 
 
 
 
 
2467ab2
a38028c

import json
import random

import torch
import time
import os
from distributed_training.data.dataset import DataLoader
from huggingface_hub import list_repo_refs
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import create_tag, list_repo_refs, scan_cache_dir

device = "cuda"
test_indices_length = 1000
AUTOMATE = True

models = ["distributed/optimized-gpt2-1b", "distributed/optimized-gpt2-500m", "distributed/optimized-gpt2-250m", "distributed/optimized-gpt2-250m-v0.1.3", "distributed/optimized-gpt2-250m-v0.1.1", "distributed/gpt2-94m"]

if os.path.exists("results.json"):
    with open('results.json', 'r') as file:
        results = json.load(file)
else:
    results = {}

while True:
    for model_name in [models[0]]:

        if (model_name not in results.keys()):
            results[model_name] = {}

        tokenizer = AutoTokenizer.from_pretrained("distributed/optimized-gpt2-250m", trust_remote_code=True)

        refs = list_repo_refs(model_name, repo_type="model")
        global_epoch = max([int(tag.name) for tag in refs.tags]) if refs.tags else None

        if global_epoch in results[model_name]['main-net'].keys():
            print(f"Results for epoch {global_epoch} already calcualted")
            time.sleep(30*60)

        for epoch in range(0,global_epoch, 1):

            if str(epoch) in results[model_name]['main-net'].keys():
                continue

            model = AutoModelForCausalLM.from_pretrained(model_name, revision=str(epoch), trust_remote_code=True)
            model = model.to(device)

            search_start = random.choice(
                range(
                    DataLoader.max_pages
                    - test_indices_length
                    + 1
                )
            )
            group = [
                i
                for i in range(
                    search_start, search_start + test_indices_length
                )
            ]

            dataloader = DataLoader(
                batch_size=1,
                sequence_length=1024,
                rows=group,
            )

            total_loss = 0
            index = 0
            # Train data for one epoch
            for index, batch in enumerate(dataloader):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)

                if (len(inputs[0]) != len(labels[0])):
                    breakpoint()
                if "optimized" in model_name:
                    outputs = model(input_ids=inputs, labels=labels)
                    loss = outputs[1]
                else:
                    outputs = model(input_ids=inputs, labels=inputs)
                    loss = outputs.loss

                # Accumulate Total Loss
                total_loss += loss.detach().item()

                # Backward Pass
                model.zero_grad()

            average_loss = total_loss / (index+1)
            results[model_name]['main-net'][str(epoch)] = [average_loss]
            print(f"Epoch: {epoch}  Average Loss: {average_loss:.2f}")

            with open("results.json", "w") as outfile:
                json.dump(results, outfile, indent = 4)

        current_revision = model.config._commit_hash
        keep_recent=1
        try:
            cache_info = scan_cache_dir()
            for repo in cache_info.repos:
                if repo.repo_id == model_name:
                    revisions = sorted(
                        repo.revisions, key=lambda r: r.last_modified, reverse=True
                    )
                    current_index = next(
                        (
                            i
                            for i, r in enumerate(revisions)
                            if r.commit_hash == current_revision
                        ),
                        None,
                    )
                    if current_index is not None:
                        for revision in revisions[
                            max(current_index + 1, keep_recent) :
                        ]:
                            cache_info.delete_revisions(revision.commit_hash).execute()
                    break
        except:
            print(
                "Failed to delete previous model version from cache. This might lead to 100% disk space utlisation in the future."
            )