kmfoda committed on
Commit a38028c · 1 Parent(s): cd1750b

Automate evaluation

Files changed (2)
  1. evaluate.py +94 -59
  2. results.json +6 -0
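
The change below turns the one-shot evaluation script into a long-running loop: it polls the model repo's tags on the Hub (one integer tag per training epoch), evaluates any epochs missing from results.json, and sleeps for 30 minutes when it is already up to date. A minimal sketch of just that polling step, assuming epochs are published as integer tags (the helper name is illustrative; the repo id mirrors the first entry in the models list in the diff):

    import time

    from huggingface_hub import list_repo_refs

    def latest_epoch(repo_id):
        """Return the highest integer tag on the repo, or None if it has no tags."""
        refs = list_repo_refs(repo_id, repo_type="model")
        return max(int(tag.name) for tag in refs.tags) if refs.tags else None

    while True:
        epoch = latest_epoch("distributed/optimized-gpt2-1b")
        print(f"Latest published epoch: {epoch}")
        time.sleep(30 * 60)  # poll again in 30 minutes, as the evaluation loop does
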
evaluate.py CHANGED
@@ -2,15 +2,18 @@ import json
 import random
 
 import torch
+import time
 import os
 from distributed_training.data.dataset import DataLoader
 from huggingface_hub import list_repo_refs
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from huggingface_hub import create_tag, list_repo_refs, scan_cache_dir
 
 device = "cuda"
 test_indices_length = 1000
+AUTOMATE = True
 
-models = ["distributed/optimized-gpt2-500m", "distributed/optimized-gpt2-250m", "distributed/optimized-gpt2-250m-v0.1.3", "distributed/optimized-gpt2-250m-v0.1.1", "distributed/gpt2-94m"]
+models = ["distributed/optimized-gpt2-1b", "distributed/optimized-gpt2-500m", "distributed/optimized-gpt2-250m", "distributed/optimized-gpt2-250m-v0.1.3", "distributed/optimized-gpt2-250m-v0.1.1", "distributed/gpt2-94m"]
 
 if os.path.exists("results.json"):
     with open('results.json', 'r') as file:
@@ -18,70 +21,102 @@ if os.path.exists("results.json"):
 else:
     results = {}
 
-for model_name in models:
-
-    if (model_name not in results.keys()):
-        results[model_name] = {}
-
-    tokenizer = AutoTokenizer.from_pretrained("distributed/optimized-gpt2-250m", trust_remote_code=True)
-
-    refs = list_repo_refs(model_name, repo_type="model")
-    global_epoch = max([int(tag.name) for tag in refs.tags]) if refs.tags else None
-
-    for epoch in range(0,global_epoch, 1):
-
-        if str(epoch) in results[model_name]['main-net'].keys():
-            continue
-
-        model = AutoModelForCausalLM.from_pretrained(model_name, revision=str(epoch), trust_remote_code=True)
-        model = model.to(device)
-
-        search_start = random.choice(
-            range(
-                DataLoader.max_pages
-                - test_indices_length
-                + 1
-            )
-        )
-        group = [
-            i
-            for i in range(
-                search_start, search_start + test_indices_length
-            )
-        ]
-
-        dataloader = DataLoader(
-            batch_size=1,
-            sequence_length=1024,
-            rows=group,
-        )
-
-        total_loss = 0
-        index = 0
-        # Train data for one epoch
-        for index, batch in enumerate(dataloader):
-            inputs = batch[0].to(device)
-            labels = batch[1].to(device)
-
-            if (len(inputs[0]) != len(labels[0])):
-                breakpoint()
-
-            if "optimized" in model_name:
-                outputs = model(input_ids=inputs, labels=labels)
-                loss = outputs[1]
-            else:
-                outputs = model(input_ids=inputs, labels=inputs)
-                loss = outputs.loss
-
-            # Accumulate Total Loss
-            total_loss += loss.detach().item()
-
-            # Backward Pass
-            model.zero_grad()
-
-        average_loss = total_loss / (index+1)
-        results[model_name]['main-net'][str(epoch)] = [average_loss]
-        print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")
-
-        with open("results.json", "w") as outfile:
-            json.dump(results, outfile, indent = 4)
+while True:
+    for model_name in [models[0]]:
+
+        if (model_name not in results.keys()):
+            results[model_name] = {}
+
+        tokenizer = AutoTokenizer.from_pretrained("distributed/optimized-gpt2-250m", trust_remote_code=True)
+
+        refs = list_repo_refs(model_name, repo_type="model")
+        global_epoch = max([int(tag.name) for tag in refs.tags]) if refs.tags else None
+
+        if global_epoch in results[model_name]['main-net'].keys():
+            print(f"Results for epoch {global_epoch} already calculated")
+            time.sleep(30*60)
+
+        for epoch in range(0,global_epoch, 1):
+
+            if str(epoch) in results[model_name]['main-net'].keys():
+                continue
+
+            model = AutoModelForCausalLM.from_pretrained(model_name, revision=str(epoch), trust_remote_code=True)
+            model = model.to(device)
+
+            search_start = random.choice(
+                range(
+                    DataLoader.max_pages
+                    - test_indices_length
+                    + 1
+                )
+            )
+            group = [
+                i
+                for i in range(
+                    search_start, search_start + test_indices_length
+                )
+            ]
+
+            dataloader = DataLoader(
+                batch_size=1,
+                sequence_length=1024,
+                rows=group,
+            )
+
+            total_loss = 0
+            index = 0
+            # Train data for one epoch
+            for index, batch in enumerate(dataloader):
+                inputs = batch[0].to(device)
+                labels = batch[1].to(device)
+
+                if (len(inputs[0]) != len(labels[0])):
+                    breakpoint()
+                if "optimized" in model_name:
+                    outputs = model(input_ids=inputs, labels=labels)
+                    loss = outputs[1]
+                else:
+                    outputs = model(input_ids=inputs, labels=inputs)
+                    loss = outputs.loss
+
+                # Accumulate Total Loss
+                total_loss += loss.detach().item()
+
+                # Backward Pass
+                model.zero_grad()
+
+            average_loss = total_loss / (index+1)
+            results[model_name]['main-net'][str(epoch)] = [average_loss]
+            print(f"Epoch: {epoch} Average Loss: {average_loss:.2f}")
+
+            with open("results.json", "w") as outfile:
+                json.dump(results, outfile, indent = 4)
+
+            current_revision = model.config._commit_hash
+            keep_recent = 1
+            try:
+                cache_info = scan_cache_dir()
+                for repo in cache_info.repos:
+                    if repo.repo_id == model_name:
+                        revisions = sorted(
+                            repo.revisions, key=lambda r: r.last_modified, reverse=True
+                        )
+                        current_index = next(
+                            (
+                                i
+                                for i, r in enumerate(revisions)
+                                if r.commit_hash == current_revision
+                            ),
+                            None,
+                        )
+                        if current_index is not None:
+                            for revision in revisions[
+                                max(current_index + 1, keep_recent) :
+                            ]:
+                                cache_info.delete_revisions(revision.commit_hash).execute()
+                        break
+            except:
+                print(
+                    "Failed to delete previous model version from cache. This might lead to 100% disk space utilisation in the future."
+                )
results.json CHANGED
@@ -1635,6 +1635,12 @@
             ],
             "544": [
                 5.487078181581001
+            ],
+            "545": [
+                5.599780409645654
+            ],
+            "546": [
+                5.532580448878751
             ]
         },
         "baseline": {