End of training
Browse files
- .hydra/hydra.yaml +2 -2
- README.md +17 -17
- config.json +0 -2
- configuration_measurement_pred.py +0 -2
- logs/events.out.tfevents.1734712122.sac.ist.berkeley.edu.769282.0 +3 -0
- model-00001-of-00002.safetensors +1 -1
- model-00002-of-00002.safetensors +1 -1
- modeling_gpt_neox_measurement_pred.py +5 -2
- modeling_measurement_pred.py +19 -17
- sensor_loc_stories.py +2 -0
- sensor_locs_from_token.py +2 -0
- train.log +1 -1
- training_args.bin +1 -1
.hydra/hydra.yaml
CHANGED
@@ -141,7 +141,7 @@ hydra:
|
|
141 |
name: train
|
142 |
chdir: null
|
143 |
override_dirname: ''
|
144 |
-
id: '
|
145 |
num: 0
|
146 |
config_name: pythia_stories_slurm
|
147 |
env_set: {}
|
@@ -165,7 +165,7 @@ hydra:
|
|
165 |
- path: ''
|
166 |
schema: structured
|
167 |
provider: schema
|
168 |
-
output_dir: /nas/ucb/oliveradk/measurement-pred/multirun/2024-12-
|
169 |
choices:
|
170 |
hparams: hparams
|
171 |
model: pythia_stories
|
|
|
141 |
name: train
|
142 |
chdir: null
|
143 |
override_dirname: ''
|
144 |
+
id: '749101'
|
145 |
num: 0
|
146 |
config_name: pythia_stories_slurm
|
147 |
env_set: {}
|
|
|
165 |
- path: ''
|
166 |
schema: structured
|
167 |
provider: schema
|
168 |
+
output_dir: /nas/ucb/oliveradk/measurement-pred/multirun/2024-12-20/08-28-21/0
|
169 |
choices:
|
170 |
hparams: hparams
|
171 |
model: pythia_stories
|
README.md
CHANGED
@@ -6,27 +6,27 @@ tags:
|
|
6 |
metrics:
|
7 |
- accuracy
|
8 |
model-index:
|
9 |
-
- name: pythia-
|
10 |
results: []
|
11 |
---
|
12 |
|
13 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
14 |
should probably proofread and complete it, then remove this comment. -->
|
15 |
|
16 |
-
# pythia-
|
17 |
|
18 |
This model is a fine-tuned version of [EleutherAI/pythia-1.4b-deduped](https://huggingface.co/EleutherAI/pythia-1.4b-deduped) on an unknown dataset.
|
19 |
It achieves the following results on the evaluation set:
|
20 |
-
- Loss: 0.
|
21 |
-
- Accuracy: 0.
|
22 |
-
- Accuracy Sensor 0: 0.
|
23 |
-
- Auroc Sensor 0: 0.
|
24 |
-
- Accuracy Sensor 1: 0.
|
25 |
-
- Auroc Sensor 1: 0.
|
26 |
-
- Accuracy Sensor 2: 0.
|
27 |
-
- Auroc Sensor 2: 0.
|
28 |
-
- Accuracy Aggregated: 0.
|
29 |
-
- Auroc Aggregated: 0.
|
30 |
|
31 |
## Model description
|
32 |
|
@@ -61,11 +61,11 @@ The following hyperparameters were used during training:
|
|
61 |
|
62 |
| Training Loss | Epoch | Step | Validation Loss | Accuracy | Accuracy Sensor 0 | Auroc Sensor 0 | Accuracy Sensor 1 | Auroc Sensor 1 | Accuracy Sensor 2 | Auroc Sensor 2 | Accuracy Aggregated | Auroc Aggregated |
|
63 |
|:-------------:|:------:|:----:|:---------------:|:--------:|:-----------------:|:--------------:|:-----------------:|:--------------:|:-----------------:|:--------------:|:-------------------:|:----------------:|
|
64 |
-
| No log | 0.9948 | 119 | 0.
|
65 |
-
| 0.
|
66 |
-
| 0.
|
67 |
-
| 0.
|
68 |
-
| 0.
|
69 |
|
70 |
|
71 |
### Framework versions
|
|
|
6 |
metrics:
|
7 |
- accuracy
|
8 |
model-index:
|
9 |
+
- name: pythia-1_4b-deduped-measurement_pred-generated_stories
|
10 |
results: []
|
11 |
---
|
12 |
|
13 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
14 |
should probably proofread and complete it, then remove this comment. -->
|
15 |
|
16 |
+
# pythia-1_4b-deduped-measurement_pred-generated_stories
|
17 |
|
18 |
This model is a fine-tuned version of [EleutherAI/pythia-1.4b-deduped](https://huggingface.co/EleutherAI/pythia-1.4b-deduped) on an unknown dataset.
|
19 |
It achieves the following results on the evaluation set:
|
20 |
+
- Loss: 0.6829
|
21 |
+
- Accuracy: 0.8422
|
22 |
+
- Accuracy Sensor 0: 0.8474
|
23 |
+
- Auroc Sensor 0: 0.9408
|
24 |
+
- Accuracy Sensor 1: 0.8474
|
25 |
+
- Auroc Sensor 1: 0.9248
|
26 |
+
- Accuracy Sensor 2: 0.8370
|
27 |
+
- Auroc Sensor 2: 0.9156
|
28 |
+
- Accuracy Aggregated: 0.8370
|
29 |
+
- Auroc Aggregated: 0.9304
|
30 |
|
31 |
## Model description
|
32 |
|
|
|
61 |
|
62 |
| Training Loss | Epoch | Step | Validation Loss | Accuracy | Accuracy Sensor 0 | Auroc Sensor 0 | Accuracy Sensor 1 | Auroc Sensor 1 | Accuracy Sensor 2 | Auroc Sensor 2 | Accuracy Aggregated | Auroc Aggregated |
|
63 |
|:-------------:|:------:|:----:|:---------------:|:--------:|:-----------------:|:--------------:|:-----------------:|:--------------:|:-----------------:|:--------------:|:-------------------:|:----------------:|
|
64 |
+
| No log | 0.9948 | 119 | 0.5074 | 0.7130 | 0.7793 | 0.9063 | 0.6785 | 0.8889 | 0.7111 | 0.8917 | 0.6830 | 0.9080 |
|
65 |
+
| 0.606 | 1.9979 | 239 | 0.3942 | 0.8100 | 0.8178 | 0.9165 | 0.8252 | 0.8992 | 0.7956 | 0.9077 | 0.8015 | 0.9178 |
|
66 |
+
| 0.368 | 2.9927 | 358 | 0.4412 | 0.8304 | 0.8400 | 0.9369 | 0.8237 | 0.9237 | 0.8252 | 0.9129 | 0.8326 | 0.9235 |
|
67 |
+
| 0.1675 | 3.9958 | 478 | 0.5589 | 0.8474 | 0.8533 | 0.9411 | 0.8474 | 0.9284 | 0.8400 | 0.9112 | 0.8489 | 0.9296 |
|
68 |
+
| 0.0537 | 4.9739 | 595 | 0.6829 | 0.8422 | 0.8474 | 0.9408 | 0.8474 | 0.9248 | 0.8370 | 0.9156 | 0.8370 | 0.9304 |
|
69 |
|
70 |
|
71 |
### Framework versions
|
config.json
CHANGED
@@ -25,7 +25,6 @@
|
|
25 |
"n_sensors": 3,
|
26 |
"num_attention_heads": 16,
|
27 |
"num_hidden_layers": 24,
|
28 |
-
"pad_token_id": 50277,
|
29 |
"rope_scaling": null,
|
30 |
"rotary_emb_base": 10000,
|
31 |
"rotary_pct": 0.25,
|
@@ -36,7 +35,6 @@
|
|
36 |
"tie_word_embeddings": false,
|
37 |
"torch_dtype": "float32",
|
38 |
"transformers_version": "4.41.0",
|
39 |
-
"use_aggregated": true,
|
40 |
"use_cache": false,
|
41 |
"use_parallel_residual": true,
|
42 |
"vocab_size": 50304
|
|
|
25 |
"n_sensors": 3,
|
26 |
"num_attention_heads": 16,
|
27 |
"num_hidden_layers": 24,
|
|
|
28 |
"rope_scaling": null,
|
29 |
"rotary_emb_base": 10000,
|
30 |
"rotary_pct": 0.25,
|
|
|
35 |
"tie_word_embeddings": false,
|
36 |
"torch_dtype": "float32",
|
37 |
"transformers_version": "4.41.0",
|
|
|
38 |
"use_cache": false,
|
39 |
"use_parallel_residual": true,
|
40 |
"vocab_size": 50304
|
configuration_measurement_pred.py
CHANGED
@@ -7,7 +7,6 @@ class MeasurementPredictorConfig(PretrainedConfig):
|
|
7 |
sensor_token=" omit",
|
8 |
sensor_loc_type="locs_from_token",
|
9 |
n_sensors=3,
|
10 |
-
use_aggregated=True,
|
11 |
sensors_weight = 0.7,
|
12 |
aggregate_weight=0.3,
|
13 |
**kwargs
|
@@ -15,7 +14,6 @@ class MeasurementPredictorConfig(PretrainedConfig):
|
|
15 |
self.sensor_token = sensor_token
|
16 |
self.sensor_loc_type = sensor_loc_type
|
17 |
self.n_sensors = n_sensors
|
18 |
-
self.use_aggregated = use_aggregated
|
19 |
self.sensors_weight = sensors_weight
|
20 |
self.aggregate_weight = aggregate_weight
|
21 |
super().__init__(**kwargs)
|
|
|
7 |
sensor_token=" omit",
|
8 |
sensor_loc_type="locs_from_token",
|
9 |
n_sensors=3,
|
|
|
10 |
sensors_weight = 0.7,
|
11 |
aggregate_weight=0.3,
|
12 |
**kwargs
|
|
|
14 |
self.sensor_token = sensor_token
|
15 |
self.sensor_loc_type = sensor_loc_type
|
16 |
self.n_sensors = n_sensors
|
|
|
17 |
self.sensors_weight = sensors_weight
|
18 |
self.aggregate_weight = aggregate_weight
|
19 |
super().__init__(**kwargs)
|
logs/events.out.tfevents.1734712122.sac.ist.berkeley.edu.769282.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50a749ca4e5ea066cf649566551c642485bd190d4160a0f9016acc8e00efae45
|
3 |
+
size 10287
|
model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4978000256
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3383bea28d5e40a4af2443f4c670650bbabcbacecbbc5aa59bc00c1e26c43da7
|
3 |
size 4978000256
|
model-00002-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268568360
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0d838e481da12b787afbd911a2e7670a415f79ec84cac70c2f00bd8941695a84
|
3 |
size 268568360
|
modeling_gpt_neox_measurement_pred.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel
|
2 |
-
|
3 |
from .modeling_measurement_pred import MeasurementPredictorMixin
|
4 |
from .configuration_gpt_neox_measurement_pred import GPTNeoXMeasurementPredictorConfig
|
5 |
|
@@ -9,4 +9,7 @@ class GPTNeoXMeasurementPredictor(GPTNeoXPreTrainedModel, MeasurementPredictorMi
|
|
9 |
def __init__(self, config):
|
10 |
super().__init__(config)
|
11 |
self.gpt_neox = GPTNeoXModel(config)
|
12 |
-
self.post_init()
|
|
|
|
|
|
|
|
1 |
from transformers.models.gpt_neox import GPTNeoXPreTrainedModel, GPTNeoXModel
|
2 |
+
from transformers import PreTrainedTokenizerBase
|
3 |
from .modeling_measurement_pred import MeasurementPredictorMixin
|
4 |
from .configuration_gpt_neox_measurement_pred import GPTNeoXMeasurementPredictorConfig
|
5 |
|
|
|
9 |
def __init__(self, config):
|
10 |
super().__init__(config)
|
11 |
self.gpt_neox = GPTNeoXModel(config)
|
12 |
+
self.post_init()
|
13 |
+
|
14 |
+
def set_pad_token(self, tokenizer: PreTrainedTokenizerBase):
|
15 |
+
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
|
modeling_measurement_pred.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from typing import Optional, Tuple, Union
|
|
|
2 |
|
3 |
import torch
|
4 |
from torch.nn import BCEWithLogitsLoss
|
@@ -20,16 +21,18 @@ class MeasurementPredictorMixin(PreTrainedModel):
|
|
20 |
self.sensor_probes = torch.nn.ModuleList([
|
21 |
torch.nn.Linear(config.emb_dim, 1) for _ in range(config.n_sensors)
|
22 |
])
|
23 |
-
self.
|
24 |
-
if config.use_aggregated:
|
25 |
-
self.aggregate_probe = torch.nn.Linear(config.emb_dim, 1)
|
26 |
self.sensors_weight = config.sensors_weight
|
27 |
self.aggregate_weight = config.aggregate_weight
|
28 |
|
29 |
-
self.
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def init_sensor_loc_finder(self, tokenizer: PreTrainedTokenizerBase):
|
32 |
-
self.
|
33 |
tokenizer, sensor_token=self.sensor_token, n_sensors=self.n_sensors
|
34 |
)
|
35 |
|
@@ -67,28 +70,27 @@ class MeasurementPredictorMixin(PreTrainedModel):
|
|
67 |
output_hidden_states=output_hidden_states,
|
68 |
return_dict=return_dict,
|
69 |
)
|
70 |
-
|
|
|
71 |
sensor_embs = base_model_output.last_hidden_state.gather(
|
72 |
1, sensor_locs.unsqueeze(-1).expand(-1, -1, self.config.emb_dim)
|
73 |
)
|
74 |
-
assert sensor_embs.shape == (input_ids.shape[0], self.n_sensors, self.config.emb_dim),
|
|
|
|
|
75 |
sensor_logits = torch.concat([self.sensor_probes[i](sensor_embs[:, i, :])
|
76 |
for i in range(self.n_sensors)], dim=-1)
|
77 |
-
|
|
|
78 |
|
79 |
-
|
80 |
-
last_emb = base_model_output.last_hidden_state[:, -1, :]
|
81 |
-
aggregate_logits = self.aggregate_probe(last_emb)
|
82 |
-
logits = torch.concat([logits, aggregate_logits], dim=-1)
|
83 |
-
|
84 |
loss = None
|
85 |
if labels is not None:
|
86 |
loss_fct = BCEWithLogitsLoss()
|
87 |
-
sensor_loss = loss_fct(sensor_logits, labels[:, :self.n_sensors]) * self.sensors_weight
|
88 |
loss = sensor_loss
|
89 |
-
|
90 |
-
|
91 |
-
loss += aggregate_loss
|
92 |
|
93 |
if not return_dict:
|
94 |
output = (logits, ) + base_model_output[1:]
|
|
|
1 |
from typing import Optional, Tuple, Union
|
2 |
+
from abc import abstractmethod
|
3 |
|
4 |
import torch
|
5 |
from torch.nn import BCEWithLogitsLoss
|
|
|
21 |
self.sensor_probes = torch.nn.ModuleList([
|
22 |
torch.nn.Linear(config.emb_dim, 1) for _ in range(config.n_sensors)
|
23 |
])
|
24 |
+
self.aggregate_probe = torch.nn.Linear(config.emb_dim, 1)
|
|
|
|
|
25 |
self.sensors_weight = config.sensors_weight
|
26 |
self.aggregate_weight = config.aggregate_weight
|
27 |
|
28 |
+
self.find_sensor_locs: SensorLocFinder = None
|
29 |
+
|
30 |
+
@abstractmethod
|
31 |
+
def set_pad_token(self, tokenizer: PreTrainedTokenizerBase):
|
32 |
+
pass
|
33 |
|
34 |
def init_sensor_loc_finder(self, tokenizer: PreTrainedTokenizerBase):
|
35 |
+
self.find_sensor_locs = SENSOR_LOC_REGISTRY[self.sensor_loc_type](
|
36 |
tokenizer, sensor_token=self.sensor_token, n_sensors=self.n_sensors
|
37 |
)
|
38 |
|
|
|
70 |
output_hidden_states=output_hidden_states,
|
71 |
return_dict=return_dict,
|
72 |
)
|
73 |
+
# get sensor embeddings (including aggregate)
|
74 |
+
sensor_locs = self.find_sensor_locs(input_ids)
|
75 |
sensor_embs = base_model_output.last_hidden_state.gather(
|
76 |
1, sensor_locs.unsqueeze(-1).expand(-1, -1, self.config.emb_dim)
|
77 |
)
|
78 |
+
assert sensor_embs.shape == (input_ids.shape[0], self.n_sensors + 1, self.config.emb_dim), sensor_embs.shape
|
79 |
+
|
80 |
+
# get sensor and aggregate logits
|
81 |
sensor_logits = torch.concat([self.sensor_probes[i](sensor_embs[:, i, :])
|
82 |
for i in range(self.n_sensors)], dim=-1)
|
83 |
+
aggregate_logits = self.aggregate_probe(sensor_embs[:, -1, :])
|
84 |
+
logits = torch.concat([sensor_logits, aggregate_logits], dim=-1)
|
85 |
|
86 |
+
# compute loss
|
|
|
|
|
|
|
|
|
87 |
loss = None
|
88 |
if labels is not None:
|
89 |
loss_fct = BCEWithLogitsLoss()
|
90 |
+
sensor_loss = loss_fct(sensor_logits[:, :self.n_sensors], labels[:, :self.n_sensors]) * self.sensors_weight
|
91 |
loss = sensor_loss
|
92 |
+
aggregate_loss = loss_fct(aggregate_logits, labels[:, -1:]) * self.aggregate_weight
|
93 |
+
loss += aggregate_loss
|
|
|
94 |
|
95 |
if not return_dict:
|
96 |
output = (logits, ) + base_model_output[1:]
|
sensor_loc_stories.py
CHANGED
@@ -26,6 +26,8 @@ class StoriesSensorLocFinder(SensorLocFinder):
|
|
26 |
torch.argmax(eqs.to(torch.uint8), dim=-2),
|
27 |
input_ids.shape[-1] - 3,
|
28 |
).clamp(max=input_ids.shape[-1] - 3)
|
|
|
|
|
29 |
return locs
|
30 |
|
31 |
|
|
|
26 |
torch.argmax(eqs.to(torch.uint8), dim=-2),
|
27 |
input_ids.shape[-1] - 3,
|
28 |
).clamp(max=input_ids.shape[-1] - 3)
|
29 |
+
aggregate_sensor_loc = locs[:, -1].unsqueeze(1)
|
30 |
+
locs = torch.cat([locs, aggregate_sensor_loc], dim=1)
|
31 |
return locs
|
32 |
|
33 |
|
sensor_locs_from_token.py
CHANGED
@@ -13,4 +13,6 @@ class SensorLocFinderFromToken(SensorLocFinder):
|
|
13 |
def find_sensor_locs(self, input_ids: torch.Tensor) -> torch.Tensor:
|
14 |
flat_sensor_token_idxs = (input_ids == self.sensor_token_id).nonzero(as_tuple=True)[1]
|
15 |
sensor_token_idxs = flat_sensor_token_idxs.view(-1, self.n_sensors)
|
|
|
|
|
16 |
return sensor_token_idxs
|
|
|
13 |
def find_sensor_locs(self, input_ids: torch.Tensor) -> torch.Tensor:
|
14 |
flat_sensor_token_idxs = (input_ids == self.sensor_token_id).nonzero(as_tuple=True)[1]
|
15 |
sensor_token_idxs = flat_sensor_token_idxs.view(-1, self.n_sensors)
|
16 |
+
aggregate_sensor_token_idx = sensor_token_idxs[:, -1].unsqueeze(1)
|
17 |
+
sensor_token_idxs = torch.cat([sensor_token_idxs, aggregate_sensor_token_idx], dim=1)
|
18 |
return sensor_token_idxs
|
train.log
CHANGED
@@ -1 +1 @@
|
|
1 |
-
[2024-12-
|
|
|
1 |
+
[2024-12-20 16:28:39,973][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 5112
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24298721d6c9062b1c51d5bfdf6167e9932ab9eeebebbe68472c2e6d0db2e09d
|
3 |
size 5112
|