Training in progress, step 20
- .gitattributes +2 -0
- .hydra/config.yaml +80 -0
- .hydra/hydra.yaml +157 -0
- .hydra/overrides.yaml +1 -0
- adapter_config.json +37 -0
- adapter_model.safetensors +3 -0
- special_tokens_map.json +24 -0
- tokenizer.json +3 -0
- tokenizer_config.json +0 -0
- train.log +1 -0
- train.py +259 -0
- training_args.bin +3 -0
- utils.py +0 -0
- wandb/debug-internal.log +7 -0
- wandb/debug.log +25 -0
- wandb/run-20250202_235451-rfjfhgaw/files/output.log +46 -0
- wandb/run-20250202_235451-rfjfhgaw/files/requirements.txt +213 -0
- wandb/run-20250202_235451-rfjfhgaw/files/wandb-metadata.json +75 -0
- wandb/run-20250202_235451-rfjfhgaw/logs/debug-core.log +6 -0
- wandb/run-20250202_235451-rfjfhgaw/logs/debug-internal.log +7 -0
- wandb/run-20250202_235451-rfjfhgaw/logs/debug.log +25 -0
- wandb/run-20250202_235451-rfjfhgaw/run-rfjfhgaw.wandb +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250202_235451-rfjfhgaw/run-rfjfhgaw.wandb filter=lfs diff=lfs merge=lfs -text
.hydra/config.yaml
ADDED
@@ -0,0 +1,80 @@
time_start: null
DEBUG: false
debug_model: unsloth/Qwen2.5-7B-bnb-4bit
fold: 0
random_seed: true
train_on_all_folds: false
eval_only: false
merge_adapters: false
wandb_id: null
val_split_name: val
pad_token: <pad>
response_template_ids:
- 4
num_proc: 20
hub_repo_tags:
- odesia
script_args:
  dataset_name: nbroad/odesia-combined-v3
  config: null
  gradient_checkpointing_use_reentrant: true
  ignore_bias_buffers: false
model_config:
  model_name_or_path: mistralai/Ministral-8B-Instruct-2410
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
  use_peft: true
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - up_proj
  - down_proj
  - gate_proj
  lora_modules_to_save: null
  lora_task_type: CAUSAL_LM
  use_rslora: true
  load_in_8bit: false
  load_in_4bit: false
  bnb_4bit_quant_type: nf4
  use_bnb_nested_quant: true
training_args:
  resume_from_checkpoint: null
  output_dir: ./
  num_train_epochs: 1
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  warmup_ratio: 0.1
  fp16: false
  bf16: true
  eval_strategy: steps
  save_strategy: steps
  eval_steps: 20
  save_steps: 20
  save_total_limit: 2
  logging_steps: 2
  run_name: null
  weight_decay: 0.01
  report_to: wandb
  learning_rate: 4.0e-05
  metric_for_best_model: loss
  greater_is_better: false
  gradient_checkpointing: true
  gradient_accumulation_steps: 8
  gradient_checkpointing_kwargs:
    use_reentrant: true
  optim: adamw_torch
  dataloader_num_workers: 1
  seed: 18
  max_grad_norm: 2.0
  load_best_model_at_end: true
  push_to_hub: true
  hub_private_repo: true
  lr_scheduler_type: cosine
  remove_unused_columns: false
  ddp_find_unused_parameters: false
  use_liger_kernel: true
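
Note: the YAML above is the fully composed config that Hydra snapshots into .hydra/config.yaml at launch. A minimal sketch of inspecting it and merging an override with OmegaConf (the learning-rate override below is illustrative only, not one used in this run):

from omegaconf import OmegaConf

# Load the composed config exactly as Hydra saved it for this run.
cfg = OmegaConf.load(".hydra/config.yaml")
print(cfg.model_config.model_name_or_path)  # mistralai/Ministral-8B-Instruct-2410

# Dotted key=value overrides use the same syntax Hydra accepts on the command line.
override = OmegaConf.from_dotlist(["training_args.learning_rate=2e-5"])
cfg = OmegaConf.merge(cfg, override)
print(cfg.training_args.learning_rate)  # 2e-05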
.hydra/hydra.yaml
ADDED
@@ -0,0 +1,157 @@
hydra:
  run:
    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
  sweep:
    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
    subdir: ${hydra.job.num}
  launcher:
    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
  sweeper:
    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
    max_batch_size: null
    params: null
  help:
    app_name: ${hydra.job.name}
    header: '${hydra.help.app_name} is powered by Hydra.

      '
    footer: 'Powered by Hydra (https://hydra.cc)

      Use --hydra-help to view Hydra specific help

      '
    template: '${hydra.help.header}

      == Configuration groups ==

      Compose your configuration from those groups (group=option)


      $APP_CONFIG_GROUPS


      == Config ==

      Override anything in the config (foo.bar=value)


      $CONFIG


      ${hydra.help.footer}

      '
  hydra_help:
    template: 'Hydra (${hydra.runtime.version})

      See https://hydra.cc for more info.


      == Flags ==

      $FLAGS_HELP


      == Configuration groups ==

      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
      to command line)


      $HYDRA_CONFIG_GROUPS


      Use ''--cfg hydra'' to Show the Hydra config.

      '
    hydra_help: ???
  hydra_logging:
    version: 1
    formatters:
      simple:
        format: '[%(asctime)s][HYDRA] %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: simple
        stream: ext://sys.stdout
    root:
      level: INFO
      handlers:
      - console
    loggers:
      logging_example:
        level: DEBUG
    disable_existing_loggers: false
  job_logging:
    version: 1
    formatters:
      simple:
        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: simple
        stream: ext://sys.stdout
      file:
        class: logging.FileHandler
        formatter: simple
        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
    root:
      level: INFO
      handlers:
      - console
      - file
    disable_existing_loggers: false
  env: {}
  mode: RUN
  searchpath: []
  callbacks: {}
  output_subdir: .hydra
  overrides:
    hydra:
    - hydra.mode=RUN
    task: []
  job:
    name: train
    chdir: null
    override_dirname: ''
    id: ???
    num: ???
    config_name: m
    env_set:
      WANDB_RUN_GROUP: clm
      TOKENIZERS_PARALLELISM: 'False'
      HF_HUB_ENABLE_HF_TRANSFER: '1'
    env_copy: []
    config:
      override_dirname:
        kv_sep: '='
        item_sep: ','
        exclude_keys: []
  runtime:
    version: 1.3.2
    version_base: '1.1'
    cwd: /workspace/odesia-2025/train
    config_sources:
    - path: hydra.conf
      schema: pkg
      provider: hydra
    - path: /workspace/odesia-2025/train/conf
      schema: file
      provider: main
    - path: ''
      schema: structured
      provider: schema
    output_dir: /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50
    choices:
      hydra/env: default
      hydra/callbacks: null
      hydra/job_logging: default
      hydra/hydra_logging: default
      hydra/hydra_help: default
      hydra/help: default
      hydra/sweeper: basic
      hydra/launcher: basic
      hydra/output: default
  verbose: false
.hydra/overrides.yaml
ADDED
@@ -0,0 +1 @@
[]
adapter_config.json
ADDED
@@ -0,0 +1,37 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "mistralai/Ministral-8B-Instruct-2410",
  "bias": "none",
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 32,
  "lora_bias": false,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 16,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "v_proj",
    "k_proj",
    "q_proj",
    "down_proj",
    "gate_proj",
    "up_proj",
    "o_proj"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": true
}
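
Note: this adapter config is what PEFT reads when the checkpoint is pulled from the Hub. A minimal loading sketch, assuming the private repo id nbroad/nbroad-odesia-clm-rfjfhgaw recorded as hub_model_id in wandb/debug.log below and a valid access token in the environment:

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

repo_id = "nbroad/nbroad-odesia-clm-rfjfhgaw"  # hub_model_id from wandb/debug.log

# Loads the base model named in adapter_config.json, then attaches the LoRA weights.
model = AutoPeftModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Optionally fold the adapter into the base weights for adapter-free inference.
model = model.merge_and_unload()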
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:73782dc170997a593479a105366cd2bef57662f026e69145b9564a91c61e758a
size 174655536
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d7edbeaf20dd7f571b5dd1c54d9ace4f9b6299127cc7ba2afb14a6d51a4a79a4
size 17078136
tokenizer_config.json
ADDED
The diff for this file is too large to render.
train.log
ADDED
@@ -0,0 +1 @@
[2025-02-02 23:55:28,573][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
train.py
ADDED
@@ -0,0 +1,259 @@
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset, concatenate_datasets
from omegaconf import DictConfig, OmegaConf
import hydra
import wandb
import shutil
import os
from functools import partial
from pathlib import Path
from trl import (
    SFTTrainer,
    ModelConfig,
    get_quantization_config,
    get_kbit_device_map,
    get_peft_config,
    DataCollatorForCompletionOnlyLM,
)
from dotenv import load_dotenv
from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    AutoPeftModelForSequenceClassification,
)

# from utils import add_metric_to_card

loaded = load_dotenv("../.env", override=True)

if not loaded:
    raise ValueError("Failed to load .env file")


def tokenize(example, tokenizer):
    ids = tokenizer.apply_chat_template([
        {"role": "user", "content": example["text"]},
        {"role": "assistant", "content": example["response"]},
    ])

    return {
        "input_ids": ids,
    }


@hydra.main(config_path="conf", config_name="q7b-4bit")
def main(cfg: DictConfig):

    cfg.time_start = "_".join(str(Path.cwd()).rsplit("/", 2)[-2:])

    if cfg.DEBUG:
        cfg.model_config.model_name_or_path = cfg.debug_model

    script_args = cfg.script_args
    training_args = TrainingArguments(**OmegaConf.to_container(cfg.training_args))
    model_config = ModelConfig(**OmegaConf.to_container(cfg.model_config))

    if training_args.process_index == 0:

        if cfg.eval_only or training_args.resume_from_checkpoint is not None:
            wandb_id = cfg.wandb_id
            resume = "must"
            config = None
        else:
            wandb_id = None
            resume = None
            config = OmegaConf.to_container(cfg)

        wandb.init(config=config, id=wandb_id, resume=resume)
        # copy current file to output, so it gets saved to hub
        shutil.copy(
            Path(__file__).resolve(),
            Path(training_args.output_dir) / Path(__file__).name,
        )

        shutil.copy(
            Path(__file__).resolve().parent / "utils.py",
            Path(training_args.output_dir) / "utils.py",
        )

    quantization_config = get_quantization_config(model_config)
    model_kwargs = dict(
        revision=model_config.model_revision,
        trust_remote_code=model_config.trust_remote_code,
        attn_implementation=model_config.attn_implementation,
        torch_dtype=model_config.torch_dtype,
        use_cache=False if training_args.gradient_checkpointing else True,
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
        cache_dir=os.environ["HF_HUB_CACHE"],
    )

    peft_config = get_peft_config(model_config)

    if training_args.use_liger_kernel:
        from liger_kernel.transformers import (
            apply_liger_kernel_to_qwen2,
            apply_liger_kernel_to_llama,
            apply_liger_kernel_to_mistral,
        )

        apply_liger_kernel_to_qwen2()
        apply_liger_kernel_to_llama()
        apply_liger_kernel_to_mistral()
    if cfg.eval_only:

        model = AutoPeftModelForSequenceClassification.from_pretrained(
            model_config.model_name_or_path,
            **model_kwargs,
            token=os.environ["HF_WRITE_PERSONAL"],
        )

        if cfg.merge_adapters:
            model = model.merge_and_unload()

    else:

        model = AutoModelForCausalLM.from_pretrained(
            model_config.model_name_or_path,
            **model_kwargs,
            token=os.environ["HF_GATED"],
        )

    tokenizer = AutoTokenizer.from_pretrained(
        model_config.model_name_or_path,
        use_fast=True,
        token=os.environ["HF_GATED"],
    )

    tokenizer.padding_side = "left"
    tokenizer.pad_token = cfg.pad_token


    if not cfg.eval_only and model_config.load_in_4bit:
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=training_args.gradient_checkpointing,
            gradient_checkpointing_kwargs=training_args.gradient_checkpointing_kwargs,
        )

    elif not cfg.eval_only and training_args.gradient_checkpointing:
        model.enable_input_require_grads()

    if not cfg.eval_only:
        model = get_peft_model(model, peft_config)

    with training_args.main_process_first():
        ds = load_dataset(
            script_args.dataset_name,
            script_args.config,
            token=os.environ["HF_WRITE_PERSONAL"],
        )

        # hack to downsample english squad
        # ds["train"] = concatenate_datasets(
        #     [
        #         ds["train"].select(range(0, 45000)),
        #         ds["train"].select(range(98596, len(ds["train"]))),
        #     ])

        if cfg.DEBUG:
            ds[cfg.train_split_name] = (
                ds[cfg.train_split_name].shuffle().select(range(100))
            )
            ds[cfg.val_split_name] = ds[cfg.val_split_name].shuffle().select(range(100))

        if not cfg.eval_only:
            ds[cfg.val_split_name] = ds[cfg.val_split_name].shuffle().select(range(500))

        ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=cfg.num_proc, remove_columns=ds["train"].column_names)

    collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=16,
        response_template=cfg.response_template_ids
    )

    if training_args.process_index == 0:
        group = os.environ["WANDB_RUN_GROUP"]
        training_args.hub_model_id = f"nbroad/nbroad-odesia-{group}-{wandb.run.id}"
        training_args.hub_token = os.environ["HF_WRITE_PERSONAL"]

    prefix = ""

    if cfg.eval_only:
        if "awq" in model_config.model_name_or_path.lower():
            prefix = "awq_"
        if model_config.load_in_4bit:
            prefix += "int4_"
        elif model_config.torch_dtype == "bfloat16":
            prefix += "bf16_"
        elif model_config.torch_dtype == "float16":
            prefix += "fp16_"

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=(
            ds[cfg.val_split_name] if training_args.eval_strategy != "no" else None
        ),
        processing_class=tokenizer,
        data_collator=collator,
        # compute_metrics=partial(compute_metrics, prefix=prefix),
    )

    if training_args.process_index == 0:

        trainer.model.config.update(
            {
                "wandb_id": wandb.run.id,
                "fold": cfg.fold,
                "group": group,
                "dataset": script_args.dataset_name,
            }
        )

    if not cfg.eval_only:
        if training_args.resume_from_checkpoint is not None:
            os.chdir(Path(training_args.resume_from_checkpoint).parent)
        trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
    else:
        metrics = trainer.evaluate()

        # if training_args.process_index == 0:
        #     met = [x for x in metrics if "accuracy" in x][0]

        #     result = add_metric_to_card(
        #         repo=training_args.hub_model_id,
        #         metrics_pretty_name=met,
        #         metrics_value=metrics[met],
        #         dataset_id=script_args.dataset_name,
        #         dataset_split=cfg.val_split_name,
        #         model_path=model_config.model_name_or_path,
        #         model_dtype=model_config.torch_dtype,
        #         token=os.environ["HF_WRITE_PERSONAL"],
        #     )
        #     print(result)

    if not cfg.eval_only:
        # Save and push to hub
        trainer.save_model(training_args.output_dir)
        if training_args.push_to_hub:
            trainer.push_to_hub(
                dataset_name=script_args.dataset_name,
                model_name=model_config.model_name_or_path,
                tags=cfg.hub_repo_tags,
            )

    if training_args.process_index == 0:
        wandb.finish()


if __name__ == "__main__":
    main()
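
Note: the DataCollatorForCompletionOnlyLM in train.py trains on the assistant response only; every position up to and including the response template (token id 4 here, per response_template_ids) gets label -100. A self-contained sketch of that masking logic, with made-up token ids for illustration:

input_ids = [1, 17, 29, 4, 88, 42, 2]  # toy sequence; 4 marks the start of the response
template = [4]

# Find the template, then mask every position up to and including it with -100,
# the index PyTorch's cross-entropy loss ignores.
start = next(
    i for i in range(len(input_ids)) if input_ids[i : i + len(template)] == template
)
labels = [-100] * (start + len(template)) + input_ids[start + len(template) :]
print(labels)  # [-100, -100, -100, -100, 88, 42, 2]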
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:324472939fe55486751170cdbabb03b6b7afb7633123594ce8afe3efc01bf07b
size 5624
utils.py
ADDED
File without changes
wandb/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
{"time":"2025-02-02T23:54:51.678061797Z","level":"INFO","msg":"stream: starting","core version":"0.19.5","symlink path":"/workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/run-20250202_235451-rfjfhgaw/logs/debug-core.log"}
{"time":"2025-02-02T23:54:51.800734318Z","level":"INFO","msg":"created new stream","id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808334974Z","level":"INFO","msg":"stream: started","id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808441429Z","level":"INFO","msg":"writer: Do: started","stream_id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808480253Z","level":"INFO","msg":"handler: started","stream_id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808499495Z","level":"INFO","msg":"sender: started","stream_id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:52.036289894Z","level":"INFO","msg":"Starting system monitor"}
wandb/debug.log
ADDED
@@ -0,0 +1,25 @@
2025-02-02 23:54:51,374 INFO MainThread:1740 [wandb_setup.py:_flush():68] Current SDK version is 0.19.5
2025-02-02 23:54:51,376 INFO MainThread:1740 [wandb_setup.py:_flush():68] Configure stats pid to 1740
2025-02-02 23:54:51,377 INFO MainThread:1740 [wandb_setup.py:_flush():68] Loading settings from /root/.config/wandb/settings
2025-02-02 23:54:51,378 INFO MainThread:1740 [wandb_setup.py:_flush():68] Loading settings from /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/settings
2025-02-02 23:54:51,379 INFO MainThread:1740 [wandb_setup.py:_flush():68] Loading settings from environment variables
2025-02-02 23:54:51,382 INFO MainThread:1740 [wandb_init.py:setup_run_log_directory():637] Logging user logs to /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/run-20250202_235451-rfjfhgaw/logs/debug.log
2025-02-02 23:54:51,384 INFO MainThread:1740 [wandb_init.py:setup_run_log_directory():638] Logging internal logs to /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/run-20250202_235451-rfjfhgaw/logs/debug-internal.log
2025-02-02 23:54:51,385 INFO MainThread:1740 [wandb_init.py:init():756] calling init triggers
2025-02-02 23:54:51,386 INFO MainThread:1740 [wandb_init.py:init():761] wandb.init called with sweep_config: {}
config: {'time_start': '2025-02-02_23-54-50', 'DEBUG': False, 'debug_model': 'unsloth/Qwen2.5-7B-bnb-4bit', 'fold': 0, 'random_seed': True, 'train_on_all_folds': False, 'eval_only': False, 'merge_adapters': False, 'wandb_id': None, 'val_split_name': 'val', 'pad_token': '<pad>', 'response_template_ids': [4], 'num_proc': 20, 'hub_repo_tags': ['odesia'], 'script_args': {'dataset_name': 'nbroad/odesia-combined-v3', 'config': None, 'gradient_checkpointing_use_reentrant': True, 'ignore_bias_buffers': False}, 'model_config': {'model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'torch_dtype': 'bfloat16', 'attn_implementation': 'flash_attention_2', 'use_peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'], 'lora_modules_to_save': None, 'lora_task_type': 'CAUSAL_LM', 'use_rslora': True, 'load_in_8bit': False, 'load_in_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'use_bnb_nested_quant': True}, 'training_args': {'resume_from_checkpoint': None, 'output_dir': './', 'num_train_epochs': 1, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'warmup_ratio': 0.1, 'fp16': False, 'bf16': True, 'eval_strategy': 'steps', 'save_strategy': 'steps', 'eval_steps': 20, 'save_steps': 20, 'save_total_limit': 2, 'logging_steps': 2, 'run_name': None, 'weight_decay': 0.01, 'report_to': 'wandb', 'learning_rate': 4e-05, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'gradient_checkpointing': True, 'gradient_accumulation_steps': 8, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'optim': 'adamw_torch', 'dataloader_num_workers': 1, 'seed': 18, 'max_grad_norm': 2.0, 'load_best_model_at_end': True, 'push_to_hub': True, 'hub_private_repo': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': False, 'ddp_find_unused_parameters': False, 'use_liger_kernel': True}, '_wandb': {}}
2025-02-02 23:54:51,387 INFO MainThread:1740 [wandb_init.py:init():789] starting backend
2025-02-02 23:54:51,611 INFO MainThread:1740 [wandb_init.py:init():793] sending inform_init request
2025-02-02 23:54:51,671 INFO MainThread:1740 [backend.py:_multiprocessing_setup():97] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-02-02 23:54:51,672 INFO MainThread:1740 [wandb_init.py:init():808] backend started and connected
2025-02-02 23:54:51,678 INFO MainThread:1740 [wandb_init.py:init():901] updated telemetry
2025-02-02 23:54:51,794 INFO MainThread:1740 [wandb_init.py:init():926] communicating run to backend with 90.0 second timeout
2025-02-02 23:54:52,004 INFO MainThread:1740 [wandb_init.py:init():984] starting run threads in backend
2025-02-02 23:54:52,197 INFO MainThread:1740 [wandb_run.py:_console_start():2385] atexit reg
2025-02-02 23:54:52,198 INFO MainThread:1740 [wandb_run.py:_redirect():2235] redirect: wrap_raw
2025-02-02 23:54:52,198 INFO MainThread:1740 [wandb_run.py:_redirect():2300] Wrapping output streams.
2025-02-02 23:54:52,199 INFO MainThread:1740 [wandb_run.py:_redirect():2325] Redirects installed.
2025-02-02 23:54:52,210 INFO MainThread:1740 [wandb_init.py:init():1026] run started, returning control to user process
2025-02-02 23:55:34,633 INFO MainThread:1740 [wandb_run.py:_config_callback():1253] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': {'v_proj', 'k_proj', 'q_proj', 'down_proj', 'gate_proj', 'up_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': True, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 131072, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'sliding_window': 32768, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 100000000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', '_attn_implementation_autoset': True, 'transformers_version': '4.48.2', 'model_type': 'mistral', 'wandb_id': 'rfjfhgaw', 'fold': 0, 'group': 'clm', 'dataset': 'nbroad/odesia-combined-v3', 'output_dir': './', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 4e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 2.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb02_23-54-50_7c024bd6c651', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 20, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 18, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 20, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 'nbroad/nbroad-odesia-clm-rfjfhgaw', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': True, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'dataset_text_field': 'text', 'packing': False, 'max_seq_length': 1024, 'dataset_num_proc': None, 'dataset_batch_size': 1000, 'model_init_kwargs': None, 'dataset_kwargs': {}, 'eval_packing': None, 'num_of_sequences': 1024, 'chars_per_token': '<CHARS_PER_TOKEN>', 'use_liger': False}
2025-02-02 23:55:34,642 INFO MainThread:1740 [wandb_config.py:__setitem__():154] config set model/num_parameters = 8063455232 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7fbb092ff110>>
2025-02-02 23:55:34,642 INFO MainThread:1740 [wandb_run.py:_config_callback():1253] config_cb model/num_parameters 8063455232 None
wandb/run-20250202_235451-rfjfhgaw/files/output.log
ADDED
@@ -0,0 +1,46 @@
Applied Liger kernels to Qwen2
Downloading shards: 100%|████████████████████████████████████████| 4/4 [00:01<00:00, 3.46it/s]
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|████████████████████████████████████████| 4/4 [00:00<00:00, 5.79it/s]
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
README.md: 100%|████████████████████████████████████████| 212/212 [00:00<00:00, 836kB/s]
train_both.parquet: 100%|████████████████████████████████████████| 34.2M/34.2M [00:00<00:00, 42.3MB/s]
val_both.parquet: 100%|████████████████████████████████████████| 11.3M/11.3M [00:00<00:00, 43.1MB/s]
Generating train split: 100%|████████████████████████████████████████| 69123/69123 [00:00<00:00, 247870.63 examples/s]
Generating val split: 100%|████████████████████████████████████████| 20350/20350 [00:00<00:00, 233244.21 examples/s]
Map (num_proc=20): 100%|████████████████████████████████████████| 69123/69123 [00:14<00:00, 4881.30 examples/s]
Map (num_proc=20): 100%|████████████████████████████████████████| 500/500 [00:04<00:00, 118.19 examples/s]
/usr/local/lib/python3.11/dist-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a processing_class with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to overflow issues when training a model in half-precision. You might consider adding `processing_class.padding_side = 'right'` to your code.
  warnings.warn(
[2025-02-02 23:55:28,573][accelerate.utils.other][WARNING] - Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
The model is not an instance of PreTrainedModel. No liger kernels will be applied.
wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
  0%|          | 0/360 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  1%|█         | 2/360 [00:59<2:51:42, 28.78s/it]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
{'loss': 1.2229, 'grad_norm': 24.9372501373291, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.01}
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  3%|██████▋   | 12/360 [05:36<2:41:33, 27.85s/it]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
{'loss': 1.1054, 'grad_norm': 20.7418155670166, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.01}
{'loss': 0.9977, 'grad_norm': 15.117389678955078, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.02}
{'loss': 0.788, 'grad_norm': 9.767716407775879, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.02}
{'loss': 0.5502, 'grad_norm': 7.186247825622559, 'learning_rate': 1.1111111111111113e-05, 'epoch': 0.03}
{'loss': 0.4154, 'grad_norm': 5.886741638183594, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.03}
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  4%|███████▏  | 13/360 [06:03<2:39:43, 27.62s/it]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  5%|██████████▌  | 19/360 [08:57<2:45:26, 29.11s/it]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
{'loss': 0.2891, 'grad_norm': 3.077014684677124, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.04}
{'loss': 0.2373, 'grad_norm': 2.210777521133423, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.04}
{'loss': 0.2191, 'grad_norm': 2.5989363193511963, 'learning_rate': 2e-05, 'epoch': 0.05}
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  6%|███████████  | 20/360 [09:55<2:45:24, 29.19s/it]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
{'loss': 0.1995, 'grad_norm': 3.38708233833313, 'learning_rate': 2.2222222222222227e-05, 'epoch': 0.06}
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
{'eval_loss': 0.22643855214118958, 'eval_runtime': 28.1991, 'eval_samples_per_second': 17.731, 'eval_steps_per_second': 0.745, 'epoch': 0.06}
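
Note: the 360 total steps in the progress bar above are consistent with the config and hardware metadata, assuming all 6 A40s listed in wandb-metadata.json took part in training. A quick sanity check:

examples = 69123        # "Generating train split" above
per_device_bs = 4       # training_args.per_device_train_batch_size
grad_accum = 8          # training_args.gradient_accumulation_steps
gpus = 6                # gpu_count in wandb-metadata.json (assumed all used)

effective_batch = per_device_bs * grad_accum * gpus  # 192 examples per optimizer step
print(examples / effective_batch)                    # ~360 steps for one epoch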
wandb/run-20250202_235451-rfjfhgaw/files/requirements.txt
ADDED
@@ -0,0 +1,213 @@
pytz==2025.1
antlr4-python3-runtime==4.9.3
xxhash==3.5.0
tzdata==2025.1
einops==0.8.0
typing_extensions==4.12.2
tqdm==4.67.1
smmap==5.0.2
setproctitle==1.3.4
sentry-sdk==2.20.0
safetensors==0.5.2
regex==2024.11.6
python-dotenv==1.0.1
pyarrow==19.0.0
protobuf==5.29.3
propcache==0.2.1
omegaconf==2.3.0
multidict==6.1.0
mdurl==0.1.2
hf_transfer==0.1.9
frozenlist==1.5.0
docker-pycreds==0.4.0
dill==0.3.8
click==8.1.8
annotated-types==0.7.0
aiohappyeyeballs==2.4.4
yarl==1.18.3
pydantic_core==2.27.2
pandas==2.2.3
multiprocess==0.70.16
markdown-it-py==3.0.0
hydra-core==1.3.2
huggingface-hub==0.28.1
gitdb==4.0.12
aiosignal==1.3.2
tokenizers==0.21.0
rich==13.9.4
pydantic==2.10.6
GitPython==3.1.44
aiohttp==3.11.11
wandb==0.19.5
transformers==4.48.2
accelerate==1.3.0
peft==0.14.0
datasets==3.2.0
trl==0.14.0
flash_attn==2.7.4.post1
scipy==1.15.1
liger_kernel==0.5.2
entrypoints==0.4
jupyter_client==7.4.9
nbclassic==1.1.0
notebook==6.5.5
pyzmq==24.0.1
PyYAML==6.0.2
Send2Trash==1.8.3
anyio==4.6.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
babel==2.16.0
beautifulsoup4==4.12.3
bleach==6.1.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.3.2
comm==0.2.2
debugpy==1.8.5
decorator==5.1.1
defusedxml==0.7.1
executing==2.1.0
fastjsonschema==2.20.0
fqdn==1.5.1
h11==0.14.0
httpcore==1.0.5
httpx==0.27.2
idna==3.10
ipykernel==6.29.5
ipython==8.27.0
ipython-genutils==0.2.0
ipywidgets==8.1.5
isoduration==20.11.0
jedi==0.19.1
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter-archive==3.4.0
jupyter_contrib_core==0.4.2
jupyter_contrib_nbextensions==0.7.0
jupyter_core==5.7.2
jupyter-events==0.10.0
jupyter-highlight-selected-word==0.2.0
jupyter-lsp==2.2.5
jupyter_nbextensions_configurator==0.6.4
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
lxml==5.3.0
matplotlib-inline==0.1.7
mistune==3.0.2
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
notebook_shim==0.2.4
overrides==7.7.0
packaging==24.1
pandocfilters==1.5.1
parso==0.8.4
pexpect==4.9.0
platformdirs==4.3.6
prometheus_client==0.21.0
prompt_toolkit==3.0.47
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pycparser==2.22
Pygments==2.18.0
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
referencing==0.35.1
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.20.0
sniffio==1.3.1
soupsieve==2.6
stack-data==0.6.3
terminado==0.18.1
tinycss2==1.3.0
tornado==6.4.1
traitlets==5.14.3
types-python-dateutil==2.9.0.20240906
uri-template==1.3.0
urllib3==2.2.3
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
widgetsnbextension==4.0.13
Jinja2==3.1.3
MarkupSafe==2.1.5
filelock==3.13.1
fsspec==2024.2.0
mpmath==1.3.0
networkx==3.2.1
numpy==1.26.3
nvidia-cublas-cu12==12.4.2.65
nvidia-cuda-cupti-cu12==12.4.99
nvidia-cuda-nvrtc-cu12==12.4.99
nvidia-cuda-runtime-cu12==12.4.99
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.0.44
nvidia-curand-cu12==10.3.5.119
nvidia-cusolver-cu12==11.6.0.99
nvidia-cusparse-cu12==12.3.0.142
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.4.99
nvidia-nvtx-cu12==12.4.99
pillow==10.2.0
sympy==1.12
torch==2.4.1+cu124
torchaudio==2.4.1+cu124
torchvision==0.19.1+cu124
triton==3.0.0
pip==24.2
setuptools==75.1.0
wheel==0.44.0
PyGObject==3.42.1
PyJWT==2.3.0
SecretStorage==3.3.1
blinker==1.4
cryptography==3.4.8
dbus-python==1.2.18
distro==1.7.0
httplib2==0.20.2
importlib-metadata==4.6.4
jeepney==0.7.1
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
more-itertools==8.10.0
oauthlib==3.2.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu4
six==1.16.0
wadllib==1.3.6
zipp==1.0.0
autocommand==2.2.2
backports.tarfile==1.2.0
importlib_metadata==8.0.0
importlib_resources==6.4.0
inflect==7.3.1
jaraco.collections==5.1.0
jaraco.context==5.3.0
jaraco.functools==4.0.1
jaraco.text==3.12.1
more-itertools==10.3.0
packaging==24.1
platformdirs==4.2.2
tomli==2.0.1
typeguard==4.3.0
typing_extensions==4.12.2
wheel==0.43.0
zipp==3.19.2
wandb/run-20250202_235451-rfjfhgaw/files/wandb-metadata.json
ADDED
@@ -0,0 +1,75 @@
{
  "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.35",
  "python": "CPython 3.11.10",
  "startedAt": "2025-02-02T23:54:51.674481Z",
  "args": [
    "-cn",
    "m"
  ],
  "program": "/workspace/odesia-2025/train/train.py",
  "codePath": "train/train.py",
  "git": {
    "remote": "https://github.com/nbroad1881/odesia-2025.git",
    "commit": "238d6f7f91f6ca9c27bf1143df4b90379df36ada"
  },
  "email": "[email protected]",
  "root": "/workspace/odesia-2025/train/outputs/2025-02-02/23-54-50",
  "host": "7c024bd6c651",
  "executable": "/usr/bin/python",
  "cpu_count": 48,
  "cpu_count_logical": 96,
  "gpu": "NVIDIA A40",
  "gpu_count": 6,
  "disk": {
    "/": {
      "total": "214748364800",
      "used": "1626214400"
    }
  },
  "memory": {
    "total": "540662632448"
  },
  "cpu": {
    "count": 48,
    "countLogical": 96
  },
  "gpu_nvidia": [
    {
      "name": "NVIDIA A40",
      "memoryTotal": "48305799168",
      "cudaCores": 10752,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A40",
      "memoryTotal": "48305799168",
      "cudaCores": 10752,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A40",
      "memoryTotal": "48305799168",
      "cudaCores": 10752,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A40",
      "memoryTotal": "48305799168",
      "cudaCores": 10752,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A40",
      "memoryTotal": "48305799168",
      "cudaCores": 10752,
      "architecture": "Ampere"
    },
    {
      "name": "NVIDIA A40",
      "memoryTotal": "48305799168",
      "cudaCores": 10752,
      "architecture": "Ampere"
    }
  ],
  "cudaVersion": "12.7"
}
wandb/run-20250202_235451-rfjfhgaw/logs/debug-core.log
ADDED
@@ -0,0 +1,6 @@
{"time":"2025-02-02T23:54:51.431322614Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpj8nv__ot/port-1740.txt","pid":1740,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false}
{"time":"2025-02-02T23:54:51.432486863Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":1740}
{"time":"2025-02-02T23:54:51.432545288Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":34123,"Zone":""}}
{"time":"2025-02-02T23:54:51.611902427Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:44070"}
{"time":"2025-02-02T23:54:51.675927028Z","level":"INFO","msg":"handleInformInit: received","streamId":"rfjfhgaw","id":"127.0.0.1:44070"}
{"time":"2025-02-02T23:54:51.809336738Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"rfjfhgaw","id":"127.0.0.1:44070"}
wandb/run-20250202_235451-rfjfhgaw/logs/debug-internal.log
ADDED
@@ -0,0 +1,7 @@
{"time":"2025-02-02T23:54:51.678061797Z","level":"INFO","msg":"stream: starting","core version":"0.19.5","symlink path":"/workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/run-20250202_235451-rfjfhgaw/logs/debug-core.log"}
{"time":"2025-02-02T23:54:51.800734318Z","level":"INFO","msg":"created new stream","id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808334974Z","level":"INFO","msg":"stream: started","id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808441429Z","level":"INFO","msg":"writer: Do: started","stream_id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808480253Z","level":"INFO","msg":"handler: started","stream_id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:51.808499495Z","level":"INFO","msg":"sender: started","stream_id":"rfjfhgaw"}
{"time":"2025-02-02T23:54:52.036289894Z","level":"INFO","msg":"Starting system monitor"}
wandb/run-20250202_235451-rfjfhgaw/logs/debug.log
ADDED
@@ -0,0 +1,25 @@
2025-02-02 23:54:51,374 INFO MainThread:1740 [wandb_setup.py:_flush():68] Current SDK version is 0.19.5
2025-02-02 23:54:51,376 INFO MainThread:1740 [wandb_setup.py:_flush():68] Configure stats pid to 1740
2025-02-02 23:54:51,377 INFO MainThread:1740 [wandb_setup.py:_flush():68] Loading settings from /root/.config/wandb/settings
2025-02-02 23:54:51,378 INFO MainThread:1740 [wandb_setup.py:_flush():68] Loading settings from /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/settings
2025-02-02 23:54:51,379 INFO MainThread:1740 [wandb_setup.py:_flush():68] Loading settings from environment variables
2025-02-02 23:54:51,382 INFO MainThread:1740 [wandb_init.py:setup_run_log_directory():637] Logging user logs to /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/run-20250202_235451-rfjfhgaw/logs/debug.log
2025-02-02 23:54:51,384 INFO MainThread:1740 [wandb_init.py:setup_run_log_directory():638] Logging internal logs to /workspace/odesia-2025/train/outputs/2025-02-02/23-54-50/wandb/run-20250202_235451-rfjfhgaw/logs/debug-internal.log
2025-02-02 23:54:51,385 INFO MainThread:1740 [wandb_init.py:init():756] calling init triggers
2025-02-02 23:54:51,386 INFO MainThread:1740 [wandb_init.py:init():761] wandb.init called with sweep_config: {}
config: {'time_start': '2025-02-02_23-54-50', 'DEBUG': False, 'debug_model': 'unsloth/Qwen2.5-7B-bnb-4bit', 'fold': 0, 'random_seed': True, 'train_on_all_folds': False, 'eval_only': False, 'merge_adapters': False, 'wandb_id': None, 'val_split_name': 'val', 'pad_token': '<pad>', 'response_template_ids': [4], 'num_proc': 20, 'hub_repo_tags': ['odesia'], 'script_args': {'dataset_name': 'nbroad/odesia-combined-v3', 'config': None, 'gradient_checkpointing_use_reentrant': True, 'ignore_bias_buffers': False}, 'model_config': {'model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'torch_dtype': 'bfloat16', 'attn_implementation': 'flash_attention_2', 'use_peft': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj'], 'lora_modules_to_save': None, 'lora_task_type': 'CAUSAL_LM', 'use_rslora': True, 'load_in_8bit': False, 'load_in_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'use_bnb_nested_quant': True}, 'training_args': {'resume_from_checkpoint': None, 'output_dir': './', 'num_train_epochs': 1, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'warmup_ratio': 0.1, 'fp16': False, 'bf16': True, 'eval_strategy': 'steps', 'save_strategy': 'steps', 'eval_steps': 20, 'save_steps': 20, 'save_total_limit': 2, 'logging_steps': 2, 'run_name': None, 'weight_decay': 0.01, 'report_to': 'wandb', 'learning_rate': 4e-05, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'gradient_checkpointing': True, 'gradient_accumulation_steps': 8, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'optim': 'adamw_torch', 'dataloader_num_workers': 1, 'seed': 18, 'max_grad_norm': 2.0, 'load_best_model_at_end': True, 'push_to_hub': True, 'hub_private_repo': True, 'lr_scheduler_type': 'cosine', 'remove_unused_columns': False, 'ddp_find_unused_parameters': False, 'use_liger_kernel': True}, '_wandb': {}}
2025-02-02 23:54:51,387 INFO MainThread:1740 [wandb_init.py:init():789] starting backend
2025-02-02 23:54:51,611 INFO MainThread:1740 [wandb_init.py:init():793] sending inform_init request
2025-02-02 23:54:51,671 INFO MainThread:1740 [backend.py:_multiprocessing_setup():97] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-02-02 23:54:51,672 INFO MainThread:1740 [wandb_init.py:init():808] backend started and connected
2025-02-02 23:54:51,678 INFO MainThread:1740 [wandb_init.py:init():901] updated telemetry
2025-02-02 23:54:51,794 INFO MainThread:1740 [wandb_init.py:init():926] communicating run to backend with 90.0 second timeout
2025-02-02 23:54:52,004 INFO MainThread:1740 [wandb_init.py:init():984] starting run threads in backend
2025-02-02 23:54:52,197 INFO MainThread:1740 [wandb_run.py:_console_start():2385] atexit reg
2025-02-02 23:54:52,198 INFO MainThread:1740 [wandb_run.py:_redirect():2235] redirect: wrap_raw
2025-02-02 23:54:52,198 INFO MainThread:1740 [wandb_run.py:_redirect():2300] Wrapping output streams.
2025-02-02 23:54:52,199 INFO MainThread:1740 [wandb_run.py:_redirect():2325] Redirects installed.
2025-02-02 23:54:52,210 INFO MainThread:1740 [wandb_init.py:init():1026] run started, returning control to user process
23 |
+
2025-02-02 23:55:34,633 INFO MainThread:1740 [wandb_run.py:_config_callback():1253] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': {'v_proj', 'k_proj', 'q_proj', 'down_proj', 'gate_proj', 'up_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': True, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 131072, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'sliding_window': 32768, 'head_dim': 128, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': False, 'rope_theta': 100000000.0, 'attention_dropout': 0.0, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['MistralForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'mistralai/Ministral-8B-Instruct-2410', '_attn_implementation_autoset': True, 'transformers_version': '4.48.2', 'model_type': 'mistral', 'wandb_id': 'rfjfhgaw', 'fold': 0, 'group': 'clm', 'dataset': 'nbroad/odesia-combined-v3', 'output_dir': './', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 4e-05, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 2.0, 'num_train_epochs': 1, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 
'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': './runs/Feb02_23-54-50_7c024bd6c651', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 20, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 18, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 20, 'dataloader_num_workers': 1, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': './', 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': False, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': 'nbroad/nbroad-odesia-clm-rfjfhgaw', 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': True, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'dataset_text_field': 'text', 'packing': False, 'max_seq_length': 1024, 'dataset_num_proc': None, 'dataset_batch_size': 1000, 'model_init_kwargs': None, 'dataset_kwargs': {}, 'eval_packing': None, 'num_of_sequences': 1024, 'chars_per_token': '<CHARS_PER_TOKEN>', 'use_liger': False}
|
24 |
+
2025-02-02 23:55:34,642 INFO MainThread:1740 [wandb_config.py:__setitem__():154] config set model/num_parameters = 8063455232 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7fbb092ff110>>
|
25 |
+
2025-02-02 23:55:34,642 INFO MainThread:1740 [wandb_run.py:_config_callback():1253] config_cb model/num_parameters 8063455232 None
|
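The config dump at line 10 of this log mirrors .hydra/config.yaml, with null fields (e.g. time_start) resolved at runtime, which suggests the composed Hydra config is handed straight to wandb.init. A hypothetical sketch of that wiring; this is an assumption about train.py, not a copy of it, and the project name is illustrative:

    import wandb
    from omegaconf import OmegaConf

    # Hypothetical sketch: load the composed Hydra config and register it as
    # the wandb run config. train.py may do this differently.
    cfg = OmegaConf.load(".hydra/config.yaml")
    run = wandb.init(
        project="odesia",  # illustrative, not taken from this repo
        config=OmegaConf.to_container(cfg, resolve=True),
    )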
wandb/run-20250202_235451-rfjfhgaw/run-rfjfhgaw.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e84bdacc2e5e5f069d65fe782c1a7e0228eb45a0223d33fe26afbffbc6e0ce9
+size 294912
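The .wandb run file is stored in Git LFS, so the repo keeps only this pointer. A minimal sketch for verifying a fetched copy against the pointer's oid and size:

    import hashlib
    import os

    # Sketch: verify the fetched LFS object against the pointer above.
    path = "wandb/run-20250202_235451-rfjfhgaw/run-rfjfhgaw.wandb"
    expected_oid = "2e84bdacc2e5e5f069d65fe782c1a7e0228eb45a0223d33fe26afbffbc6e0ce9"
    expected_size = 294912

    assert os.path.getsize(path) == expected_size, "size mismatch"
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    assert digest == expected_oid, "oid mismatch"
    print("LFS object verified")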