kajuma commited on
Commit
026567a
·
verified ·
1 Parent(s): cfaffb2

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1121,3 +1121,4 @@ checkpoints/iter_0006362/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
1121
  checkpoints/iter_0006362/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
1122
  checkpoints/iter_0006362/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
1123
  checkpoints/iter_0006362/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
 
 
1121
  checkpoints/iter_0006362/__6_1.distcp filter=lfs diff=lfs merge=lfs -text
1122
  checkpoints/iter_0006362/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
1123
  checkpoints/iter_0006362/__7_1.distcp filter=lfs diff=lfs merge=lfs -text
1124
+ wandb/wandb/run-20251223_145018-ogu6y2pr/run-ogu6y2pr.wandb filter=lfs diff=lfs merge=lfs -text
args.json ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "use_ray": false,
3
+ "ray_exp_name": null,
4
+ "device_groups": null,
5
+ "model": "Qwen/Qwen3-0.6B-Base",
6
+ "model_type": "qwen3_plt",
7
+ "model_revision": null,
8
+ "task_type": "causal_lm",
9
+ "torch_dtype": "bfloat16",
10
+ "attn_impl": null,
11
+ "new_special_tokens": [],
12
+ "num_labels": null,
13
+ "problem_type": null,
14
+ "rope_scaling": null,
15
+ "device_map": null,
16
+ "max_memory": {},
17
+ "max_model_len": null,
18
+ "local_repo_path": null,
19
+ "init_strategy": null,
20
+ "template": "qwen3",
21
+ "system": null,
22
+ "max_length": 16384,
23
+ "truncation_strategy": "right",
24
+ "max_pixels": null,
25
+ "agent_template": null,
26
+ "norm_bbox": null,
27
+ "use_chat_template": false,
28
+ "padding_free": true,
29
+ "padding_side": "right",
30
+ "loss_scale": "all",
31
+ "sequence_parallel_size": 1,
32
+ "response_prefix": null,
33
+ "template_backend": "swift",
34
+ "dataset": [],
35
+ "val_dataset": [],
36
+ "cached_dataset": [
37
+ "/workspace/2of3"
38
+ ],
39
+ "cached_val_dataset": [],
40
+ "split_dataset_ratio": 0.0,
41
+ "data_seed": 42,
42
+ "dataset_num_proc": 32,
43
+ "load_from_cache_file": false,
44
+ "dataset_shuffle": true,
45
+ "val_dataset_shuffle": false,
46
+ "streaming": false,
47
+ "interleave_prob": null,
48
+ "stopping_strategy": "first_exhausted",
49
+ "shuffle_buffer_size": 1000,
50
+ "download_mode": "reuse_dataset_if_exists",
51
+ "columns": {},
52
+ "strict": false,
53
+ "remove_unused_columns": true,
54
+ "model_name": null,
55
+ "model_author": null,
56
+ "custom_dataset_info": [],
57
+ "quant_method": null,
58
+ "quant_bits": null,
59
+ "hqq_axis": null,
60
+ "bnb_4bit_compute_dtype": "bfloat16",
61
+ "bnb_4bit_quant_type": "nf4",
62
+ "bnb_4bit_use_double_quant": true,
63
+ "bnb_4bit_quant_storage": null,
64
+ "max_new_tokens": null,
65
+ "temperature": null,
66
+ "top_k": 50,
67
+ "top_p": 0.9,
68
+ "repetition_penalty": 1.0,
69
+ "num_beams": 1,
70
+ "stream": false,
71
+ "stop_words": [],
72
+ "logprobs": false,
73
+ "top_logprobs": null,
74
+ "ckpt_dir": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT",
75
+ "lora_modules": [],
76
+ "tuner_backend": "peft",
77
+ "train_type": "full",
78
+ "adapters": [],
79
+ "external_plugins": [],
80
+ "seed": 42,
81
+ "model_kwargs": {},
82
+ "load_args": false,
83
+ "load_data_args": false,
84
+ "packing": true,
85
+ "packing_length": 16384,
86
+ "packing_num_proc": 1,
87
+ "lazy_tokenize": false,
88
+ "custom_register_path": [
89
+ "custom_model/custom_register.py"
90
+ ],
91
+ "use_hf": true,
92
+ "hub_token": null,
93
+ "ddp_timeout": 18000000,
94
+ "ddp_backend": null,
95
+ "ignore_args_error": false,
96
+ "use_swift_lora": false,
97
+ "freeze_llm": false,
98
+ "freeze_vit": true,
99
+ "freeze_aligner": true,
100
+ "freeze_parameters": [],
101
+ "freeze_parameters_regex": null,
102
+ "freeze_parameters_ratio": 0.0,
103
+ "trainable_parameters": [],
104
+ "trainable_parameters_regex": null,
105
+ "adapter_load": null,
106
+ "target_modules": [
107
+ "all-linear"
108
+ ],
109
+ "target_regex": null,
110
+ "modules_to_save": [],
111
+ "lora_rank": 8,
112
+ "lora_alpha": 32,
113
+ "lora_dropout": 0.05,
114
+ "lora_bias": "none",
115
+ "lora_dtype": null,
116
+ "use_rslora": false,
117
+ "rlhf_type": null,
118
+ "ref_load": null,
119
+ "ref_adapter_load": null,
120
+ "beta": 0.1,
121
+ "rpo_alpha": null,
122
+ "reference_free": false,
123
+ "label_smoothing": 0.0,
124
+ "f_divergence_type": "reverse_kl",
125
+ "loss_type": null,
126
+ "desirable_weight": 1.0,
127
+ "undesirable_weight": 1.0,
128
+ "calculate_KL": null,
129
+ "center_rewards_coefficient": null,
130
+ "generation_batch_size": null,
131
+ "steps_per_generation": null,
132
+ "num_generations": 8,
133
+ "max_completion_length": 512,
134
+ "importance_sampling_level": "token",
135
+ "tau_pos": 1.0,
136
+ "tau_neg": 1.05,
137
+ "epsilon": 0.2,
138
+ "epsilon_high": null,
139
+ "delta": null,
140
+ "use_vllm": true,
141
+ "vllm_mode": null,
142
+ "vllm_enable_prefix_caching": true,
143
+ "vllm_gpu_memory_utilization": 0.9,
144
+ "vllm_tensor_parallel_size": 1,
145
+ "vllm_max_model_len": null,
146
+ "vllm_enforce_eager": false,
147
+ "vllm_limit_mm_per_prompt": null,
148
+ "vllm_disable_cascade_attn": false,
149
+ "vllm_max_num_seqs": null,
150
+ "vllm_mm_processor_cache_gb": null,
151
+ "vllm_engine_kwargs": null,
152
+ "sleep_level": 0,
153
+ "offload_optimizer": false,
154
+ "offload_model": false,
155
+ "offload_bridge": false,
156
+ "vllm_server_base_url": null,
157
+ "vllm_server_host": null,
158
+ "vllm_server_port": [
159
+ 8000
160
+ ],
161
+ "vllm_server_timeout": 240.0,
162
+ "vllm_server_group_port": null,
163
+ "reward_funcs": [],
164
+ "reward_weights": null,
165
+ "cosine_min_len_value_wrong": -0.5,
166
+ "cosine_max_len_value_wrong": 0.0,
167
+ "cosine_min_len_value_correct": 1.0,
168
+ "cosine_max_len_value_correct": 0.5,
169
+ "cosine_max_len": null,
170
+ "repetition_n_grams": 3,
171
+ "repetition_max_penalty": -1.0,
172
+ "soft_max_length": null,
173
+ "soft_cache_length": null,
174
+ "dynamic_sample": false,
175
+ "max_resample_times": 3,
176
+ "overlong_filter": false,
177
+ "scale_rewards": "group",
178
+ "advantage_estimator": "grpo",
179
+ "kl_in_reward": false,
180
+ "wandb_log_unique_prompts": null,
181
+ "log_completions": false,
182
+ "rollout_importance_sampling_mode": null,
183
+ "rollout_importance_sampling_threshold": 2.0,
184
+ "log_rollout_offpolicy_metrics": false,
185
+ "off_policy_sequence_mask_delta": null,
186
+ "reward_model": null,
187
+ "reward_model_plugin": null,
188
+ "sync_ref_model": false,
189
+ "ref_model_sync_steps": 512,
190
+ "ref_model_mixup_alpha": 0.6,
191
+ "async_generate": false,
192
+ "move_model_batches": null,
193
+ "multi_turn_scheduler": null,
194
+ "max_turns": null,
195
+ "completion_length_limit_scope": "per_round",
196
+ "vllm_server_pass_dataset": false,
197
+ "log_entropy": false,
198
+ "top_entropy_quantile": 1.0,
199
+ "num_iterations": 1,
200
+ "check_model": true,
201
+ "padded_vocab_size": 151936,
202
+ "initialize_embedding": false,
203
+ "mlp_padding_free": false,
204
+ "load_safetensors": false,
205
+ "save_safetensors": false,
206
+ "ref_model": null,
207
+ "ref_adapters": [],
208
+ "merge_lora": false,
209
+ "max_shard_size": "5GB",
210
+ "train_dataloader_shuffle": true,
211
+ "dataloader_pin_memory": true,
212
+ "dataloader_persistent_workers": true,
213
+ "dataloader_prefetch_factor": 10,
214
+ "architectures": "Qwen3ForCausalLM",
215
+ "llm_architectures": "Qwen3ForCausalLM",
216
+ "max_epochs": 1,
217
+ "enable_dft_loss": false,
218
+ "enable_channel_loss": false,
219
+ "patch_size": 4,
220
+ "save_strategy": "steps",
221
+ "original_max_position_embeddings": null,
222
+ "partial_rotary_factor": null,
223
+ "use_shared_expert_gate": false,
224
+ "vit_gradient_checkpointing": true,
225
+ "vit_lr": null,
226
+ "aligner_lr": null,
227
+ "gradient_checkpointing_kwargs": null,
228
+ "linear_num_value_heads": null,
229
+ "linear_num_key_heads": null,
230
+ "linear_key_head_dim": null,
231
+ "linear_value_head_dim": null,
232
+ "linear_conv_kernel_dim": null,
233
+ "layer_types": null,
234
+ "mrope_interleaved": false,
235
+ "micro_batch_size": 2,
236
+ "global_batch_size": 256,
237
+ "recompute_granularity": "full",
238
+ "recompute_method": "uniform",
239
+ "recompute_num_layers": 1,
240
+ "recompute_modules": [
241
+ "core_attn"
242
+ ],
243
+ "use_cpu_initialization": false,
244
+ "deterministic_mode": false,
245
+ "train_iters": 6350,
246
+ "log_interval": 1,
247
+ "tensorboard_dir": "/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/runs",
248
+ "no_masked_softmax_fusion": false,
249
+ "no_bias_dropout_fusion": false,
250
+ "no_bias_swiglu_fusion": false,
251
+ "no_rope_fusion": false,
252
+ "no_gradient_accumulation_fusion": false,
253
+ "cross_entropy_loss_fusion": true,
254
+ "cross_entropy_fusion_impl": "native",
255
+ "calculate_per_token_loss": true,
256
+ "use_flash_attn": false,
257
+ "attention_backend": "flash",
258
+ "optimizer": "adam",
259
+ "optimizer_cpu_offload": false,
260
+ "optimizer_offload_fraction": 1.0,
261
+ "use_precision_aware_optimizer": true,
262
+ "main_grads_dtype": "fp32",
263
+ "main_params_dtype": "fp32",
264
+ "exp_avg_dtype": "fp32",
265
+ "exp_avg_sq_dtype": "fp32",
266
+ "dataloader_type": "cyclic",
267
+ "manual_gc": false,
268
+ "manual_gc_interval": 0,
269
+ "lr": 0.0001,
270
+ "lr_decay_style": "cosine",
271
+ "lr_decay_iters": null,
272
+ "lr_warmup_iters": 0,
273
+ "lr_warmup_fraction": 0.05,
274
+ "min_lr": 3e-06,
275
+ "weight_decay": 0.1,
276
+ "clip_grad": 1.0,
277
+ "adam_beta1": 0.9,
278
+ "adam_beta2": 0.95,
279
+ "adam_eps": 1e-08,
280
+ "sgd_momentum": 0.9,
281
+ "save": "/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727",
282
+ "save_interval": 100,
283
+ "save_retain_interval": null,
284
+ "no_save_optim": false,
285
+ "no_save_rng": false,
286
+ "load": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT",
287
+ "no_load_optim": false,
288
+ "no_load_rng": false,
289
+ "finetune": true,
290
+ "ckpt_format": "torch_dist",
291
+ "no_initialization": true,
292
+ "auto_detect_ckpt_format": true,
293
+ "exit_on_missing_checkpoint": true,
294
+ "async_save": false,
295
+ "use_persistent_ckpt_worker": false,
296
+ "ckpt_fully_parallel_load": false,
297
+ "ckpt_assume_constant_structure": false,
298
+ "distributed_backend": "nccl",
299
+ "local_rank": 0,
300
+ "use_distributed_optimizer": true,
301
+ "tensor_model_parallel_size": 1,
302
+ "pipeline_model_parallel_size": 1,
303
+ "decoder_first_pipeline_num_layers": null,
304
+ "decoder_last_pipeline_num_layers": null,
305
+ "account_for_embedding_in_pipeline_split": false,
306
+ "account_for_loss_in_pipeline_split": false,
307
+ "sequence_parallel": false,
308
+ "context_parallel_size": 1,
309
+ "tp_comm_overlap": false,
310
+ "overlap_grad_reduce": true,
311
+ "overlap_param_gather": true,
312
+ "distributed_timeout_minutes": 300000,
313
+ "num_layers_per_virtual_pipeline_stage": null,
314
+ "num_virtual_stages_per_pipeline_rank": null,
315
+ "microbatch_group_size_per_virtual_pipeline_stage": null,
316
+ "pipeline_model_parallel_layout": null,
317
+ "num_layers": 28,
318
+ "hidden_size": 1024,
319
+ "ffn_hidden_size": 3072,
320
+ "num_attention_heads": 16,
321
+ "group_query_attention": true,
322
+ "num_query_groups": 8,
323
+ "softmax_type": null,
324
+ "window_size": null,
325
+ "window_attn_skip_freq": null,
326
+ "max_position_embeddings": 32768,
327
+ "position_embedding_type": "rope",
328
+ "mrope_section": null,
329
+ "rotary_base": 1000000,
330
+ "rotary_percent": 1.0,
331
+ "rotary_interleaved": false,
332
+ "normalization": "RMSNorm",
333
+ "norm_epsilon": 1e-06,
334
+ "swiglu": true,
335
+ "quick_geglu": false,
336
+ "activation_func_clamp_value": null,
337
+ "glu_linear_offset": null,
338
+ "untie_embeddings_and_output_weights": false,
339
+ "disable_bias_linear": true,
340
+ "add_qkv_bias": false,
341
+ "attention_dropout": 0.0,
342
+ "hidden_dropout": 0.0,
343
+ "kv_channels": 128,
344
+ "qk_layernorm": true,
345
+ "qk_l2_norm": null,
346
+ "no_rope_freq": null,
347
+ "moe_apply_probs_on_input": null,
348
+ "transformer_impl": "transformer_engine",
349
+ "num_experts": null,
350
+ "moe_layer_freq": "1",
351
+ "moe_ffn_hidden_size": null,
352
+ "moe_shared_expert_intermediate_size": null,
353
+ "moe_router_topk": 2,
354
+ "moe_router_num_groups": null,
355
+ "moe_router_group_topk": null,
356
+ "moe_router_pre_softmax": false,
357
+ "moe_router_dtype": "fp32",
358
+ "moe_router_score_function": "softmax",
359
+ "moe_router_bias_update_rate": null,
360
+ "moe_router_enable_expert_bias": false,
361
+ "moe_router_topk_scaling_factor": null,
362
+ "moe_router_load_balancing_type": "aux_loss",
363
+ "expert_model_parallel_size": 1,
364
+ "expert_tensor_parallel_size": 1,
365
+ "moe_token_dispatcher_type": null,
366
+ "moe_enable_deepep": false,
367
+ "moe_grouped_gemm": true,
368
+ "moe_permute_fusion": false,
369
+ "moe_aux_loss_coeff": 0.0,
370
+ "moe_z_loss_coeff": null,
371
+ "moe_shared_expert_overlap": false,
372
+ "moe_layer_recompute": false,
373
+ "moe_expert_capacity_factor": null,
374
+ "moe_pad_expert_input_to_capacity": false,
375
+ "moe_token_drop_policy": null,
376
+ "multi_latent_attention": false,
377
+ "q_lora_rank": null,
378
+ "kv_lora_rank": 32,
379
+ "qk_head_dim": 128,
380
+ "qk_pos_emb_head_dim": 64,
381
+ "mtp_num_layers": null,
382
+ "mtp_loss_scaling_factor": 0.1,
383
+ "fp8_format": null,
384
+ "fp8_recipe": "delayed",
385
+ "fp8_amax_history_len": 1024,
386
+ "fp8_amax_compute_algo": "max",
387
+ "fp8_param_gather": false,
388
+ "fp16": false,
389
+ "bf16": true,
390
+ "apply_query_key_layer_scaling": false,
391
+ "attention_softmax_in_fp32": true,
392
+ "log_params_norm": false,
393
+ "log_throughput": false,
394
+ "tensorboard_log_interval": 1,
395
+ "tensorboard_queue_size": 50,
396
+ "log_timers_to_tensorboard": true,
397
+ "no_log_learning_rate_to_tensorboard": false,
398
+ "log_validation_ppl_to_tensorboard": true,
399
+ "log_memory_to_tensorboard": true,
400
+ "logging_level": "20",
401
+ "wandb_project": "plt",
402
+ "wandb_exp_name": "plt_1",
403
+ "wandb_save_dir": null,
404
+ "eval_iters": -1,
405
+ "eval_interval": 100,
406
+ "seq_length": 16384,
407
+ "num_workers": 32,
408
+ "no_data_sharding": false,
409
+ "megatron_extra_kwargs": {},
410
+ "add_version": true,
411
+ "rank": 0,
412
+ "global_world_size": 8,
413
+ "local_world_size": 8,
414
+ "model_suffix": "Qwen3-0.6B-Base",
415
+ "model_info": "ModelInfo(model_type='qwen3_plt', model_dir='/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', torch_dtype=torch.bfloat16, max_model_len=32768, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=False, config=None, task_type='causal_lm', num_labels=None)",
416
+ "model_meta": "ModelMeta(model_type='qwen3_plt', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B-Base', hf_model_id='Qwen/Qwen3-0.6B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-Base', hf_model_id='Qwen/Qwen3-1.7B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-Base', hf_model_id='Qwen/Qwen3-4B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-Base', hf_model_id='Qwen/Qwen3-8B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-Base', hf_model_id='Qwen/Qwen3-14B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-Base', hf_model_id='Qwen/Qwen3-32B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B', hf_model_id='Qwen/Qwen3-8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B', hf_model_id='Qwen/Qwen3-14B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B', hf_model_id='Qwen/Qwen3-32B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-0.6B-FP8', hf_model_id='Qwen/Qwen3-0.6B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B-FP8', hf_model_id='Qwen/Qwen3-1.7B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-FP8', hf_model_id='Qwen/Qwen3-4B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-FP8', hf_model_id='Qwen/Qwen3-8B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-FP8', hf_model_id='Qwen/Qwen3-14B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-FP8', hf_model_id='Qwen/Qwen3-32B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B-AWQ', hf_model_id='Qwen/Qwen3-4B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-8B-AWQ', hf_model_id='Qwen/Qwen3-8B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-14B-AWQ', hf_model_id='Qwen/Qwen3-14B-AWQ', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-32B-AWQ', hf_model_id='Qwen/Qwen3-32B-AWQ', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3', get_function=functools.partial(<function get_model_tokenizer_with_flash_attn at 0x798ff700be20>, automodel_class=<class 'modeling_qwen3.Qwen3ForCausalLM'>), model_arch=ModelKeys(arch_name='llama', embedding='model.embed_tokens', module_list='model.layers', lm_head='lm_head', q_proj='model.layers.{}.self_attn.q_proj', k_proj='model.layers.{}.self_attn.k_proj', v_proj='model.layers.{}.self_attn.v_proj', o_proj='model.layers.{}.self_attn.o_proj', attention='model.layers.{}.self_attn', mlp='model.layers.{}.mlp', down_proj='model.layers.{}.mlp.down_proj', qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None), architectures=['Qwen3ForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.51'], tags=[])",
417
+ "model_dir": "/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd",
418
+ "_val_dataset_exists": [],
419
+ "hub": "<class 'swift.hub.hub.HFHub'>",
420
+ "megatron_model_meta": "MegatronModelMeta(megatron_model_type='gpt', model_types=['qwen2', 'qwen2_5', 'qwq', 'qwq_preview', 'qwen2_5_math', 'llama', 'llama3', 'llama3_1', 'llama3_2', 'longwriter_llama3_1', 'codefuse_codellama', 'marco_o1', 'deepseek', 'deepseek_r1_distill', 'yi', 'yi_coder', 'sus', 'skywork_o1', 'openbuddy_llama', 'openbuddy_llama3', 'megrez', 'reflection', 'numina', 'ziya', 'mengzi3', 'qwen3', 'qwen3_thinking', 'qwen3_nothinking', 'qwen2_moe', 'qwen3_moe', 'qwen3_moe_thinking', 'qwen3_coder', 'internlm3', 'mimo', 'mimo_rl', 'moonlight', 'kimi_k2', 'deepseek_moe', 'deepseek_v2', 'deepseek_v2_5', 'deepseek_r1', 'dots1', 'ernie', 'glm4_5', 'deepseek_v3_1', 'ernie_thinking', 'gpt_oss', 'llama3_2_plt', 'qwen3_plt'], is_multimodal=False, bridge_cls=<class 'swift.megatron.model.gpt_bridge.GPTBridge'>, model_cls=<class 'swift.megatron.model.gpt_model.GPTModel'>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x798f7a795c60>, visual_cls=None, extra_args_provider=None)",
421
+ "extra_args": {
422
+ "model_dir": "/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd",
423
+ "is_multimodal": false,
424
+ "hf_model_type": "qwen3_plt",
425
+ "use_ray": false,
426
+ "ray_exp_name": null,
427
+ "device_groups": null,
428
+ "model": "Qwen/Qwen3-0.6B-Base",
429
+ "model_type": "qwen3_plt",
430
+ "model_revision": null,
431
+ "task_type": "causal_lm",
432
+ "torch_dtype": "bfloat16",
433
+ "attn_impl": null,
434
+ "new_special_tokens": [],
435
+ "num_labels": null,
436
+ "problem_type": null,
437
+ "rope_scaling": null,
438
+ "device_map": null,
439
+ "max_memory": {},
440
+ "max_model_len": null,
441
+ "local_repo_path": null,
442
+ "init_strategy": null,
443
+ "template": "qwen3",
444
+ "system": null,
445
+ "max_length": 16384,
446
+ "truncation_strategy": "right",
447
+ "max_pixels": null,
448
+ "agent_template": null,
449
+ "norm_bbox": null,
450
+ "use_chat_template": false,
451
+ "padding_free": true,
452
+ "padding_side": "right",
453
+ "sequence_parallel_size": 1,
454
+ "response_prefix": null,
455
+ "template_backend": "swift",
456
+ "dataset": [],
457
+ "val_dataset": [],
458
+ "cached_dataset": [
459
+ "/workspace/2of3"
460
+ ],
461
+ "cached_val_dataset": [],
462
+ "split_dataset_ratio": 0.0,
463
+ "data_seed": 42,
464
+ "dataset_num_proc": 32,
465
+ "load_from_cache_file": false,
466
+ "dataset_shuffle": true,
467
+ "val_dataset_shuffle": false,
468
+ "streaming": false,
469
+ "interleave_prob": null,
470
+ "stopping_strategy": "first_exhausted",
471
+ "shuffle_buffer_size": 1000,
472
+ "download_mode": "reuse_dataset_if_exists",
473
+ "columns": {},
474
+ "strict": false,
475
+ "remove_unused_columns": true,
476
+ "model_name": null,
477
+ "model_author": null,
478
+ "custom_dataset_info": [],
479
+ "quant_method": null,
480
+ "quant_bits": null,
481
+ "hqq_axis": null,
482
+ "bnb_4bit_compute_dtype": "bfloat16",
483
+ "bnb_4bit_quant_type": "nf4",
484
+ "bnb_4bit_use_double_quant": true,
485
+ "bnb_4bit_quant_storage": null,
486
+ "max_new_tokens": null,
487
+ "temperature": null,
488
+ "top_k": 50,
489
+ "top_p": 0.9,
490
+ "repetition_penalty": 1.0,
491
+ "num_beams": 1,
492
+ "stream": false,
493
+ "stop_words": [],
494
+ "logprobs": false,
495
+ "top_logprobs": null,
496
+ "ckpt_dir": "/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT",
497
+ "lora_modules": [],
498
+ "tuner_backend": "peft",
499
+ "train_type": "full",
500
+ "adapters": [],
501
+ "external_plugins": [],
502
+ "model_kwargs": {},
503
+ "load_args": false,
504
+ "load_data_args": false,
505
+ "packing": true,
506
+ "packing_length": 16384,
507
+ "packing_num_proc": 1,
508
+ "lazy_tokenize": false,
509
+ "custom_register_path": [
510
+ "custom_model/custom_register.py"
511
+ ],
512
+ "use_hf": true,
513
+ "hub_token": null,
514
+ "ddp_timeout": 18000000,
515
+ "ddp_backend": null,
516
+ "ignore_args_error": false,
517
+ "use_swift_lora": false,
518
+ "freeze_llm": false,
519
+ "freeze_vit": true,
520
+ "freeze_aligner": true,
521
+ "freeze_parameters": [],
522
+ "freeze_parameters_regex": null,
523
+ "freeze_parameters_ratio": 0.0,
524
+ "trainable_parameters": [],
525
+ "trainable_parameters_regex": null,
526
+ "adapter_load": null,
527
+ "target_modules": [
528
+ "all-linear"
529
+ ],
530
+ "target_regex": null,
531
+ "modules_to_save": [],
532
+ "lora_rank": 8,
533
+ "lora_alpha": 32,
534
+ "lora_dropout": 0.05,
535
+ "lora_bias": "none",
536
+ "lora_dtype": null,
537
+ "use_rslora": false,
538
+ "rlhf_type": null,
539
+ "ref_load": null,
540
+ "ref_adapter_load": null,
541
+ "beta": 0.1,
542
+ "rpo_alpha": null,
543
+ "reference_free": false,
544
+ "label_smoothing": 0.0,
545
+ "f_divergence_type": "reverse_kl",
546
+ "loss_type": null,
547
+ "desirable_weight": 1.0,
548
+ "undesirable_weight": 1.0,
549
+ "calculate_KL": null,
550
+ "center_rewards_coefficient": null,
551
+ "generation_batch_size": null,
552
+ "steps_per_generation": null,
553
+ "num_generations": 8,
554
+ "max_completion_length": 512,
555
+ "importance_sampling_level": "token",
556
+ "tau_pos": 1.0,
557
+ "tau_neg": 1.05,
558
+ "epsilon": 0.2,
559
+ "epsilon_high": null,
560
+ "delta": null,
561
+ "use_vllm": true,
562
+ "vllm_mode": null,
563
+ "vllm_enable_prefix_caching": true,
564
+ "vllm_gpu_memory_utilization": 0.9,
565
+ "vllm_tensor_parallel_size": 1,
566
+ "vllm_max_model_len": null,
567
+ "vllm_enforce_eager": false,
568
+ "vllm_limit_mm_per_prompt": null,
569
+ "vllm_disable_cascade_attn": false,
570
+ "vllm_max_num_seqs": null,
571
+ "vllm_mm_processor_cache_gb": null,
572
+ "vllm_engine_kwargs": null,
573
+ "sleep_level": 0,
574
+ "offload_optimizer": false,
575
+ "offload_model": false,
576
+ "offload_bridge": false,
577
+ "vllm_server_base_url": null,
578
+ "vllm_server_host": null,
579
+ "vllm_server_port": [
580
+ 8000
581
+ ],
582
+ "vllm_server_timeout": 240.0,
583
+ "vllm_server_group_port": null,
584
+ "reward_funcs": [],
585
+ "reward_weights": null,
586
+ "cosine_min_len_value_wrong": -0.5,
587
+ "cosine_max_len_value_wrong": 0.0,
588
+ "cosine_min_len_value_correct": 1.0,
589
+ "cosine_max_len_value_correct": 0.5,
590
+ "cosine_max_len": null,
591
+ "repetition_n_grams": 3,
592
+ "repetition_max_penalty": -1.0,
593
+ "soft_max_length": null,
594
+ "soft_cache_length": null,
595
+ "dynamic_sample": false,
596
+ "max_resample_times": 3,
597
+ "overlong_filter": false,
598
+ "scale_rewards": "group",
599
+ "advantage_estimator": "grpo",
600
+ "kl_in_reward": false,
601
+ "wandb_log_unique_prompts": null,
602
+ "log_completions": false,
603
+ "rollout_importance_sampling_mode": null,
604
+ "rollout_importance_sampling_threshold": 2.0,
605
+ "log_rollout_offpolicy_metrics": false,
606
+ "off_policy_sequence_mask_delta": null,
607
+ "reward_model": null,
608
+ "reward_model_plugin": null,
609
+ "sync_ref_model": false,
610
+ "ref_model_sync_steps": 512,
611
+ "ref_model_mixup_alpha": 0.6,
612
+ "async_generate": false,
613
+ "move_model_batches": null,
614
+ "multi_turn_scheduler": null,
615
+ "max_turns": null,
616
+ "completion_length_limit_scope": "per_round",
617
+ "vllm_server_pass_dataset": false,
618
+ "log_entropy": false,
619
+ "top_entropy_quantile": 1.0,
620
+ "num_iterations": 1,
621
+ "check_model": true,
622
+ "padded_vocab_size": 151936,
623
+ "initialize_embedding": false,
624
+ "mlp_padding_free": false,
625
+ "load_safetensors": false,
626
+ "save_safetensors": false,
627
+ "ref_model": null,
628
+ "ref_adapters": [],
629
+ "merge_lora": false,
630
+ "max_shard_size": "5GB",
631
+ "train_dataloader_shuffle": true,
632
+ "dataloader_pin_memory": true,
633
+ "dataloader_persistent_workers": true,
634
+ "dataloader_prefetch_factor": 10,
635
+ "architectures": "Qwen3ForCausalLM",
636
+ "llm_architectures": "Qwen3ForCausalLM",
637
+ "max_epochs": 1,
638
+ "enable_dft_loss": false,
639
+ "enable_channel_loss": false,
640
+ "patch_size": 4,
641
+ "save_strategy": "steps",
642
+ "original_max_position_embeddings": null,
643
+ "partial_rotary_factor": null,
644
+ "use_shared_expert_gate": false,
645
+ "vit_gradient_checkpointing": true,
646
+ "vit_lr": null,
647
+ "aligner_lr": null,
648
+ "gradient_checkpointing_kwargs": null,
649
+ "linear_num_value_heads": null,
650
+ "linear_num_key_heads": null,
651
+ "linear_key_head_dim": null,
652
+ "linear_value_head_dim": null,
653
+ "linear_conv_kernel_dim": null,
654
+ "layer_types": null,
655
+ "mrope_interleaved": false,
656
+ "add_version": true
657
+ }
658
+ }
images/batch-size vs samples.png ADDED
images/batch-size.png ADDED
images/grad-norm vs samples.png ADDED
images/grad-norm.png ADDED
images/iteration-time.png ADDED
images/learning-rate vs samples.png ADDED
images/learning-rate.png ADDED
images/lm loss vs samples.png ADDED
images/lm loss.png ADDED
images/loss-scale vs samples.png ADDED
images/loss-scale.png ADDED
images/mem-allocated-bytes.png ADDED
images/mem-allocated-count.png ADDED
images/mem-max-allocated-bytes.png ADDED
images/mem-reserved-bytes.png ADDED
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 6362
latest_wandb_artifact_path.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tepic/plt
logging.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
runs/events.out.tfevents.1766501418.36fd00e7b21c.40598.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90d27e19cd50fe51da9d4a46c516ac0601bad60f07c9f6a3dcd67b7d7e04bbdd
3
+ size 5384136
wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-23T14:50:19.157923178Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-23T14:50:19.452689851Z","level":"INFO","msg":"stream: created new stream","id":"ogu6y2pr"}
3
+ {"time":"2025-12-23T14:50:19.452982231Z","level":"INFO","msg":"handler: started","stream_id":"ogu6y2pr"}
4
+ {"time":"2025-12-23T14:50:19.453108747Z","level":"INFO","msg":"stream: started","id":"ogu6y2pr"}
5
+ {"time":"2025-12-23T14:50:19.453203475Z","level":"INFO","msg":"writer: started","stream_id":"ogu6y2pr"}
6
+ {"time":"2025-12-23T14:50:19.453240834Z","level":"INFO","msg":"sender: started","stream_id":"ogu6y2pr"}
7
+ {"time":"2025-12-23T15:01:38.571944052Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
8
+ {"time":"2025-12-23T15:01:38.572286Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
9
+ {"time":"2025-12-23T15:12:36.396153575Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
10
+ {"time":"2025-12-23T15:12:36.396527132Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
11
+ {"time":"2025-12-23T15:23:29.324008023Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
12
+ {"time":"2025-12-23T15:23:29.324352406Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
13
+ {"time":"2025-12-23T15:34:29.472612198Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
14
+ {"time":"2025-12-23T15:34:29.47297197Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
15
+ {"time":"2025-12-23T15:45:30.426297295Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
16
+ {"time":"2025-12-23T15:45:30.426633692Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
17
+ {"time":"2025-12-23T15:56:27.157101003Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
18
+ {"time":"2025-12-23T15:56:27.157277527Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
19
+ {"time":"2025-12-23T16:07:25.783471884Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
20
+ {"time":"2025-12-23T16:07:25.78382365Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
21
+ {"time":"2025-12-23T16:18:28.498810989Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
22
+ {"time":"2025-12-23T16:18:28.499138715Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
23
+ {"time":"2025-12-23T16:29:31.444349128Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
24
+ {"time":"2025-12-23T16:29:31.444726336Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
25
+ {"time":"2025-12-23T16:40:32.834491152Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
26
+ {"time":"2025-12-23T16:40:32.834850242Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
27
+ {"time":"2025-12-23T16:51:29.42362482Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
28
+ {"time":"2025-12-23T16:51:29.423959343Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
29
+ {"time":"2025-12-23T17:02:30.793211082Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
30
+ {"time":"2025-12-23T17:02:30.793539822Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
31
+ {"time":"2025-12-23T17:13:33.060094819Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
32
+ {"time":"2025-12-23T17:13:33.060295815Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
33
+ {"time":"2025-12-23T17:24:32.958248966Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
34
+ {"time":"2025-12-23T17:24:32.958591544Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
35
+ {"time":"2025-12-23T17:35:34.024128754Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
36
+ {"time":"2025-12-23T17:35:34.024472732Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
37
+ {"time":"2025-12-23T17:46:30.37232901Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
38
+ {"time":"2025-12-23T17:46:30.372775091Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
39
+ {"time":"2025-12-23T17:57:32.302526996Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
40
+ {"time":"2025-12-23T17:57:32.30288267Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
41
+ {"time":"2025-12-23T18:08:31.048703971Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
42
+ {"time":"2025-12-23T18:08:31.049055711Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
43
+ {"time":"2025-12-23T18:19:30.830781514Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
44
+ {"time":"2025-12-23T18:19:30.831097265Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
45
+ {"time":"2025-12-23T18:30:33.803044335Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
46
+ {"time":"2025-12-23T18:30:33.803369724Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
47
+ {"time":"2025-12-23T18:41:30.279364466Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
48
+ {"time":"2025-12-23T18:41:30.279584052Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
49
+ {"time":"2025-12-23T18:52:31.826751928Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
50
+ {"time":"2025-12-23T18:52:31.827089624Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
51
+ {"time":"2025-12-23T19:03:30.598572678Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
52
+ {"time":"2025-12-23T19:03:30.598924055Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
53
+ {"time":"2025-12-23T19:14:32.616037222Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
54
+ {"time":"2025-12-23T19:14:32.616197778Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
55
+ {"time":"2025-12-23T19:25:32.273259098Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
56
+ {"time":"2025-12-23T19:25:32.273602938Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
57
+ {"time":"2025-12-23T19:36:32.889615697Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
58
+ {"time":"2025-12-23T19:36:32.889787719Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
59
+ {"time":"2025-12-23T19:47:37.77571508Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
60
+ {"time":"2025-12-23T19:47:37.776047293Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
61
+ {"time":"2025-12-23T19:58:39.630164384Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
62
+ {"time":"2025-12-23T19:58:39.630513085Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
63
+ {"time":"2025-12-23T20:09:41.513708638Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
64
+ {"time":"2025-12-23T20:09:41.51405603Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
65
+ {"time":"2025-12-23T20:20:43.088173421Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
66
+ {"time":"2025-12-23T20:20:43.088393383Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
67
+ {"time":"2025-12-23T20:31:43.221503093Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
68
+ {"time":"2025-12-23T20:31:43.221686078Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
69
+ {"time":"2025-12-23T20:42:41.052656025Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
70
+ {"time":"2025-12-23T20:42:41.052839564Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
71
+ {"time":"2025-12-23T20:53:42.488501769Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
72
+ {"time":"2025-12-23T20:53:42.488862537Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
73
+ {"time":"2025-12-23T21:04:41.351069333Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
74
+ {"time":"2025-12-23T21:04:41.35141939Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
75
+ {"time":"2025-12-23T21:15:38.340545121Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
76
+ {"time":"2025-12-23T21:15:38.340904893Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
77
+ {"time":"2025-12-23T21:26:37.510508084Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
78
+ {"time":"2025-12-23T21:26:37.510875914Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
79
+ {"time":"2025-12-23T21:37:36.292525017Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
80
+ {"time":"2025-12-23T21:37:36.292900087Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
81
+ {"time":"2025-12-23T21:48:34.770970878Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
82
+ {"time":"2025-12-23T21:48:34.771291232Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
83
+ {"time":"2025-12-23T21:59:33.737624435Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
84
+ {"time":"2025-12-23T21:59:33.737981585Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
85
+ {"time":"2025-12-23T22:10:39.042583666Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
86
+ {"time":"2025-12-23T22:10:39.042916749Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
87
+ {"time":"2025-12-23T22:21:41.112891247Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
88
+ {"time":"2025-12-23T22:21:41.11321011Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
89
+ {"time":"2025-12-23T22:32:40.580325627Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
90
+ {"time":"2025-12-23T22:32:40.580465985Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
91
+ {"time":"2025-12-23T22:43:36.940776917Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
92
+ {"time":"2025-12-23T22:43:36.941123182Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
93
+ {"time":"2025-12-23T22:54:33.373651394Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
94
+ {"time":"2025-12-23T22:54:33.37402137Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
95
+ {"time":"2025-12-23T23:05:35.325478747Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
96
+ {"time":"2025-12-23T23:05:35.325854894Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
97
+ {"time":"2025-12-23T23:16:36.760827184Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
98
+ {"time":"2025-12-23T23:16:36.760987996Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
99
+ {"time":"2025-12-23T23:27:34.549769482Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
100
+ {"time":"2025-12-23T23:27:34.550122142Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
101
+ {"time":"2025-12-23T23:38:34.786053357Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
102
+ {"time":"2025-12-23T23:38:34.786371337Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
103
+ {"time":"2025-12-23T23:49:33.035575592Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
104
+ {"time":"2025-12-23T23:49:33.035971484Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
105
+ {"time":"2025-12-24T00:00:31.990522355Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
106
+ {"time":"2025-12-24T00:00:31.990721359Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
107
+ {"time":"2025-12-24T00:11:34.140353801Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
108
+ {"time":"2025-12-24T00:11:34.140717429Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
109
+ {"time":"2025-12-24T00:22:34.75725217Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
110
+ {"time":"2025-12-24T00:22:34.757560563Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
111
+ {"time":"2025-12-24T00:33:30.609082628Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
112
+ {"time":"2025-12-24T00:33:30.609405858Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
113
+ {"time":"2025-12-24T00:44:24.763501577Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
114
+ {"time":"2025-12-24T00:44:24.763833284Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
115
+ {"time":"2025-12-24T00:55:22.589601724Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
116
+ {"time":"2025-12-24T00:55:22.589953341Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
117
+ {"time":"2025-12-24T01:06:21.284476721Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
118
+ {"time":"2025-12-24T01:06:21.28472507Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
119
+ {"time":"2025-12-24T01:17:23.427057704Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
120
+ {"time":"2025-12-24T01:17:23.427909522Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
121
+ {"time":"2025-12-24T01:28:21.205072065Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
122
+ {"time":"2025-12-24T01:28:21.205400369Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
123
+ {"time":"2025-12-24T01:39:23.227311862Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
124
+ {"time":"2025-12-24T01:39:23.227635463Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
125
+ {"time":"2025-12-24T01:50:21.711610423Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
126
+ {"time":"2025-12-24T01:50:21.711992483Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
127
+ {"time":"2025-12-24T02:01:26.168432549Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
128
+ {"time":"2025-12-24T02:01:26.169382559Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
129
+ {"time":"2025-12-24T02:12:27.119904169Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
130
+ {"time":"2025-12-24T02:12:27.12021425Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
131
+ {"time":"2025-12-24T02:23:29.280347891Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
132
+ {"time":"2025-12-24T02:23:29.280692788Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
133
+ {"time":"2025-12-24T02:30:17.524116078Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
134
+ {"time":"2025-12-24T02:30:17.524323977Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
135
+ {"time":"2025-12-24T02:30:17.911709787Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
136
+ {"time":"2025-12-24T02:30:18.150633972Z","level":"INFO","msg":"handler: operation stats","stats":{}}
137
+ {"time":"2025-12-24T02:30:18.155591363Z","level":"INFO","msg":"stream: closing","id":"ogu6y2pr"}
138
+ {"time":"2025-12-24T02:30:18.1556065Z","level":"INFO","msg":"handler: closed","stream_id":"ogu6y2pr"}
139
+ {"time":"2025-12-24T02:30:18.15573719Z","level":"INFO","msg":"sender: closed","stream_id":"ogu6y2pr"}
140
+ {"time":"2025-12-24T02:30:18.15575783Z","level":"INFO","msg":"stream: closed","id":"ogu6y2pr"}
wandb/wandb/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Configure stats pid to 40598
3
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Loading settings from /workspace/halcyon-recipe2/wandb/settings
5
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug.log
7
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug-internal.log
8
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-23 14:50:18,908 INFO MainThread:40598 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 28, 'encoder_num_layers': 28, 'decoder_num_layers': None, 'hidden_size': 1024, 'ffn_hidden_size': 3072, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 8, 'max_position_embeddings': 32768, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 1000000, 'rotary_percent': 1.0, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': False, 'multi_latent_attention': False, 'mtp_num_layers': None, 'mtp_loss_scaling_factor': 0.1, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 2, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': 'full', 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': 'uniform', 'recompute_num_layers': 1, 'recompute_modules': ['core_attn'], 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': True, 'train_sync_interval': None, 'train_iters': 6350, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': '/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/runs', 'masked_softmax_fusion': True, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'use_fused_weighted_squared_relu': False, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'rope_type': None, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': False, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'cyclic', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': False, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 42, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'embedding_init_method_std': None, 'init_method_xavier_uniform': False, 'lr': 0.0001, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 'lr_wsd_decay_iters': None, 'lr_warmup_fraction': 0.05, 'lr_warmup_iters': 0, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 3e-06, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': '/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727', 'save_interval': 100, 'save_retain_interval': None, 'no_save_optim': None, 'no_save_rng': None, 'load': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT', 'no_load_optim': None, 'load_main_params_from_ckpt': None, 'no_load_rng': None, 'strict_fsdp_dtensor_load': True, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': True, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': False, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': True, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': True, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': None, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 300000, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': True, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': True, 'nccl_ub': False, 'use_sharp': False, 'sharp_enabled_group': None, 'use_megatron_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache': False, 'enable_full_sharding_in_hsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'full_validation': False, 'multiple_validation_sets': False, 'eval_iters': -1, 'eval_interval': 100, 'test_mode': False, 'skip_train': False, 'data_path': None, 'split': None, 'train_data_path': None, 'valid_data_path': None, 'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 16384, 'encoder_seq_length': 16384, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 32, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'create_attention_mask_in_dataloader': True, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': None, 'padded_vocab_size': 151936, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': None, 'tokenizer_model': None, 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 1, 'expert_tensor_parallel_size': 1, 'num_experts': None, 'moe_layer_freq': 1, 'moe_ffn_hidden_size': None, 'moe_shared_expert_intermediate_size': None, 'moe_shared_expert_overlap': False, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'moe_router_fusion': False, 'moe_router_score_function': 'softmax', 'moe_router_topk': 2, 'moe_router_pre_softmax': False, 'moe_router_num_groups': None, 'moe_router_group_topk': None, 'moe_router_topk_scaling_factor': None, 'moe_router_enable_expert_bias': False, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': None, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': None, 'moe_pad_expert_input_to_capacity': False, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'overlap_moe_expert_parallel_comm': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 1.0, 'mscale': 1.0, 'mscale_all_dim': 0.0, 'cache_mla_latents': False, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': False, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 50, 'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': False, 'wandb_project': 'plt', 'wandb_exp_name': 'plt_1', 'wandb_save_dir': '', 'logging_level': 20, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'inference_dynamic_batching_num_cuda_graphs': 16, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': True, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'validate_results', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 8, 'model_dir': '/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', 'is_multimodal': False, 'hf_model_type': 'qwen3_plt', 'use_ray': False, 'ray_exp_name': None, 'device_groups': None, 'model': 'Qwen/Qwen3-0.6B-Base', 'model_type': 'qwen3_plt', 'model_revision': None, 'task_type': 'causal_lm', 'torch_dtype': torch.bfloat16, 'attn_impl': None, 'new_special_tokens': [], 'num_labels': None, 'problem_type': None, 'rope_scaling': None, 'device_map': None, 'max_memory': {}, 'max_model_len': None, 'local_repo_path': None, 'init_strategy': None, 'template': 'qwen3', 'system': None, 'max_length': 16384, 'truncation_strategy': 'right', 'max_pixels': None, 'agent_template': None, 'norm_bbox': None, 'use_chat_template': False, 'padding_free': True, 'padding_side': 'right', 'sequence_parallel_size': 1, 'response_prefix': None, 'template_backend': 'swift', 'dataset': [], 'val_dataset': [], 'cached_dataset': ['/workspace/2of3'], 'cached_val_dataset': [], 'split_dataset_ratio': 0.0, 'data_seed': 42, 'dataset_num_proc': 32, 'load_from_cache_file': False, 'dataset_shuffle': True, 'val_dataset_shuffle': False, 'streaming': False, 'interleave_prob': None, 'stopping_strategy': 'first_exhausted', 'shuffle_buffer_size': 1000, 'download_mode': 'reuse_dataset_if_exists', 'columns': {}, 'strict': False, 'remove_unused_columns': True, 'model_name': None, 'model_author': None, 'custom_dataset_info': [], 'quant_method': None, 'quant_bits': None, 'hqq_axis': None, 'bnb_4bit_compute_dtype': torch.bfloat16, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_quant_storage': None, 'max_new_tokens': None, 'temperature': None, 'top_k': 50, 'top_p': 0.9, 'repetition_penalty': 1.0, 'num_beams': 1, 'stream': False, 'stop_words': [], 'logprobs': False, 'top_logprobs': None, 'ckpt_dir': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT', 'lora_modules': [], 'tuner_backend': 'peft', 'train_type': 'full', 'adapters': [], 'external_plugins': [], 'model_kwargs': {}, 'load_args': False, 'load_data_args': False, 'packing': True, 'packing_length': 16384, 'packing_num_proc': 1, 'lazy_tokenize': False, 'custom_register_path': ['custom_model/custom_register.py'], 'use_hf': True, 'hub_token': None, 'ddp_timeout': 18000000, 'ddp_backend': None, 'ignore_args_error': False, 'use_swift_lora': False, 'freeze_llm': False, 'freeze_vit': True, 'freeze_aligner': True, 'freeze_parameters': [], 'freeze_parameters_regex': None, 'freeze_parameters_ratio': 0.0, 'trainable_parameters': [], 'trainable_parameters_regex': None, 'adapter_load': None, 'target_modules': ['all-linear'], 'target_regex': None, 'modules_to_save': [], 'lora_rank': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_bias': 'none', 'lora_dtype': None, 'use_rslora': False, 'rlhf_type': None, 'ref_load': None, 'ref_adapter_load': None, 'beta': 0.1, 'rpo_alpha': None, 'reference_free': False, 'label_smoothing': 0.0, 'f_divergence_type': 'reverse_kl', 'loss_type': None, 'desirable_weight': 1.0, 'undesirable_weight': 1.0, 'calculate_KL': None, 'center_rewards_coefficient': None, 'generation_batch_size': None, 'steps_per_generation': None, 'num_generations': 8, 'max_completion_length': 512, 'importance_sampling_level': 'token', 'tau_pos': 1.0, 'tau_neg': 1.05, 'epsilon': 0.2, 'epsilon_high': None, 'delta': None, 'use_vllm': True, 'vllm_mode': None, 'vllm_enable_prefix_caching': True, 'vllm_gpu_memory_utilization': 0.9, 'vllm_tensor_parallel_size': 1, 'vllm_max_model_len': None, 'vllm_enforce_eager': False, 'vllm_limit_mm_per_prompt': None, 'vllm_disable_cascade_attn': False, 'vllm_max_num_seqs': None, 'vllm_mm_processor_cache_gb': None, 'vllm_engine_kwargs': None, 'sleep_level': 0, 'offload_optimizer': False, 'offload_model': False, 'offload_bridge': False, 'vllm_server_base_url': None, 'vllm_server_host': None, 'vllm_server_port': [8000], 'vllm_server_timeout': 240.0, 'vllm_server_group_port': None, 'reward_funcs': [], 'reward_weights': None, 'cosine_min_len_value_wrong': -0.5, 'cosine_max_len_value_wrong': 0.0, 'cosine_min_len_value_correct': 1.0, 'cosine_max_len_value_correct': 0.5, 'cosine_max_len': None, 'repetition_n_grams': 3, 'repetition_max_penalty': -1.0, 'soft_max_length': None, 'soft_cache_length': None, 'dynamic_sample': False, 'max_resample_times': 3, 'overlong_filter': False, 'scale_rewards': 'group', 'advantage_estimator': 'grpo', 'kl_in_reward': False, 'wandb_log_unique_prompts': None, 'log_completions': False, 'rollout_importance_sampling_mode': None, 'rollout_importance_sampling_threshold': 2.0, 'log_rollout_offpolicy_metrics': False, 'off_policy_sequence_mask_delta': None, 'reward_model': None, 'reward_model_plugin': None, 'sync_ref_model': False, 'ref_model_sync_steps': 512, 'ref_model_mixup_alpha': 0.6, 'async_generate': False, 'move_model_batches': None, 'multi_turn_scheduler': None, 'max_turns': None, 'completion_length_limit_scope': 'per_round', 'vllm_server_pass_dataset': False, 'log_entropy': False, 'top_entropy_quantile': 1.0, 'num_iterations': 1, 'check_model': True, 'initialize_embedding': False, 'mlp_padding_free': False, 'load_safetensors': False, 'save_safetensors': False, 'ref_model': None, 'ref_adapters': [], 'merge_lora': False, 'max_shard_size': '5GB', 'train_dataloader_shuffle': True, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'dataloader_prefetch_factor': 10, 'architectures': 'Qwen3ForCausalLM', 'llm_architectures': 'Qwen3ForCausalLM', 'max_epochs': 1, 'enable_dft_loss': False, 'enable_channel_loss': False, 'patch_size': 4, 'save_strategy': 'steps', 'original_max_position_embeddings': None, 'partial_rotary_factor': None, 'use_shared_expert_gate': False, 'vit_gradient_checkpointing': True, 'vit_lr': None, 'aligner_lr': None, 'gradient_checkpointing_kwargs': None, 'linear_num_value_heads': None, 'linear_num_key_heads': None, 'linear_key_head_dim': None, 'linear_value_head_dim': None, 'linear_conv_kernel_dim': None, 'layer_types': None, 'mrope_interleaved': False, 'add_version': True, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, '_wandb': {}}
11
+ 2025-12-23 14:50:18,908 INFO MainThread:40598 [wandb_init.py:init():889] starting backend
12
+ 2025-12-23 14:50:19,151 INFO MainThread:40598 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-23 14:50:19,154 INFO MainThread:40598 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-23 14:50:19,157 INFO MainThread:40598 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-23 14:50:19,163 INFO MainThread:40598 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-23 14:50:19,679 INFO MainThread:40598 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-23 14:50:19,775 INFO MainThread:40598 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-24 02:30:17,368 INFO MainThread:40598 [wandb_run.py:_finish():2287] finishing run tepic/plt/ogu6y2pr
23
+ 2025-12-24 02:30:17,369 INFO MainThread:40598 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
24
+ 2025-12-24 02:30:17,369 INFO MainThread:40598 [wandb_run.py:_restore():2468] restore
25
+ 2025-12-24 02:30:17,369 INFO MainThread:40598 [wandb_run.py:_restore():2474] restore done
26
+ 2025-12-24 02:30:18,154 INFO MainThread:40598 [wandb_run.py:_footer_sync_info():3862] logging synced files
wandb/wandb/run-20251223_145018-ogu6y2pr/files/config.yaml ADDED
@@ -0,0 +1,1780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.23.1
4
+ e:
5
+ 5txfscc07xx6ws9ptnhwj8isjczvektf:
6
+ args:
7
+ - --seed
8
+ - "42"
9
+ - --micro-batch-size
10
+ - "2"
11
+ - --global-batch-size
12
+ - "256"
13
+ - --recompute-granularity
14
+ - full
15
+ - --recompute-method
16
+ - uniform
17
+ - --recompute-num-layers
18
+ - "1"
19
+ - --recompute-modules
20
+ - core_attn
21
+ - --train-iters
22
+ - "6350"
23
+ - --log-interval
24
+ - "1"
25
+ - --tensorboard-dir
26
+ - /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/runs
27
+ - --cross-entropy-loss-fusion
28
+ - --cross-entropy-fusion-impl
29
+ - native
30
+ - --calculate-per-token-loss
31
+ - --attention-backend
32
+ - flash
33
+ - --optimizer
34
+ - adam
35
+ - --optimizer-offload-fraction
36
+ - "1.0"
37
+ - --use-precision-aware-optimizer
38
+ - --main-grads-dtype
39
+ - fp32
40
+ - --main-params-dtype
41
+ - fp32
42
+ - --exp-avg-dtype
43
+ - fp32
44
+ - --exp-avg-sq-dtype
45
+ - fp32
46
+ - --dataloader-type
47
+ - cyclic
48
+ - --manual-gc-interval
49
+ - "0"
50
+ - --lr
51
+ - "0.0001"
52
+ - --lr-decay-style
53
+ - cosine
54
+ - --lr-warmup-iters
55
+ - "0"
56
+ - --lr-warmup-fraction
57
+ - "0.05"
58
+ - --min-lr
59
+ - "3e-06"
60
+ - --weight-decay
61
+ - "0.1"
62
+ - --clip-grad
63
+ - "1.0"
64
+ - --adam-beta1
65
+ - "0.9"
66
+ - --adam-beta2
67
+ - "0.95"
68
+ - --adam-eps
69
+ - "1e-08"
70
+ - --sgd-momentum
71
+ - "0.9"
72
+ - --save
73
+ - /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727
74
+ - --save-interval
75
+ - "100"
76
+ - --load
77
+ - /workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT
78
+ - --finetune
79
+ - --ckpt-format
80
+ - torch_dist
81
+ - --no-initialization
82
+ - --auto-detect-ckpt-format
83
+ - --exit-on-missing-checkpoint
84
+ - --distributed-backend
85
+ - nccl
86
+ - --local-rank
87
+ - "7"
88
+ - --use-distributed-optimizer
89
+ - --tensor-model-parallel-size
90
+ - "1"
91
+ - --pipeline-model-parallel-size
92
+ - "1"
93
+ - --context-parallel-size
94
+ - "1"
95
+ - --overlap-grad-reduce
96
+ - --overlap-param-gather
97
+ - --distributed-timeout-minutes
98
+ - "300000"
99
+ - --num-layers
100
+ - "28"
101
+ - --hidden-size
102
+ - "1024"
103
+ - --ffn-hidden-size
104
+ - "3072"
105
+ - --num-attention-heads
106
+ - "16"
107
+ - --group-query-attention
108
+ - --num-query-groups
109
+ - "8"
110
+ - --max-position-embeddings
111
+ - "32768"
112
+ - --position-embedding-type
113
+ - rope
114
+ - --rotary-base
115
+ - "1000000"
116
+ - --rotary-percent
117
+ - "1.0"
118
+ - --normalization
119
+ - RMSNorm
120
+ - --norm-epsilon
121
+ - "1e-06"
122
+ - --swiglu
123
+ - --disable-bias-linear
124
+ - --attention-dropout
125
+ - "0.0"
126
+ - --hidden-dropout
127
+ - "0.0"
128
+ - --kv-channels
129
+ - "128"
130
+ - --qk-layernorm
131
+ - --transformer-impl
132
+ - transformer_engine
133
+ - --moe-layer-freq
134
+ - "1"
135
+ - --moe-router-topk
136
+ - "2"
137
+ - --moe-router-dtype
138
+ - fp32
139
+ - --moe-router-score-function
140
+ - softmax
141
+ - --moe-router-load-balancing-type
142
+ - aux_loss
143
+ - --expert-model-parallel-size
144
+ - "1"
145
+ - --expert-tensor-parallel-size
146
+ - "1"
147
+ - --moe-token-dispatcher-type
148
+ - alltoall
149
+ - --moe-grouped-gemm
150
+ - --moe-aux-loss-coeff
151
+ - "0.0"
152
+ - --moe-token-drop-policy
153
+ - probs
154
+ - --kv-lora-rank
155
+ - "32"
156
+ - --qk-head-dim
157
+ - "128"
158
+ - --qk-pos-emb-head-dim
159
+ - "64"
160
+ - --mtp-loss-scaling-factor
161
+ - "0.1"
162
+ - --fp8-recipe
163
+ - delayed
164
+ - --fp8-amax-history-len
165
+ - "1024"
166
+ - --fp8-amax-compute-algo
167
+ - max
168
+ - --bf16
169
+ - --attention-softmax-in-fp32
170
+ - --tensorboard-log-interval
171
+ - "1"
172
+ - --tensorboard-queue-size
173
+ - "50"
174
+ - --log-timers-to-tensorboard
175
+ - --log-validation-ppl-to-tensorboard
176
+ - --log-memory-to-tensorboard
177
+ - --logging-level
178
+ - "20"
179
+ - --wandb-project
180
+ - plt
181
+ - --wandb-exp-name
182
+ - plt_1
183
+ - --eval-iters
184
+ - "-1"
185
+ - --eval-interval
186
+ - "100"
187
+ - --seq-length
188
+ - "16384"
189
+ - --num-workers
190
+ - "32"
191
+ codePath: swift/cli/_megatron/pt.py
192
+ codePathLocal: swift/cli/_megatron/pt.py
193
+ cpu_count: 72
194
+ cpu_count_logical: 144
195
+ cudaVersion: "13.0"
196
+ disk:
197
+ /:
198
+ total: "7669363507200"
199
+ used: "923719221248"
200
201
+ executable: /venv/main/bin/python3.12
202
+ git:
203
+ commit: ea7cc214b68fb511dd83bff83a504b7f43053577
204
+ remote: https://github.com/weak-kajuma/halcyon-recipe2.git
205
+ gpu: NVIDIA GeForce RTX 5090
206
+ gpu_count: 8
207
+ gpu_nvidia:
208
+ - architecture: Blackwell
209
+ cudaCores: 21760
210
+ memoryTotal: "34190917632"
211
+ name: NVIDIA GeForce RTX 5090
212
+ uuid: GPU-5d40e56e-9cf1-0a97-080a-30624a8f6da3
213
+ - architecture: Blackwell
214
+ cudaCores: 21760
215
+ memoryTotal: "34190917632"
216
+ name: NVIDIA GeForce RTX 5090
217
+ uuid: GPU-23ca8669-46fc-19eb-348b-e51e591c150d
218
+ - architecture: Blackwell
219
+ cudaCores: 21760
220
+ memoryTotal: "34190917632"
221
+ name: NVIDIA GeForce RTX 5090
222
+ uuid: GPU-c4c1ca99-b237-b12b-43fd-7c0b428ed152
223
+ - architecture: Blackwell
224
+ cudaCores: 21760
225
+ memoryTotal: "34190917632"
226
+ name: NVIDIA GeForce RTX 5090
227
+ uuid: GPU-d48e64fd-956c-1ce4-4e95-b9d198ba26e9
228
+ - architecture: Blackwell
229
+ cudaCores: 21760
230
+ memoryTotal: "34190917632"
231
+ name: NVIDIA GeForce RTX 5090
232
+ uuid: GPU-29d31f97-dff9-6078-7bf6-d8fc65ada1b7
233
+ - architecture: Blackwell
234
+ cudaCores: 21760
235
+ memoryTotal: "34190917632"
236
+ name: NVIDIA GeForce RTX 5090
237
+ uuid: GPU-ed004a01-be7c-9fc0-6742-ac7f7a0bea49
238
+ - architecture: Blackwell
239
+ cudaCores: 21760
240
+ memoryTotal: "34190917632"
241
+ name: NVIDIA GeForce RTX 5090
242
+ uuid: GPU-56cdc53f-360e-a64f-2cd5-2ba3daaf5a7b
243
+ - architecture: Blackwell
244
+ cudaCores: 21760
245
+ memoryTotal: "34190917632"
246
+ name: NVIDIA GeForce RTX 5090
247
+ uuid: GPU-aa4a1a25-49c1-62ec-3a38-070d6c7912ef
248
+ host: 36fd00e7b21c
249
+ memory:
250
+ total: "540643262464"
251
+ os: Linux-6.8.0-58-generic-x86_64-with-glibc2.39
252
+ program: /workspace/halcyon-recipe2/swift/cli/_megatron/pt.py
253
+ python: CPython 3.12.12
254
+ root: /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/wandb
255
+ startedAt: "2025-12-23T14:50:18.904388Z"
256
+ writerId: 5txfscc07xx6ws9ptnhwj8isjczvektf
257
+ m: []
258
+ python_version: 3.12.12
259
+ t:
260
+ "1":
261
+ - 1
262
+ - 11
263
+ - 41
264
+ - 49
265
+ - 51
266
+ - 71
267
+ - 84
268
+ - 98
269
+ - 105
270
+ "2":
271
+ - 1
272
+ - 11
273
+ - 41
274
+ - 49
275
+ - 51
276
+ - 71
277
+ - 84
278
+ - 98
279
+ - 105
280
+ "3":
281
+ - 2
282
+ - 13
283
+ - 16
284
+ - 61
285
+ "4": 3.12.12
286
+ "5": 0.23.1
287
+ "6": 4.57.3
288
+ "12": 0.23.1
289
+ "13": linux-x86_64
290
+ account_for_embedding_in_pipeline_split:
291
+ value: false
292
+ account_for_loss_in_pipeline_split:
293
+ value: false
294
+ accumulate_allreduce_grads_in_fp32:
295
+ value: true
296
+ adam_beta1:
297
+ value: 0.9
298
+ adam_beta2:
299
+ value: 0.95
300
+ adam_eps:
301
+ value: 1e-08
302
+ adapter_load:
303
+ value: null
304
+ adapters:
305
+ value: []
306
+ add_bias_linear:
307
+ value: false
308
+ add_position_embedding:
309
+ value: true
310
+ add_qkv_bias:
311
+ value: false
312
+ add_version:
313
+ value: true
314
+ adlr_autoresume:
315
+ value: false
316
+ adlr_autoresume_interval:
317
+ value: 1000
318
+ advantage_estimator:
319
+ value: grpo
320
+ agent_template:
321
+ value: null
322
+ align_grad_reduce:
323
+ value: true
324
+ align_param_gather:
325
+ value: false
326
+ aligner_lr:
327
+ value: null
328
+ app_tag_run_name:
329
+ value: null
330
+ app_tag_run_version:
331
+ value: 0.0.0
332
+ apply_layernorm_1p:
333
+ value: false
334
+ apply_query_key_layer_scaling:
335
+ value: false
336
+ apply_residual_connection_post_layernorm:
337
+ value: false
338
+ apply_rope_fusion:
339
+ value: true
340
+ architectures:
341
+ value: Qwen3ForCausalLM
342
+ async_generate:
343
+ value: false
344
+ async_save:
345
+ value: null
346
+ async_tensor_model_parallel_allreduce:
347
+ value: true
348
+ attention_backend:
349
+ value: flash
350
+ attention_dropout:
351
+ value: 0
352
+ attention_softmax_in_fp32:
353
+ value: true
354
+ attn_impl:
355
+ value: null
356
+ auto_detect_ckpt_format:
357
+ value: true
358
+ barrier_with_L1_time:
359
+ value: true
360
+ bert_binary_head:
361
+ value: true
362
+ bert_embedder_type:
363
+ value: megatron
364
+ bert_load:
365
+ value: null
366
+ beta:
367
+ value: 0.1
368
+ bf16:
369
+ value: true
370
+ bias_dropout_fusion:
371
+ value: true
372
+ bias_gelu_fusion:
373
+ value: false
374
+ bias_swiglu_fusion:
375
+ value: true
376
+ biencoder_projection_dim:
377
+ value: 0
378
+ biencoder_shared_query_context_model:
379
+ value: false
380
+ block_data_path:
381
+ value: null
382
+ bnb_4bit_compute_dtype:
383
+ value: torch.bfloat16
384
+ bnb_4bit_quant_storage:
385
+ value: null
386
+ bnb_4bit_quant_type:
387
+ value: nf4
388
+ bnb_4bit_use_double_quant:
389
+ value: true
390
+ cache_mla_latents:
391
+ value: false
392
+ cached_dataset:
393
+ value:
394
+ - /workspace/2of3
395
+ cached_val_dataset:
396
+ value: []
397
+ calc_ft_timeouts:
398
+ value: false
399
+ calculate_KL:
400
+ value: null
401
+ calculate_per_token_loss:
402
+ value: true
403
+ center_rewards_coefficient:
404
+ value: null
405
+ check_for_large_grads:
406
+ value: false
407
+ check_for_nan_in_loss_and_grad:
408
+ value: true
409
+ check_for_spiky_loss:
410
+ value: false
411
+ check_model:
412
+ value: true
413
+ check_weight_hash_across_dp_replicas_interval:
414
+ value: null
415
+ ckpt_assume_constant_structure:
416
+ value: false
417
+ ckpt_convert_format:
418
+ value: null
419
+ ckpt_convert_save:
420
+ value: null
421
+ ckpt_convert_update_legacy_dist_opt_format:
422
+ value: false
423
+ ckpt_dir:
424
+ value: /workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT
425
+ ckpt_format:
426
+ value: torch_dist
427
+ ckpt_fully_parallel_load:
428
+ value: false
429
+ ckpt_fully_parallel_save:
430
+ value: true
431
+ ckpt_fully_parallel_save_deprecated:
432
+ value: false
433
+ ckpt_step:
434
+ value: null
435
+ classes_fraction:
436
+ value: 1
437
+ clip_grad:
438
+ value: 1
439
+ clone_scatter_output_in_embedding:
440
+ value: true
441
+ completion_length_limit_scope:
442
+ value: per_round
443
+ config_logger_dir:
444
+ value: ""
445
+ consumed_train_samples:
446
+ value: 0
447
+ consumed_valid_samples:
448
+ value: 0
449
+ context_parallel_size:
450
+ value: 1
451
+ cosine_max_len:
452
+ value: null
453
+ cosine_max_len_value_correct:
454
+ value: 0.5
455
+ cosine_max_len_value_wrong:
456
+ value: 0
457
+ cosine_min_len_value_correct:
458
+ value: 1
459
+ cosine_min_len_value_wrong:
460
+ value: -0.5
461
+ cp_comm_type:
462
+ value:
463
+ - p2p
464
+ create_attention_mask_in_dataloader:
465
+ value: true
466
+ cross_entropy_fusion_impl:
467
+ value: native
468
+ cross_entropy_loss_fusion:
469
+ value: true
470
+ cuda_graph_scope:
471
+ value: full
472
+ cuda_graph_warmup_steps:
473
+ value: 3
474
+ custom_dataset_info:
475
+ value: []
476
+ custom_register_path:
477
+ value:
478
+ - custom_model/custom_register.py
479
+ data_args_path:
480
+ value: null
481
+ data_cache_path:
482
+ value: null
483
+ data_parallel_random_init:
484
+ value: false
485
+ data_parallel_sharding_strategy:
486
+ value: no_shard
487
+ data_parallel_size:
488
+ value: 8
489
+ data_path:
490
+ value: null
491
+ data_per_class_fraction:
492
+ value: 1
493
+ data_seed:
494
+ value: 42
495
+ data_sharding:
496
+ value: true
497
+ dataloader_persistent_workers:
498
+ value: true
499
+ dataloader_pin_memory:
500
+ value: true
501
+ dataloader_prefetch_factor:
502
+ value: 10
503
+ dataloader_type:
504
+ value: cyclic
505
+ dataset:
506
+ value: []
507
+ dataset_num_proc:
508
+ value: 32
509
+ dataset_shuffle:
510
+ value: true
511
+ ddp_average_in_collective:
512
+ value: false
513
+ ddp_backend:
514
+ value: null
515
+ ddp_bucket_size:
516
+ value: null
517
+ ddp_num_buckets:
518
+ value: null
519
+ ddp_pad_buckets_for_high_nccl_busbw:
520
+ value: false
521
+ ddp_timeout:
522
+ value: 18000000
523
+ decoder_first_pipeline_num_layers:
524
+ value: null
525
+ decoder_last_pipeline_num_layers:
526
+ value: null
527
+ decoder_num_layers:
528
+ value: null
529
+ decoder_seq_length:
530
+ value: null
531
+ decoupled_lr:
532
+ value: null
533
+ decoupled_min_lr:
534
+ value: null
535
+ decrease_batch_size_if_needed:
536
+ value: false
537
+ defer_embedding_wgrad_compute:
538
+ value: false
539
+ delay_wgrad_compute:
540
+ value: false
541
+ delta:
542
+ value: null
543
+ deprecated_use_mcore_models:
544
+ value: false
545
+ desirable_weight:
546
+ value: 1
547
+ deterministic_mode:
548
+ value: false
549
+ device_groups:
550
+ value: null
551
+ device_map:
552
+ value: null
553
+ dino_bottleneck_size:
554
+ value: 256
555
+ dino_freeze_last_layer:
556
+ value: 1
557
+ dino_head_hidden_size:
558
+ value: 2048
559
+ dino_local_crops_number:
560
+ value: 10
561
+ dino_local_img_size:
562
+ value: 96
563
+ dino_norm_last_layer:
564
+ value: false
565
+ dino_teacher_temp:
566
+ value: 0.07
567
+ dino_warmup_teacher_temp:
568
+ value: 0.04
569
+ dino_warmup_teacher_temp_epochs:
570
+ value: 30
571
+ disable_bf16_reduced_precision_matmul:
572
+ value: false
573
+ disable_mamba_mem_eff_path:
574
+ value: false
575
+ disable_straggler_on_startup:
576
+ value: false
577
+ dist_ckpt_format_deprecated:
578
+ value: null
579
+ dist_ckpt_strictness:
580
+ value: assume_ok_unexpected
581
+ distribute_saved_activations:
582
+ value: false
583
+ distributed_backend:
584
+ value: nccl
585
+ distributed_timeout_minutes:
586
+ value: 300000
587
+ download_mode:
588
+ value: reuse_dataset_if_exists
589
+ dynamic_sample:
590
+ value: false
591
+ embedding_init_method_std:
592
+ value: null
593
+ embedding_path:
594
+ value: null
595
+ empty_unused_memory_level:
596
+ value: 0
597
+ enable_channel_loss:
598
+ value: false
599
+ enable_cuda_graph:
600
+ value: false
601
+ enable_dft_loss:
602
+ value: false
603
+ enable_experimental:
604
+ value: false
605
+ enable_ft_package:
606
+ value: false
607
+ enable_full_sharding_in_hsdp:
608
+ value: false
609
+ enable_gloo_process_groups:
610
+ value: true
611
+ enable_msc:
612
+ value: true
613
+ enable_one_logger:
614
+ value: true
615
+ encoder_num_layers:
616
+ value: 28
617
+ encoder_seq_length:
618
+ value: 16384
619
+ end_weight_decay:
620
+ value: 0.1
621
+ eod_mask_loss:
622
+ value: false
623
+ epsilon:
624
+ value: 0.2
625
+ epsilon_high:
626
+ value: null
627
+ error_injection_rate:
628
+ value: 0
629
+ error_injection_type:
630
+ value: transient_error
631
+ eval_interval:
632
+ value: 100
633
+ eval_iters:
634
+ value: -1
635
+ evidence_data_path:
636
+ value: null
637
+ exit_duration_in_mins:
638
+ value: null
639
+ exit_interval:
640
+ value: null
641
+ exit_on_missing_checkpoint:
642
+ value: true
643
+ exit_signal_handler:
644
+ value: false
645
+ exp_avg_dtype:
646
+ value: torch.float32
647
+ exp_avg_sq_dtype:
648
+ value: torch.float32
649
+ expert_model_parallel_size:
650
+ value: 1
651
+ expert_tensor_parallel_size:
652
+ value: 1
653
+ external_cuda_graph:
654
+ value: false
655
+ external_plugins:
656
+ value: []
657
+ f_divergence_type:
658
+ value: reverse_kl
659
+ ffn_hidden_size:
660
+ value: 3072
661
+ finetune:
662
+ value: true
663
+ first_last_layers_bf16:
664
+ value: false
665
+ flash_decode:
666
+ value: false
667
+ fp8:
668
+ value: null
669
+ fp8_amax_compute_algo:
670
+ value: max
671
+ fp8_amax_history_len:
672
+ value: 1024
673
+ fp8_interval:
674
+ value: 1
675
+ fp8_margin:
676
+ value: 0
677
+ fp8_param_gather:
678
+ value: false
679
+ fp8_recipe:
680
+ value: delayed
681
+ fp8_wgrad:
682
+ value: true
683
+ fp16:
684
+ value: false
685
+ fp16_lm_cross_entropy:
686
+ value: false
687
+ fp32_residual_connection:
688
+ value: false
689
+ freeze_aligner:
690
+ value: true
691
+ freeze_llm:
692
+ value: false
693
+ freeze_parameters:
694
+ value: []
695
+ freeze_parameters_ratio:
696
+ value: 0
697
+ freeze_parameters_regex:
698
+ value: null
699
+ freeze_vit:
700
+ value: true
701
+ fsdp_double_buffer:
702
+ value: false
703
+ full_validation:
704
+ value: false
705
+ generation_batch_size:
706
+ value: null
707
+ global_batch_size:
708
+ value: 256
709
+ grad_reduce_in_bf16:
710
+ value: false
711
+ gradient_accumulation_fusion:
712
+ value: true
713
+ gradient_checkpointing_kwargs:
714
+ value: null
715
+ gradient_reduce_div_fusion:
716
+ value: true
717
+ group_query_attention:
718
+ value: true
719
+ head_lr_mult:
720
+ value: 1
721
+ heterogeneous_layers_config_encoded_json:
722
+ value: null
723
+ heterogeneous_layers_config_path:
724
+ value: null
725
+ hf_model_type:
726
+ value: qwen3_plt
727
+ hidden_dropout:
728
+ value: 0
729
+ hidden_size:
730
+ value: 1024
731
+ hierarchical_context_parallel_sizes:
732
+ value: null
733
+ high_priority_stream_groups:
734
+ value: []
735
+ hqq_axis:
736
+ value: null
737
+ hub_token:
738
+ value: null
739
+ hybrid_attention_ratio:
740
+ value: 0
741
+ hybrid_mlp_ratio:
742
+ value: 0
743
+ hybrid_override_pattern:
744
+ value: null
745
+ hysteresis:
746
+ value: 2
747
+ ict_head_size:
748
+ value: null
749
+ ict_load:
750
+ value: null
751
+ ignore_args_error:
752
+ value: false
753
+ img_h:
754
+ value: 224
755
+ img_w:
756
+ value: 224
757
+ importance_sampling_level:
758
+ value: token
759
+ indexer_batch_size:
760
+ value: 128
761
+ indexer_log_interval:
762
+ value: 1000
763
+ inference_batch_times_seqlen_threshold:
764
+ value: -1
765
+ inference_dynamic_batching:
766
+ value: false
767
+ inference_dynamic_batching_buffer_guaranteed_fraction:
768
+ value: 0.2
769
+ inference_dynamic_batching_buffer_overflow_factor:
770
+ value: null
771
+ inference_dynamic_batching_buffer_size_gb:
772
+ value: 40
773
+ inference_dynamic_batching_chunk_size:
774
+ value: 256
775
+ inference_dynamic_batching_max_requests_override:
776
+ value: null
777
+ inference_dynamic_batching_max_tokens_override:
778
+ value: null
779
+ inference_dynamic_batching_num_cuda_graphs:
780
+ value: 16
781
+ inference_max_batch_size:
782
+ value: 8
783
+ inference_max_seq_length:
784
+ value: 2560
785
+ inference_rng_tracker:
786
+ value: false
787
+ init_method_std:
788
+ value: 0.02
789
+ init_method_xavier_uniform:
790
+ value: false
791
+ init_model_with_meta_device:
792
+ value: false
793
+ init_strategy:
794
+ value: null
795
+ initial_loss_scale:
796
+ value: 4294967296
797
+ initialize_embedding:
798
+ value: false
799
+ inprocess_active_world_size:
800
+ value: 8
801
+ inprocess_barrier_timeout:
802
+ value: 120
803
+ inprocess_completion_timeout:
804
+ value: 120
805
+ inprocess_empty_cuda_cache:
806
+ value: false
807
+ inprocess_granularity:
808
+ value: node
809
+ inprocess_hard_timeout:
810
+ value: 90
811
+ inprocess_heartbeat_interval:
812
+ value: 30
813
+ inprocess_heartbeat_timeout:
814
+ value: 60
815
+ inprocess_last_call_wait:
816
+ value: 1
817
+ inprocess_max_iterations:
818
+ value: null
819
+ inprocess_monitor_process_interval:
820
+ value: 1
821
+ inprocess_monitor_thread_interval:
822
+ value: 1
823
+ inprocess_progress_watchdog_interval:
824
+ value: 1
825
+ inprocess_restart:
826
+ value: false
827
+ inprocess_soft_timeout:
828
+ value: 60
829
+ inprocess_termination_grace_time:
830
+ value: 1
831
+ interleave_prob:
832
+ value: null
833
+ is_hybrid_model:
834
+ value: false
835
+ is_multimodal:
836
+ value: false
837
+ iter_per_epoch:
838
+ value: 1250
839
+ iterations_to_skip:
840
+ value: []
841
+ keep_fp8_transpose_cache:
842
+ value: false
843
+ kitchen_config_file:
844
+ value: null
845
+ kitchen_recipe_number:
846
+ value: null
847
+ kl_in_reward:
848
+ value: false
849
+ kv_channels:
850
+ value: 128
851
+ kv_lora_rank:
852
+ value: 32
853
+ label_smoothing:
854
+ value: 0
855
+ layer_types:
856
+ value: null
857
+ lazy_mpu_init:
858
+ value: null
859
+ lazy_tokenize:
860
+ value: false
861
+ linear_conv_kernel_dim:
862
+ value: null
863
+ linear_key_head_dim:
864
+ value: null
865
+ linear_num_key_heads:
866
+ value: null
867
+ linear_num_value_heads:
868
+ value: null
869
+ linear_value_head_dim:
870
+ value: null
871
+ llm_architectures:
872
+ value: Qwen3ForCausalLM
873
+ load:
874
+ value: /workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT
875
+ load_args:
876
+ value: false
877
+ load_data_args:
878
+ value: false
879
+ load_from_cache_file:
880
+ value: false
881
+ load_main_params_from_ckpt:
882
+ value: null
883
+ load_model_opt_format:
884
+ value: false
885
+ load_safetensors:
886
+ value: false
887
+ local_rank:
888
+ value: 7
889
+ local_repo_path:
890
+ value: null
891
+ log_completions:
892
+ value: false
893
+ log_energy:
894
+ value: false
895
+ log_entropy:
896
+ value: false
897
+ log_interval:
898
+ value: 1
899
+ log_loss_scale_to_tensorboard:
900
+ value: true
901
+ log_memory_to_tensorboard:
902
+ value: true
903
+ log_num_zeros_in_grad:
904
+ value: false
905
+ log_params_norm:
906
+ value: false
907
+ log_progress:
908
+ value: false
909
+ log_rollout_offpolicy_metrics:
910
+ value: false
911
+ log_straggler:
912
+ value: false
913
+ log_throughput:
914
+ value: false
915
+ log_timers_to_tensorboard:
916
+ value: true
917
+ log_validation_ppl_to_tensorboard:
918
+ value: true
919
+ log_world_size_to_tensorboard:
920
+ value: false
921
+ logging_level:
922
+ value: 20
923
+ logprobs:
924
+ value: false
925
+ lora_alpha:
926
+ value: 32
927
+ lora_bias:
928
+ value: none
929
+ lora_dropout:
930
+ value: 0.05
931
+ lora_dtype:
932
+ value: null
933
+ lora_modules:
934
+ value: []
935
+ lora_rank:
936
+ value: 8
937
+ loss_scale:
938
+ value: null
939
+ loss_scale_window:
940
+ value: 1000
941
+ loss_type:
942
+ value: null
943
+ lr:
944
+ value: 0.0001
945
+ lr_decay_iters:
946
+ value: null
947
+ lr_decay_samples:
948
+ value: null
949
+ lr_decay_style:
950
+ value: cosine
951
+ lr_warmup_fraction:
952
+ value: 0.05
953
+ lr_warmup_init:
954
+ value: 0
955
+ lr_warmup_iters:
956
+ value: 0
957
+ lr_warmup_samples:
958
+ value: 0
959
+ lr_wsd_decay_iters:
960
+ value: null
961
+ lr_wsd_decay_samples:
962
+ value: null
963
+ lr_wsd_decay_style:
964
+ value: exponential
965
+ main_grads_dtype:
966
+ value: torch.float32
967
+ main_params_dtype:
968
+ value: torch.float32
969
+ make_vocab_size_divisible_by:
970
+ value: 128
971
+ mamba_head_dim:
972
+ value: 64
973
+ mamba_num_groups:
974
+ value: 8
975
+ mamba_num_heads:
976
+ value: null
977
+ mamba_state_dim:
978
+ value: 128
979
+ manual_gc:
980
+ value: false
981
+ manual_gc_eval:
982
+ value: true
983
+ manual_gc_interval:
984
+ value: 0
985
+ mask_factor:
986
+ value: 1
987
+ mask_prob:
988
+ value: 0.15
989
+ mask_type:
990
+ value: random
991
+ masked_softmax_fusion:
992
+ value: true
993
+ max_completion_length:
994
+ value: 512
995
+ max_epochs:
996
+ value: 1
997
+ max_length:
998
+ value: 16384
999
+ max_model_len:
1000
+ value: null
1001
+ max_new_tokens:
1002
+ value: null
1003
+ max_pixels:
1004
+ value: null
1005
+ max_position_embeddings:
1006
+ value: 32768
1007
+ max_resample_times:
1008
+ value: 3
1009
+ max_shard_size:
1010
+ value: 5GB
1011
+ max_tokens_to_oom:
1012
+ value: 12000
1013
+ max_turns:
1014
+ value: null
1015
+ memory_snapshot_path:
1016
+ value: snapshot.pickle
1017
+ merge_file:
1018
+ value: null
1019
+ merge_lora:
1020
+ value: false
1021
+ micro_batch_size:
1022
+ value: 2
1023
+ microbatch_group_size_per_vp_stage:
1024
+ value: null
1025
+ mid_level_dataset_surplus:
1026
+ value: 0.005
1027
+ min_loss_scale:
1028
+ value: 1
1029
+ min_lr:
1030
+ value: 3e-06
1031
+ mlp_chunks_for_prefill:
1032
+ value: 1
1033
+ mlp_padding_free:
1034
+ value: false
1035
+ mmap_bin_files:
1036
+ value: true
1037
+ mock_data:
1038
+ value: false
1039
+ model:
1040
+ value: Qwen/Qwen3-0.6B-Base
1041
+ model_author:
1042
+ value: null
1043
+ model_dir:
1044
+ value: /workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd
1045
+ model_name:
1046
+ value: null
1047
+ model_revision:
1048
+ value: null
1049
+ model_type:
1050
+ value: qwen3_plt
1051
+ modules_to_save:
1052
+ value: []
1053
+ moe_apply_probs_on_input:
1054
+ value: false
1055
+ moe_aux_loss_coeff:
1056
+ value: 0
1057
+ moe_deepep_num_sms:
1058
+ value: 20
1059
+ moe_enable_deepep:
1060
+ value: false
1061
+ moe_expert_capacity_factor:
1062
+ value: null
1063
+ moe_extended_tp:
1064
+ value: false
1065
+ moe_ffn_hidden_size:
1066
+ value: null
1067
+ moe_grouped_gemm:
1068
+ value: true
1069
+ moe_input_jitter_eps:
1070
+ value: null
1071
+ moe_layer_freq:
1072
+ value: 1
1073
+ moe_layer_recompute:
1074
+ value: false
1075
+ moe_pad_expert_input_to_capacity:
1076
+ value: false
1077
+ moe_per_layer_logging:
1078
+ value: false
1079
+ moe_permute_fusion:
1080
+ value: false
1081
+ moe_router_bias_update_rate:
1082
+ value: 0.001
1083
+ moe_router_dtype:
1084
+ value: fp32
1085
+ moe_router_enable_expert_bias:
1086
+ value: false
1087
+ moe_router_force_load_balancing:
1088
+ value: false
1089
+ moe_router_fusion:
1090
+ value: false
1091
+ moe_router_group_topk:
1092
+ value: null
1093
+ moe_router_load_balancing_type:
1094
+ value: aux_loss
1095
+ moe_router_num_groups:
1096
+ value: null
1097
+ moe_router_padding_for_fp8:
1098
+ value: false
1099
+ moe_router_pre_softmax:
1100
+ value: false
1101
+ moe_router_score_function:
1102
+ value: softmax
1103
+ moe_router_topk:
1104
+ value: 2
1105
+ moe_router_topk_scaling_factor:
1106
+ value: null
1107
+ moe_shared_expert_intermediate_size:
1108
+ value: null
1109
+ moe_shared_expert_overlap:
1110
+ value: false
1111
+ moe_token_dispatcher_type:
1112
+ value: alltoall
1113
+ moe_token_drop_policy:
1114
+ value: probs
1115
+ moe_upcycling_granularity:
1116
+ value: 1
1117
+ moe_use_legacy_grouped_gemm:
1118
+ value: false
1119
+ moe_use_upcycling:
1120
+ value: false
1121
+ moe_z_loss_coeff:
1122
+ value: null
1123
+ move_model_batches:
1124
+ value: null
1125
+ mrope_interleaved:
1126
+ value: false
1127
+ mrope_section:
1128
+ value: null
1129
+ mscale:
1130
+ value: 1
1131
+ mscale_all_dim:
1132
+ value: 0
1133
+ mtp_loss_scaling_factor:
1134
+ value: 0.1
1135
+ mtp_num_layers:
1136
+ value: null
1137
+ multi_latent_attention:
1138
+ value: false
1139
+ multi_turn_scheduler:
1140
+ value: null
1141
+ multiple_validation_sets:
1142
+ value: false
1143
+ nccl_all_reduce_for_prefill:
1144
+ value: false
1145
+ nccl_communicator_config_path:
1146
+ value: null
1147
+ nccl_ub:
1148
+ value: false
1149
+ new_special_tokens:
1150
+ value: []
1151
+ no_load_optim:
1152
+ value: null
1153
+ no_load_rng:
1154
+ value: null
1155
+ no_persist_layer_norm:
1156
+ value: false
1157
+ no_rope_freq:
1158
+ value: null
1159
+ no_save_optim:
1160
+ value: null
1161
+ no_save_rng:
1162
+ value: null
1163
+ non_persistent_ckpt_type:
1164
+ value: null
1165
+ non_persistent_global_ckpt_dir:
1166
+ value: null
1167
+ non_persistent_local_ckpt_algo:
1168
+ value: fully_parallel
1169
+ non_persistent_local_ckpt_dir:
1170
+ value: null
1171
+ non_persistent_save_interval:
1172
+ value: null
1173
+ norm_bbox:
1174
+ value: null
1175
+ norm_epsilon:
1176
+ value: 1e-06
1177
+ normalization:
1178
+ value: RMSNorm
1179
+ num_attention_heads:
1180
+ value: 16
1181
+ num_beams:
1182
+ value: 1
1183
+ num_channels:
1184
+ value: 3
1185
+ num_classes:
1186
+ value: 1000
1187
+ num_dataset_builder_threads:
1188
+ value: 1
1189
+ num_distributed_optimizer_instances:
1190
+ value: 1
1191
+ num_experts:
1192
+ value: null
1193
+ num_generations:
1194
+ value: 8
1195
+ num_iterations:
1196
+ value: 1
1197
+ num_labels:
1198
+ value: null
1199
+ num_layers:
1200
+ value: 28
1201
+ num_layers_at_end_in_bf16:
1202
+ value: 1
1203
+ num_layers_at_start_in_bf16:
1204
+ value: 1
1205
+ num_layers_per_virtual_pipeline_stage:
1206
+ value: null
1207
+ num_query_groups:
1208
+ value: 8
1209
+ num_virtual_stages_per_pipeline_rank:
1210
+ value: null
1211
+ num_workers:
1212
+ value: 32
1213
+ object_storage_cache_path:
1214
+ value: null
1215
+ off_policy_sequence_mask_delta:
1216
+ value: null
1217
+ offload_bridge:
1218
+ value: false
1219
+ offload_model:
1220
+ value: false
1221
+ offload_optimizer:
1222
+ value: false
1223
+ one_logger_async:
1224
+ value: false
1225
+ one_logger_project:
1226
+ value: megatron-lm
1227
+ one_logger_run_name:
1228
+ value: null
1229
+ onnx_safe:
1230
+ value: null
1231
+ openai_gelu:
1232
+ value: false
1233
+ optimizer:
1234
+ value: adam
1235
+ optimizer_cpu_offload:
1236
+ value: false
1237
+ optimizer_offload_fraction:
1238
+ value: 1
1239
+ original_max_position_embeddings:
1240
+ value: null
1241
+ output_bert_embeddings:
1242
+ value: false
1243
+ overlap_cpu_optimizer_d2h_h2d:
1244
+ value: false
1245
+ overlap_grad_reduce:
1246
+ value: true
1247
+ overlap_moe_expert_parallel_comm:
1248
+ value: false
1249
+ overlap_p2p_comm:
1250
+ value: false
1251
+ overlap_p2p_comm_warmup_flush:
1252
+ value: false
1253
+ overlap_param_gather:
1254
+ value: true
1255
+ overlap_param_gather_with_optimizer_step:
1256
+ value: false
1257
+ overlong_filter:
1258
+ value: false
1259
+ override_opt_param_scheduler:
1260
+ value: false
1261
+ packing:
1262
+ value: true
1263
+ packing_length:
1264
+ value: 16384
1265
+ packing_num_proc:
1266
+ value: 1
1267
+ padded_vocab_size:
1268
+ value: 151936
1269
+ padding_free:
1270
+ value: true
1271
+ padding_side:
1272
+ value: right
1273
+ params_dtype:
1274
+ value: torch.bfloat16
1275
+ partial_rotary_factor:
1276
+ value: null
1277
+ patch_dim:
1278
+ value: 16
1279
+ patch_size:
1280
+ value: 4
1281
+ per_split_data_args_path:
1282
+ value: null
1283
+ perform_initialization:
1284
+ value: false
1285
+ pin_cpu_grads:
1286
+ value: true
1287
+ pin_cpu_params:
1288
+ value: true
1289
+ pipeline_model_parallel_comm_backend:
1290
+ value: null
1291
+ pipeline_model_parallel_layout:
1292
+ value: null
1293
+ pipeline_model_parallel_size:
1294
+ value: 1
1295
+ position_embedding_type:
1296
+ value: rope
1297
+ pretrained_checkpoint:
1298
+ value: null
1299
+ problem_type:
1300
+ value: null
1301
+ profile:
1302
+ value: false
1303
+ profile_ranks:
1304
+ value:
1305
+ - 0
1306
+ profile_step_end:
1307
+ value: 12
1308
+ profile_step_start:
1309
+ value: 10
1310
+ q_lora_rank:
1311
+ value: null
1312
+ qk_head_dim:
1313
+ value: 128
1314
+ qk_l2_norm:
1315
+ value: false
1316
+ qk_layernorm:
1317
+ value: true
1318
+ qk_pos_emb_head_dim:
1319
+ value: 64
1320
+ quant_bits:
1321
+ value: null
1322
+ quant_method:
1323
+ value: null
1324
+ query_in_block_prob:
1325
+ value: 0.1
1326
+ rampup_batch_size:
1327
+ value: null
1328
+ rank:
1329
+ value: 7
1330
+ ray_exp_name:
1331
+ value: null
1332
+ recompute_granularity:
1333
+ value: full
1334
+ recompute_method:
1335
+ value: uniform
1336
+ recompute_modules:
1337
+ value:
1338
+ - core_attn
1339
+ recompute_num_layers:
1340
+ value: 1
1341
+ record_memory_history:
1342
+ value: false
1343
+ ref_adapter_load:
1344
+ value: null
1345
+ ref_adapters:
1346
+ value: []
1347
+ ref_load:
1348
+ value: null
1349
+ ref_model:
1350
+ value: null
1351
+ ref_model_mixup_alpha:
1352
+ value: 0.6
1353
+ ref_model_sync_steps:
1354
+ value: 512
1355
+ reference_free:
1356
+ value: false
1357
+ relative_attention_max_distance:
1358
+ value: 128
1359
+ relative_attention_num_buckets:
1360
+ value: 32
1361
+ remove_unused_columns:
1362
+ value: true
1363
+ repetition_max_penalty:
1364
+ value: -1
1365
+ repetition_n_grams:
1366
+ value: 3
1367
+ repetition_penalty:
1368
+ value: 1
1369
+ replication:
1370
+ value: false
1371
+ replication_factor:
1372
+ value: 2
1373
+ replication_jump:
1374
+ value: null
1375
+ rerun_mode:
1376
+ value: validate_results
1377
+ reset_attention_mask:
1378
+ value: false
1379
+ reset_position_ids:
1380
+ value: false
1381
+ response_prefix:
1382
+ value: null
1383
+ result_rejected_tracker_filename:
1384
+ value: null
1385
+ retriever_report_topk_accuracies:
1386
+ value: []
1387
+ retriever_score_scaling:
1388
+ value: false
1389
+ retriever_seq_length:
1390
+ value: 256
1391
+ retro_add_retriever:
1392
+ value: false
1393
+ retro_attention_gate:
1394
+ value: 1
1395
+ retro_cyclic_train_iters:
1396
+ value: null
1397
+ retro_encoder_attention_dropout:
1398
+ value: 0.1
1399
+ retro_encoder_hidden_dropout:
1400
+ value: 0.1
1401
+ retro_encoder_layers:
1402
+ value: 2
1403
+ retro_num_neighbors:
1404
+ value: 2
1405
+ retro_num_retrieved_chunks:
1406
+ value: 2
1407
+ retro_project_dir:
1408
+ value: null
1409
+ retro_verify_neighbor_count:
1410
+ value: true
1411
+ reuse_grad_buf_for_mxfp8_param_ag:
1412
+ value: false
1413
+ reward_funcs:
1414
+ value: []
1415
+ reward_model:
1416
+ value: null
1417
+ reward_model_plugin:
1418
+ value: null
1419
+ reward_weights:
1420
+ value: null
1421
+ rlhf_type:
1422
+ value: null
1423
+ rollout_importance_sampling_mode:
1424
+ value: null
1425
+ rollout_importance_sampling_threshold:
1426
+ value: 2
1427
+ rope_scaling:
1428
+ value: null
1429
+ rope_scaling_factor:
1430
+ value: 8
1431
+ rope_type:
1432
+ value: null
1433
+ rotary_base:
1434
+ value: 1000000
1435
+ rotary_interleaved:
1436
+ value: false
1437
+ rotary_percent:
1438
+ value: 1
1439
+ rotary_scaling_factor:
1440
+ value: 1
1441
+ rotary_seq_len_interpolation_factor:
1442
+ value: null
1443
+ rpo_alpha:
1444
+ value: null
1445
+ run_workload_inspector_server:
1446
+ value: false
1447
+ sample_rate:
1448
+ value: 1
1449
+ save:
1450
+ value: /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727
1451
+ save_interval:
1452
+ value: 100
1453
+ save_retain_interval:
1454
+ value: null
1455
+ save_safetensors:
1456
+ value: false
1457
+ save_strategy:
1458
+ value: steps
1459
+ scale_rewards:
1460
+ value: group
1461
+ scatter_gather_tensors_in_pipeline:
1462
+ value: true
1463
+ seed:
1464
+ value: 42
1465
+ seq_length:
1466
+ value: 16384
1467
+ sequence_parallel:
1468
+ value: false
1469
+ sequence_parallel_size:
1470
+ value: 1
1471
+ sft:
1472
+ value: false
1473
+ sft_tokenizer_prompt_format:
1474
+ value: nemotron-h-aligned
1475
+ sgd_momentum:
1476
+ value: 0.9
1477
+ sharp_enabled_group:
1478
+ value: null
1479
+ short_seq_prob:
1480
+ value: 0.1
1481
+ shuffle_buffer_size:
1482
+ value: 1000
1483
+ skip_train:
1484
+ value: false
1485
+ skipped_train_samples:
1486
+ value: 0
1487
+ sleep_level:
1488
+ value: 0
1489
+ soft_cache_length:
1490
+ value: null
1491
+ soft_max_length:
1492
+ value: null
1493
+ spec:
1494
+ value: null
1495
+ split:
1496
+ value: null
1497
+ split_dataset_ratio:
1498
+ value: 0
1499
+ squared_relu:
1500
+ value: false
1501
+ start_weight_decay:
1502
+ value: 0.1
1503
+ steps_per_generation:
1504
+ value: null
1505
+ stop_words:
1506
+ value: []
1507
+ stopping_strategy:
1508
+ value: first_exhausted
1509
+ straggler_ctrlr_port:
1510
+ value: 65535
1511
+ straggler_minmax_count:
1512
+ value: 1
1513
+ stream:
1514
+ value: false
1515
+ streaming:
1516
+ value: false
1517
+ strict:
1518
+ value: false
1519
+ strict_fsdp_dtensor_load:
1520
+ value: true
1521
+ suggested_communication_unit_size:
1522
+ value: null
1523
+ swiglu:
1524
+ value: true
1525
+ swin_backbone_type:
1526
+ value: tiny
1527
+ symmetric_ar_type:
1528
+ value: null
1529
+ sync_ref_model:
1530
+ value: false
1531
+ system:
1532
+ value: null
1533
+ target_modules:
1534
+ value:
1535
+ - all-linear
1536
+ target_regex:
1537
+ value: null
1538
+ task_type:
1539
+ value: causal_lm
1540
+ tau_neg:
1541
+ value: 1.05
1542
+ tau_pos:
1543
+ value: 1
1544
+ te_rng_tracker:
1545
+ value: false
1546
+ temperature:
1547
+ value: null
1548
+ template:
1549
+ value: qwen3
1550
+ template_backend:
1551
+ value: swift
1552
+ tensor_model_parallel_size:
1553
+ value: 1
1554
+ tensorboard_dir:
1555
+ value: /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/runs
1556
+ tensorboard_log_interval:
1557
+ value: 1
1558
+ tensorboard_queue_size:
1559
+ value: 50
1560
+ test_data_path:
1561
+ value: null
1562
+ test_mode:
1563
+ value: false
1564
+ tiktoken_num_special_tokens:
1565
+ value: 1000
1566
+ tiktoken_pattern:
1567
+ value: null
1568
+ tiktoken_special_tokens:
1569
+ value: null
1570
+ timing_log_level:
1571
+ value: 0
1572
+ timing_log_option:
1573
+ value: minmax
1574
+ titles_data_path:
1575
+ value: null
1576
+ tokenizer_model:
1577
+ value: null
1578
+ tokenizer_type:
1579
+ value: null
1580
+ top_entropy_quantile:
1581
+ value: 1
1582
+ top_k:
1583
+ value: 50
1584
+ top_logprobs:
1585
+ value: null
1586
+ top_p:
1587
+ value: 0.9
1588
+ torch_dtype:
1589
+ value: torch.bfloat16
1590
+ torch_fsdp2_reshard_after_forward:
1591
+ value: true
1592
+ tp_comm_bootstrap_backend:
1593
+ value: nccl
1594
+ tp_comm_bulk_dgrad:
1595
+ value: true
1596
+ tp_comm_bulk_wgrad:
1597
+ value: true
1598
+ tp_comm_overlap:
1599
+ value: false
1600
+ tp_comm_overlap_ag:
1601
+ value: true
1602
+ tp_comm_overlap_cfg:
1603
+ value: null
1604
+ tp_comm_overlap_rs:
1605
+ value: true
1606
+ tp_comm_overlap_rs_dgrad:
1607
+ value: false
1608
+ tp_comm_split_ag:
1609
+ value: true
1610
+ tp_comm_split_rs:
1611
+ value: true
1612
+ train_data_path:
1613
+ value: null
1614
+ train_dataloader_shuffle:
1615
+ value: true
1616
+ train_iters:
1617
+ value: 6350
1618
+ train_samples:
1619
+ value: null
1620
+ train_sync_interval:
1621
+ value: null
1622
+ train_type:
1623
+ value: full
1624
+ trainable_parameters:
1625
+ value: []
1626
+ trainable_parameters_regex:
1627
+ value: null
1628
+ transformer_impl:
1629
+ value: transformer_engine
1630
+ transformer_pipeline_model_parallel_size:
1631
+ value: 1
1632
+ truncation_strategy:
1633
+ value: right
1634
+ tuner_backend:
1635
+ value: peft
1636
+ undesirable_weight:
1637
+ value: 1
1638
+ untie_embeddings_and_output_weights:
1639
+ value: false
1640
+ use_chat_template:
1641
+ value: false
1642
+ use_checkpoint_args:
1643
+ value: false
1644
+ use_checkpoint_opt_param_scheduler:
1645
+ value: false
1646
+ use_cpu_initialization:
1647
+ value: null
1648
+ use_dist_ckpt:
1649
+ value: true
1650
+ use_dist_ckpt_deprecated:
1651
+ value: false
1652
+ use_distributed_optimizer:
1653
+ value: true
1654
+ use_flash_attn:
1655
+ value: false
1656
+ use_fused_weighted_squared_relu:
1657
+ value: false
1658
+ use_hf:
1659
+ value: true
1660
+ use_legacy_models:
1661
+ value: false
1662
+ use_megatron_fsdp:
1663
+ value: false
1664
+ use_mp_args_from_checkpoint_args:
1665
+ value: false
1666
+ use_one_sent_docs:
1667
+ value: false
1668
+ use_persistent_ckpt_worker:
1669
+ value: false
1670
+ use_precision_aware_optimizer:
1671
+ value: true
1672
+ use_pytorch_profiler:
1673
+ value: false
1674
+ use_ray:
1675
+ value: false
1676
+ use_ring_exchange_p2p:
1677
+ value: false
1678
+ use_rope_scaling:
1679
+ value: false
1680
+ use_rotary_position_embeddings:
1681
+ value: false
1682
+ use_rslora:
1683
+ value: false
1684
+ use_shared_expert_gate:
1685
+ value: false
1686
+ use_sharp:
1687
+ value: false
1688
+ use_swift_lora:
1689
+ value: false
1690
+ use_tokenizer_model_from_checkpoint_args:
1691
+ value: true
1692
+ use_torch_fsdp2:
1693
+ value: false
1694
+ use_torch_optimizer_for_cpu_offload:
1695
+ value: false
1696
+ use_tp_pp_dp_mapping:
1697
+ value: false
1698
+ use_vllm:
1699
+ value: true
1700
+ v_head_dim:
1701
+ value: 128
1702
+ val_dataset:
1703
+ value: []
1704
+ val_dataset_shuffle:
1705
+ value: false
1706
+ valid_data_path:
1707
+ value: null
1708
+ variable_seq_lengths:
1709
+ value: false
1710
+ virtual_pipeline_model_parallel_size:
1711
+ value: null
1712
+ vision_backbone_type:
1713
+ value: vit
1714
+ vision_pretraining:
1715
+ value: false
1716
+ vision_pretraining_type:
1717
+ value: classify
1718
+ vit_gradient_checkpointing:
1719
+ value: true
1720
+ vit_lr:
1721
+ value: null
1722
+ vllm_disable_cascade_attn:
1723
+ value: false
1724
+ vllm_enable_prefix_caching:
1725
+ value: true
1726
+ vllm_enforce_eager:
1727
+ value: false
1728
+ vllm_engine_kwargs:
1729
+ value: null
1730
+ vllm_gpu_memory_utilization:
1731
+ value: 0.9
1732
+ vllm_limit_mm_per_prompt:
1733
+ value: null
1734
+ vllm_max_model_len:
1735
+ value: null
1736
+ vllm_max_num_seqs:
1737
+ value: null
1738
+ vllm_mm_processor_cache_gb:
1739
+ value: null
1740
+ vllm_mode:
1741
+ value: null
1742
+ vllm_server_base_url:
1743
+ value: null
1744
+ vllm_server_group_port:
1745
+ value: null
1746
+ vllm_server_host:
1747
+ value: null
1748
+ vllm_server_pass_dataset:
1749
+ value: false
1750
+ vllm_server_port:
1751
+ value:
1752
+ - 8000
1753
+ vllm_server_timeout:
1754
+ value: 240
1755
+ vllm_tensor_parallel_size:
1756
+ value: 1
1757
+ vocab_extra_ids:
1758
+ value: 0
1759
+ vocab_file:
1760
+ value: null
1761
+ vocab_size:
1762
+ value: null
1763
+ wandb_exp_name:
1764
+ value: plt_1
1765
+ wandb_log_unique_prompts:
1766
+ value: null
1767
+ wandb_project:
1768
+ value: plt
1769
+ wandb_save_dir:
1770
+ value: ""
1771
+ weight_decay:
1772
+ value: 0.1
1773
+ weight_decay_incr_style:
1774
+ value: constant
1775
+ wgrad_deferral_limit:
1776
+ value: 0
1777
+ world_size:
1778
+ value: 8
1779
+ yaml_cfg:
1780
+ value: null
wandb/wandb/run-20251223_145018-ogu6y2pr/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/run-20251223_145018-ogu6y2pr/files/requirements.txt ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pip==25.3
2
+ setuptools==80.9.0
3
+ wheel==0.45.1
4
+ multidict==6.7.0
5
+ cffi==2.0.0
6
+ typing-inspection==0.4.2
7
+ smmap==5.0.2
8
+ sentry-sdk==2.48.0
9
+ pydantic_core==2.41.5
10
+ pydantic_core==2.41.4
11
+ protobuf==6.33.2
12
+ annotated-types==0.7.0
13
+ pydantic==2.12.5
14
+ pydantic==2.12.3
15
+ gitdb==4.0.12
16
+ GitPython==3.1.45
17
+ wandb==0.23.1
18
+ sortedcontainers==2.4.0
19
+ pytz==2025.2
20
+ pydub==0.25.1
21
+ jieba==0.42.1
22
+ crcmod==1.7
23
+ cpm-kernels==1.0.11
24
+ brotli==1.2.0
25
+ antlr4-python3-runtime==4.9.3
26
+ addict==2.4.0
27
+ zstandard==0.25.0
28
+ zipp==3.23.0
29
+ xxhash==3.6.0
30
+ Werkzeug==3.1.4
31
+ websockets==15.0.1
32
+ uvicorn==0.40.0
33
+ tzdata==2025.3
34
+ tomlkit==0.13.3
35
+ tensorboard-data-server==0.7.2
36
+ sniffio==1.3.1
37
+ simplejson==3.20.2
38
+ semantic-version==2.10.0
39
+ scipy==1.16.3
40
+ safetensors==0.7.0
41
+ ruff==0.14.10
42
+ rouge==1.0.1
43
+ regex==2025.11.3
44
+ python-multipart==0.0.21
45
+ pyparsing==3.3.1
46
+ pycryptodome==3.23.0
47
+ pycparser==2.23
48
+ pyarrow==22.0.0
49
+ propcache==0.4.1
50
+ mdurl==0.1.2
51
+ pillow==11.3.0
52
+ orjson==3.11.5
53
+ omegaconf==2.3.0
54
+ Markdown==3.10
55
+ kiwisolver==1.4.9
56
+ json_repair==0.54.3
57
+ joblib==1.5.3
58
+ jmespath==0.10.0
59
+ jiter==0.12.0
60
+ grpcio==1.76.0
61
+ groovy==0.1.2
62
+ future==1.0.0
63
+ trl==0.24.0
64
+ fsspec==2025.3.0
65
+ frozenlist==1.8.0
66
+ fonttools==4.61.1
67
+ ffmpy==1.0.0
68
+ einops==0.8.1
69
+ distro==1.9.0
70
+ dill==0.3.8
71
+ dacite==1.9.2
72
+ cycler==0.12.1
73
+ contourpy==1.3.3
74
+ attrs==25.4.0
75
+ attrdict==2.0.1
76
+ annotated-doc==0.0.4
77
+ aiohappyeyeballs==2.6.1
78
+ aiofiles==24.1.0
79
+ absl-py==2.3.1
80
+ yarl==1.22.0
81
+ tiktoken==0.12.0
82
+ tensorboard==2.20.0
83
+ starlette==0.50.0
84
+ pandas==2.3.3
85
+ nltk==3.9.2
86
+ multiprocess==0.70.16
87
+ modelscope==1.33.0
88
+ matplotlib==3.10.8
89
+ markdown-it-py==4.0.0
90
+ importlib_metadata==8.7.1
91
+ huggingface-hub==0.36.0
92
+ binpacking==1.5.2
93
+ aiosignal==1.4.0
94
+ tokenizers==0.22.1
95
+ safehttpx==0.1.7
96
+ rich==14.2.0
97
+ openai==2.14.0
98
+ gradio_client==1.14.0
99
+ fastapi==0.127.0
100
+ cryptography==46.0.3
101
+ aiohttp==3.13.2
102
+ typer==0.20.1
103
+ transformers==4.57.3
104
+ aliyun-python-sdk-core==2.16.0
105
+ accelerate==1.12.0
106
+ transformers-stream-generator==0.0.5
107
+ peft==0.18.0
108
+ gradio==5.50.0
109
+ datasets==3.6.0
110
+ aliyun-python-sdk-kms==2.16.5
111
+ oss2==2.19.1
112
+ ms_swift==3.12.0.dev0
113
+ liger_kernel==0.6.4
114
+ hf_transfer==0.1.9
115
+ pybind11==3.0.1
116
+ transformer_engine==2.10.0
117
+ ml_dtypes==0.5.4
118
+ onnx==1.20.0
119
+ transformer_engine_cu12==2.10.0
120
+ onnx-ir==0.1.13
121
+ onnxscript==0.5.7
122
+ transformer_engine_torch==2.10.0
123
+ apex==0.1
124
+ numpy==1.26.4
125
+ megatron-core==0.15.0
126
+ flash_attn==2.8.3
127
+ charset-normalizer==3.4.4
128
+ Jinja2==3.1.6
129
+ MarkupSafe==3.0.3
130
+ mpmath==1.3.0
131
+ networkx==3.6.1
132
+ nvidia-cublas-cu12==12.8.4.1
133
+ nvidia-cuda-cupti-cu12==12.8.90
134
+ nvidia-cuda-nvrtc-cu12==12.8.93
135
+ nvidia-cuda-runtime-cu12==12.8.90
136
+ nvidia-cudnn-cu12==9.10.2.21
137
+ nvidia-cufft-cu12==11.3.3.83
138
+ nvidia-cufile-cu12==1.13.1.3
139
+ nvidia-curand-cu12==10.3.9.90
140
+ nvidia-cusolver-cu12==11.7.3.90
141
+ nvidia-cusparse-cu12==12.5.8.93
142
+ nvidia-cusparselt-cu12==0.7.1
143
+ nvidia-nccl-cu12==2.27.5
144
+ nvidia-nvjitlink-cu12==12.8.93
145
+ nvidia-nvshmem-cu12==3.3.20
146
+ nvidia-nvtx-cu12==12.8.90
147
+ requests==2.32.5
148
+ sentencepiece==0.2.1
149
+ sympy==1.14.0
150
+ torch==2.9.1+cu128
151
+ torchaudio==2.9.1+cu128
152
+ torchcodec==0.9.1
153
+ torchdata==0.10.0
154
+ torchtext==0.6.0
155
+ torchvision==0.24.1+cu128
156
+ triton==3.5.1
157
+ urllib3==2.6.2
158
+ anyio==4.12.0
159
+ asttokens==3.0.1
160
+ certifi==2025.11.12
161
+ click==8.3.1
162
+ comm==0.2.3
163
+ debugpy==1.8.18
164
+ decorator==5.2.1
165
+ executing==2.2.1
166
+ filelock==3.20.0
167
+ h11==0.16.0
168
+ hf-xet==1.2.0
169
+ httpcore==1.0.9
170
+ httpx==0.28.1
171
+ idna==3.11
172
+ ipykernel==7.1.0
173
+ ipython==9.8.0
174
+ ipython_pygments_lexers==1.1.1
175
+ ipywidgets==8.1.8
176
+ jedi==0.19.2
177
+ jupyter_client==8.7.0
178
+ jupyter_core==5.9.1
179
+ jupyterlab_widgets==3.0.16
180
+ matplotlib-inline==0.2.1
181
+ nest-asyncio==1.6.0
182
+ packaging==25.0
183
+ parso==0.8.5
184
+ pexpect==4.9.0
185
+ platformdirs==4.5.1
186
+ prompt_toolkit==3.0.52
187
+ psutil==7.1.3
188
+ ptyprocess==0.7.0
189
+ pure_eval==0.2.3
190
+ Pygments==2.19.2
191
+ python-dateutil==2.9.0.post0
192
+ PyYAML==6.0.3
193
+ pyzmq==27.1.0
194
+ shellingham==1.5.4
195
+ six==1.17.0
196
+ stack-data==0.6.3
197
+ tornado==6.5.3
198
+ tqdm==4.67.1
199
+ traitlets==5.14.3
200
+ typer-slim==0.20.0
201
+ typing_extensions==4.15.0
202
+ wcwidth==0.2.14
203
+ widgetsnbextension==4.0.15
204
+ autocommand==2.2.2
205
+ backports.tarfile==1.2.0
206
+ importlib_metadata==8.0.0
207
+ inflect==7.3.1
208
+ jaraco.collections==5.1.0
209
+ jaraco.context==5.3.0
210
+ jaraco.functools==4.0.1
211
+ jaraco.text==3.12.1
212
+ more-itertools==10.3.0
213
+ packaging==24.2
214
+ platformdirs==4.2.2
215
+ tomli==2.0.1
216
+ typeguard==4.3.0
217
+ typing_extensions==4.12.2
218
+ wheel==0.45.1
219
+ zipp==3.19.2
wandb/wandb/run-20251223_145018-ogu6y2pr/files/wandb-metadata.json ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-58-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.12",
4
+ "startedAt": "2025-12-23T14:50:18.904388Z",
5
+ "args": [
6
+ "--seed",
7
+ "42",
8
+ "--micro-batch-size",
9
+ "2",
10
+ "--global-batch-size",
11
+ "256",
12
+ "--recompute-granularity",
13
+ "full",
14
+ "--recompute-method",
15
+ "uniform",
16
+ "--recompute-num-layers",
17
+ "1",
18
+ "--recompute-modules",
19
+ "core_attn",
20
+ "--train-iters",
21
+ "6350",
22
+ "--log-interval",
23
+ "1",
24
+ "--tensorboard-dir",
25
+ "/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/runs",
26
+ "--cross-entropy-loss-fusion",
27
+ "--cross-entropy-fusion-impl",
28
+ "native",
29
+ "--calculate-per-token-loss",
30
+ "--attention-backend",
31
+ "flash",
32
+ "--optimizer",
33
+ "adam",
34
+ "--optimizer-offload-fraction",
35
+ "1.0",
36
+ "--use-precision-aware-optimizer",
37
+ "--main-grads-dtype",
38
+ "fp32",
39
+ "--main-params-dtype",
40
+ "fp32",
41
+ "--exp-avg-dtype",
42
+ "fp32",
43
+ "--exp-avg-sq-dtype",
44
+ "fp32",
45
+ "--dataloader-type",
46
+ "cyclic",
47
+ "--manual-gc-interval",
48
+ "0",
49
+ "--lr",
50
+ "0.0001",
51
+ "--lr-decay-style",
52
+ "cosine",
53
+ "--lr-warmup-iters",
54
+ "0",
55
+ "--lr-warmup-fraction",
56
+ "0.05",
57
+ "--min-lr",
58
+ "3e-06",
59
+ "--weight-decay",
60
+ "0.1",
61
+ "--clip-grad",
62
+ "1.0",
63
+ "--adam-beta1",
64
+ "0.9",
65
+ "--adam-beta2",
66
+ "0.95",
67
+ "--adam-eps",
68
+ "1e-08",
69
+ "--sgd-momentum",
70
+ "0.9",
71
+ "--save",
72
+ "/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727",
73
+ "--save-interval",
74
+ "100",
75
+ "--load",
76
+ "/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT",
77
+ "--finetune",
78
+ "--ckpt-format",
79
+ "torch_dist",
80
+ "--no-initialization",
81
+ "--auto-detect-ckpt-format",
82
+ "--exit-on-missing-checkpoint",
83
+ "--distributed-backend",
84
+ "nccl",
85
+ "--local-rank",
86
+ "7",
87
+ "--use-distributed-optimizer",
88
+ "--tensor-model-parallel-size",
89
+ "1",
90
+ "--pipeline-model-parallel-size",
91
+ "1",
92
+ "--context-parallel-size",
93
+ "1",
94
+ "--overlap-grad-reduce",
95
+ "--overlap-param-gather",
96
+ "--distributed-timeout-minutes",
97
+ "300000",
98
+ "--num-layers",
99
+ "28",
100
+ "--hidden-size",
101
+ "1024",
102
+ "--ffn-hidden-size",
103
+ "3072",
104
+ "--num-attention-heads",
105
+ "16",
106
+ "--group-query-attention",
107
+ "--num-query-groups",
108
+ "8",
109
+ "--max-position-embeddings",
110
+ "32768",
111
+ "--position-embedding-type",
112
+ "rope",
113
+ "--rotary-base",
114
+ "1000000",
115
+ "--rotary-percent",
116
+ "1.0",
117
+ "--normalization",
118
+ "RMSNorm",
119
+ "--norm-epsilon",
120
+ "1e-06",
121
+ "--swiglu",
122
+ "--disable-bias-linear",
123
+ "--attention-dropout",
124
+ "0.0",
125
+ "--hidden-dropout",
126
+ "0.0",
127
+ "--kv-channels",
128
+ "128",
129
+ "--qk-layernorm",
130
+ "--transformer-impl",
131
+ "transformer_engine",
132
+ "--moe-layer-freq",
133
+ "1",
134
+ "--moe-router-topk",
135
+ "2",
136
+ "--moe-router-dtype",
137
+ "fp32",
138
+ "--moe-router-score-function",
139
+ "softmax",
140
+ "--moe-router-load-balancing-type",
141
+ "aux_loss",
142
+ "--expert-model-parallel-size",
143
+ "1",
144
+ "--expert-tensor-parallel-size",
145
+ "1",
146
+ "--moe-token-dispatcher-type",
147
+ "alltoall",
148
+ "--moe-grouped-gemm",
149
+ "--moe-aux-loss-coeff",
150
+ "0.0",
151
+ "--moe-token-drop-policy",
152
+ "probs",
153
+ "--kv-lora-rank",
154
+ "32",
155
+ "--qk-head-dim",
156
+ "128",
157
+ "--qk-pos-emb-head-dim",
158
+ "64",
159
+ "--mtp-loss-scaling-factor",
160
+ "0.1",
161
+ "--fp8-recipe",
162
+ "delayed",
163
+ "--fp8-amax-history-len",
164
+ "1024",
165
+ "--fp8-amax-compute-algo",
166
+ "max",
167
+ "--bf16",
168
+ "--attention-softmax-in-fp32",
169
+ "--tensorboard-log-interval",
170
+ "1",
171
+ "--tensorboard-queue-size",
172
+ "50",
173
+ "--log-timers-to-tensorboard",
174
+ "--log-validation-ppl-to-tensorboard",
175
+ "--log-memory-to-tensorboard",
176
+ "--logging-level",
177
+ "20",
178
+ "--wandb-project",
179
+ "plt",
180
+ "--wandb-exp-name",
181
+ "plt_1",
182
+ "--eval-iters",
183
+ "-1",
184
+ "--eval-interval",
185
+ "100",
186
+ "--seq-length",
187
+ "16384",
188
+ "--num-workers",
189
+ "32"
190
+ ],
191
+ "program": "/workspace/halcyon-recipe2/swift/cli/_megatron/pt.py",
192
+ "codePath": "swift/cli/_megatron/pt.py",
193
+ "codePathLocal": "swift/cli/_megatron/pt.py",
194
+ "git": {
195
+ "remote": "https://github.com/weak-kajuma/halcyon-recipe2.git",
196
+ "commit": "ea7cc214b68fb511dd83bff83a504b7f43053577"
197
+ },
198
+ "email": "[email protected]",
199
+ "root": "/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/wandb",
200
+ "host": "36fd00e7b21c",
201
+ "executable": "/venv/main/bin/python3.12",
202
+ "cpu_count": 72,
203
+ "cpu_count_logical": 144,
204
+ "gpu": "NVIDIA GeForce RTX 5090",
205
+ "gpu_count": 8,
206
+ "disk": {
207
+ "/": {
208
+ "total": "7669363507200",
209
+ "used": "923719221248"
210
+ }
211
+ },
212
+ "memory": {
213
+ "total": "540643262464"
214
+ },
215
+ "gpu_nvidia": [
216
+ {
217
+ "name": "NVIDIA GeForce RTX 5090",
218
+ "memoryTotal": "34190917632",
219
+ "cudaCores": 21760,
220
+ "architecture": "Blackwell",
221
+ "uuid": "GPU-5d40e56e-9cf1-0a97-080a-30624a8f6da3"
222
+ },
223
+ {
224
+ "name": "NVIDIA GeForce RTX 5090",
225
+ "memoryTotal": "34190917632",
226
+ "cudaCores": 21760,
227
+ "architecture": "Blackwell",
228
+ "uuid": "GPU-23ca8669-46fc-19eb-348b-e51e591c150d"
229
+ },
230
+ {
231
+ "name": "NVIDIA GeForce RTX 5090",
232
+ "memoryTotal": "34190917632",
233
+ "cudaCores": 21760,
234
+ "architecture": "Blackwell",
235
+ "uuid": "GPU-c4c1ca99-b237-b12b-43fd-7c0b428ed152"
236
+ },
237
+ {
238
+ "name": "NVIDIA GeForce RTX 5090",
239
+ "memoryTotal": "34190917632",
240
+ "cudaCores": 21760,
241
+ "architecture": "Blackwell",
242
+ "uuid": "GPU-d48e64fd-956c-1ce4-4e95-b9d198ba26e9"
243
+ },
244
+ {
245
+ "name": "NVIDIA GeForce RTX 5090",
246
+ "memoryTotal": "34190917632",
247
+ "cudaCores": 21760,
248
+ "architecture": "Blackwell",
249
+ "uuid": "GPU-29d31f97-dff9-6078-7bf6-d8fc65ada1b7"
250
+ },
251
+ {
252
+ "name": "NVIDIA GeForce RTX 5090",
253
+ "memoryTotal": "34190917632",
254
+ "cudaCores": 21760,
255
+ "architecture": "Blackwell",
256
+ "uuid": "GPU-ed004a01-be7c-9fc0-6742-ac7f7a0bea49"
257
+ },
258
+ {
259
+ "name": "NVIDIA GeForce RTX 5090",
260
+ "memoryTotal": "34190917632",
261
+ "cudaCores": 21760,
262
+ "architecture": "Blackwell",
263
+ "uuid": "GPU-56cdc53f-360e-a64f-2cd5-2ba3daaf5a7b"
264
+ },
265
+ {
266
+ "name": "NVIDIA GeForce RTX 5090",
267
+ "memoryTotal": "34190917632",
268
+ "cudaCores": 21760,
269
+ "architecture": "Blackwell",
270
+ "uuid": "GPU-aa4a1a25-49c1-62ec-3a38-070d6c7912ef"
271
+ }
272
+ ],
273
+ "cudaVersion": "13.0",
274
+ "writerId": "5txfscc07xx6ws9ptnhwj8isjczvektf"
275
+ }
wandb/wandb/run-20251223_145018-ogu6y2pr/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"iteration-time":6.732069969177246,"loss-scale":1,"grad-norm":0.2582673132419586,"_wandb":{"runtime":41997},"_runtime":41997.687498348,"samples vs steps":1628672,"lm loss":4.852498531341553,"batch-size":256,"_timestamp":1.7665434152263432e+09,"_step":6362,"learning-rate":3.000000106112566e-06}
wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-23T14:50:18.968299735Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpkv9b_kjn/port-40598.txt","pid":40598,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-23T14:50:18.970283526Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":40598}
3
+ {"time":"2025-12-23T14:50:18.970250071Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-40598-43370-2183720552/socket","Net":"unix"}}
4
+ {"time":"2025-12-23T14:50:19.151289716Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-23T14:50:19.157777281Z","level":"INFO","msg":"handleInformInit: received","streamId":"ogu6y2pr","id":"1(@)"}
6
+ {"time":"2025-12-23T14:50:19.453125586Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ogu6y2pr","id":"1(@)"}
7
+ {"time":"2025-12-24T02:30:18.155545103Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"ogu6y2pr","id":"1(@)"}
8
+ {"time":"2025-12-24T02:30:18.161882248Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"ogu6y2pr","id":"1(@)"}
9
+ {"time":"2025-12-24T02:30:22.730426005Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2025-12-24T02:30:22.730508603Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2025-12-24T02:30:22.730522955Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2025-12-24T02:30:22.730570938Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2025-12-24T02:30:22.730656762Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-40598-43370-2183720552/socket","Net":"unix"}}
14
+ {"time":"2025-12-24T02:30:22.730848261Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
15
+ {"time":"2025-12-24T02:30:22.730865711Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
16
+ {"time":"2025-12-24T02:30:22.730879627Z","level":"INFO","msg":"server is closed"}
wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug-internal.log ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-23T14:50:19.157923178Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-23T14:50:19.452689851Z","level":"INFO","msg":"stream: created new stream","id":"ogu6y2pr"}
3
+ {"time":"2025-12-23T14:50:19.452982231Z","level":"INFO","msg":"handler: started","stream_id":"ogu6y2pr"}
4
+ {"time":"2025-12-23T14:50:19.453108747Z","level":"INFO","msg":"stream: started","id":"ogu6y2pr"}
5
+ {"time":"2025-12-23T14:50:19.453203475Z","level":"INFO","msg":"writer: started","stream_id":"ogu6y2pr"}
6
+ {"time":"2025-12-23T14:50:19.453240834Z","level":"INFO","msg":"sender: started","stream_id":"ogu6y2pr"}
7
+ {"time":"2025-12-23T15:01:38.571944052Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
8
+ {"time":"2025-12-23T15:01:38.572286Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
9
+ {"time":"2025-12-23T15:12:36.396153575Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
10
+ {"time":"2025-12-23T15:12:36.396527132Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
11
+ {"time":"2025-12-23T15:23:29.324008023Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
12
+ {"time":"2025-12-23T15:23:29.324352406Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
13
+ {"time":"2025-12-23T15:34:29.472612198Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
14
+ {"time":"2025-12-23T15:34:29.47297197Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
15
+ {"time":"2025-12-23T15:45:30.426297295Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
16
+ {"time":"2025-12-23T15:45:30.426633692Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
17
+ {"time":"2025-12-23T15:56:27.157101003Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
18
+ {"time":"2025-12-23T15:56:27.157277527Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
19
+ {"time":"2025-12-23T16:07:25.783471884Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
20
+ {"time":"2025-12-23T16:07:25.78382365Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
21
+ {"time":"2025-12-23T16:18:28.498810989Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
22
+ {"time":"2025-12-23T16:18:28.499138715Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
23
+ {"time":"2025-12-23T16:29:31.444349128Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
24
+ {"time":"2025-12-23T16:29:31.444726336Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
25
+ {"time":"2025-12-23T16:40:32.834491152Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
26
+ {"time":"2025-12-23T16:40:32.834850242Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
27
+ {"time":"2025-12-23T16:51:29.42362482Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
28
+ {"time":"2025-12-23T16:51:29.423959343Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
29
+ {"time":"2025-12-23T17:02:30.793211082Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
30
+ {"time":"2025-12-23T17:02:30.793539822Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
31
+ {"time":"2025-12-23T17:13:33.060094819Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
32
+ {"time":"2025-12-23T17:13:33.060295815Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
33
+ {"time":"2025-12-23T17:24:32.958248966Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
34
+ {"time":"2025-12-23T17:24:32.958591544Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
35
+ {"time":"2025-12-23T17:35:34.024128754Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
36
+ {"time":"2025-12-23T17:35:34.024472732Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
37
+ {"time":"2025-12-23T17:46:30.37232901Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
38
+ {"time":"2025-12-23T17:46:30.372775091Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
39
+ {"time":"2025-12-23T17:57:32.302526996Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
40
+ {"time":"2025-12-23T17:57:32.30288267Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
41
+ {"time":"2025-12-23T18:08:31.048703971Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
42
+ {"time":"2025-12-23T18:08:31.049055711Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
43
+ {"time":"2025-12-23T18:19:30.830781514Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
44
+ {"time":"2025-12-23T18:19:30.831097265Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
45
+ {"time":"2025-12-23T18:30:33.803044335Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
46
+ {"time":"2025-12-23T18:30:33.803369724Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
47
+ {"time":"2025-12-23T18:41:30.279364466Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
48
+ {"time":"2025-12-23T18:41:30.279584052Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
49
+ {"time":"2025-12-23T18:52:31.826751928Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
50
+ {"time":"2025-12-23T18:52:31.827089624Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
51
+ {"time":"2025-12-23T19:03:30.598572678Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
52
+ {"time":"2025-12-23T19:03:30.598924055Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
53
+ {"time":"2025-12-23T19:14:32.616037222Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
54
+ {"time":"2025-12-23T19:14:32.616197778Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
55
+ {"time":"2025-12-23T19:25:32.273259098Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
56
+ {"time":"2025-12-23T19:25:32.273602938Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
57
+ {"time":"2025-12-23T19:36:32.889615697Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
58
+ {"time":"2025-12-23T19:36:32.889787719Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
59
+ {"time":"2025-12-23T19:47:37.77571508Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
60
+ {"time":"2025-12-23T19:47:37.776047293Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
61
+ {"time":"2025-12-23T19:58:39.630164384Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
62
+ {"time":"2025-12-23T19:58:39.630513085Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
63
+ {"time":"2025-12-23T20:09:41.513708638Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
64
+ {"time":"2025-12-23T20:09:41.51405603Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
65
+ {"time":"2025-12-23T20:20:43.088173421Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
66
+ {"time":"2025-12-23T20:20:43.088393383Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
67
+ {"time":"2025-12-23T20:31:43.221503093Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
68
+ {"time":"2025-12-23T20:31:43.221686078Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
69
+ {"time":"2025-12-23T20:42:41.052656025Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
70
+ {"time":"2025-12-23T20:42:41.052839564Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
71
+ {"time":"2025-12-23T20:53:42.488501769Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
72
+ {"time":"2025-12-23T20:53:42.488862537Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
73
+ {"time":"2025-12-23T21:04:41.351069333Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
74
+ {"time":"2025-12-23T21:04:41.35141939Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
75
+ {"time":"2025-12-23T21:15:38.340545121Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
76
+ {"time":"2025-12-23T21:15:38.340904893Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
77
+ {"time":"2025-12-23T21:26:37.510508084Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
78
+ {"time":"2025-12-23T21:26:37.510875914Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
79
+ {"time":"2025-12-23T21:37:36.292525017Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
80
+ {"time":"2025-12-23T21:37:36.292900087Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
81
+ {"time":"2025-12-23T21:48:34.770970878Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
82
+ {"time":"2025-12-23T21:48:34.771291232Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
83
+ {"time":"2025-12-23T21:59:33.737624435Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
84
+ {"time":"2025-12-23T21:59:33.737981585Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
85
+ {"time":"2025-12-23T22:10:39.042583666Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
86
+ {"time":"2025-12-23T22:10:39.042916749Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
87
+ {"time":"2025-12-23T22:21:41.112891247Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
88
+ {"time":"2025-12-23T22:21:41.11321011Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
89
+ {"time":"2025-12-23T22:32:40.580325627Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
90
+ {"time":"2025-12-23T22:32:40.580465985Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
91
+ {"time":"2025-12-23T22:43:36.940776917Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
92
+ {"time":"2025-12-23T22:43:36.941123182Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
93
+ {"time":"2025-12-23T22:54:33.373651394Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
94
+ {"time":"2025-12-23T22:54:33.37402137Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
95
+ {"time":"2025-12-23T23:05:35.325478747Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
96
+ {"time":"2025-12-23T23:05:35.325854894Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
97
+ {"time":"2025-12-23T23:16:36.760827184Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
98
+ {"time":"2025-12-23T23:16:36.760987996Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
99
+ {"time":"2025-12-23T23:27:34.549769482Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
100
+ {"time":"2025-12-23T23:27:34.550122142Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
101
+ {"time":"2025-12-23T23:38:34.786053357Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
102
+ {"time":"2025-12-23T23:38:34.786371337Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
103
+ {"time":"2025-12-23T23:49:33.035575592Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
104
+ {"time":"2025-12-23T23:49:33.035971484Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
105
+ {"time":"2025-12-24T00:00:31.990522355Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
106
+ {"time":"2025-12-24T00:00:31.990721359Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
107
+ {"time":"2025-12-24T00:11:34.140353801Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
108
+ {"time":"2025-12-24T00:11:34.140717429Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
109
+ {"time":"2025-12-24T00:22:34.75725217Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
110
+ {"time":"2025-12-24T00:22:34.757560563Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
111
+ {"time":"2025-12-24T00:33:30.609082628Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
112
+ {"time":"2025-12-24T00:33:30.609405858Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
113
+ {"time":"2025-12-24T00:44:24.763501577Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
114
+ {"time":"2025-12-24T00:44:24.763833284Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
115
+ {"time":"2025-12-24T00:55:22.589601724Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
116
+ {"time":"2025-12-24T00:55:22.589953341Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
117
+ {"time":"2025-12-24T01:06:21.284476721Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
118
+ {"time":"2025-12-24T01:06:21.28472507Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
119
+ {"time":"2025-12-24T01:17:23.427057704Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
120
+ {"time":"2025-12-24T01:17:23.427909522Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
121
+ {"time":"2025-12-24T01:28:21.205072065Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
122
+ {"time":"2025-12-24T01:28:21.205400369Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
123
+ {"time":"2025-12-24T01:39:23.227311862Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
124
+ {"time":"2025-12-24T01:39:23.227635463Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
125
+ {"time":"2025-12-24T01:50:21.711610423Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
126
+ {"time":"2025-12-24T01:50:21.711992483Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
127
+ {"time":"2025-12-24T02:01:26.168432549Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
128
+ {"time":"2025-12-24T02:01:26.169382559Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
129
+ {"time":"2025-12-24T02:12:27.119904169Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
130
+ {"time":"2025-12-24T02:12:27.12021425Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
131
+ {"time":"2025-12-24T02:23:29.280347891Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
132
+ {"time":"2025-12-24T02:23:29.280692788Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
133
+ {"time":"2025-12-24T02:30:17.524116078Z","level":"ERROR","msg":"HTTP error","status":400,"method":"POST","url":"https://api.wandb.ai/graphql"}
134
+ {"time":"2025-12-24T02:30:17.524323977Z","level":"ERROR","msg":"sender: failed to log artifact: ArtifactSaver.createArtifact: returned error 400: {\"data\":{\"createArtifact\":null},\"errors\":[{\"message\":\"Invalid Client ID digest\",\"path\":[\"createArtifact\"]}]}","artifactID":""}
135
+ {"time":"2025-12-24T02:30:17.911709787Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
136
+ {"time":"2025-12-24T02:30:18.150633972Z","level":"INFO","msg":"handler: operation stats","stats":{}}
137
+ {"time":"2025-12-24T02:30:18.155591363Z","level":"INFO","msg":"stream: closing","id":"ogu6y2pr"}
138
+ {"time":"2025-12-24T02:30:18.1556065Z","level":"INFO","msg":"handler: closed","stream_id":"ogu6y2pr"}
139
+ {"time":"2025-12-24T02:30:18.15573719Z","level":"INFO","msg":"sender: closed","stream_id":"ogu6y2pr"}
140
+ {"time":"2025-12-24T02:30:18.15575783Z","level":"INFO","msg":"stream: closed","id":"ogu6y2pr"}
wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Configure stats pid to 40598
3
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Loading settings from /workspace/halcyon-recipe2/wandb/settings
5
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug.log
7
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/wandb/wandb/run-20251223_145018-ogu6y2pr/logs/debug-internal.log
8
+ 2025-12-23 14:50:18,907 INFO MainThread:40598 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-23 14:50:18,908 INFO MainThread:40598 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 28, 'encoder_num_layers': 28, 'decoder_num_layers': None, 'hidden_size': 1024, 'ffn_hidden_size': 3072, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 8, 'max_position_embeddings': 32768, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 1000000, 'rotary_percent': 1.0, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': False, 'multi_latent_attention': False, 'mtp_num_layers': None, 'mtp_loss_scaling_factor': 0.1, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 2, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': 'full', 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': 'uniform', 'recompute_num_layers': 1, 'recompute_modules': ['core_attn'], 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': True, 'train_sync_interval': None, 'train_iters': 6350, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': '/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727/runs', 'masked_softmax_fusion': True, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'use_fused_weighted_squared_relu': False, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'rope_type': None, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': False, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'cyclic', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': False, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 42, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'embedding_init_method_std': None, 'init_method_xavier_uniform': False, 'lr': 0.0001, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 'lr_wsd_decay_iters': None, 'lr_warmup_fraction': 0.05, 'lr_warmup_iters': 0, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 3e-06, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': '/workspace/halcyon-recipe2/megatron_output/patch/v3-20251223-144727', 'save_interval': 100, 'save_retain_interval': None, 'no_save_optim': None, 'no_save_rng': None, 'load': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT', 'no_load_optim': None, 'load_main_params_from_ckpt': None, 'no_load_rng': None, 'strict_fsdp_dtensor_load': True, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': True, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': False, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': True, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': True, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': None, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 300000, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': True, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': True, 'nccl_ub': False, 'use_sharp': False, 'sharp_enabled_group': None, 'use_megatron_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache': False, 'enable_full_sharding_in_hsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'full_validation': False, 'multiple_validation_sets': False, 'eval_iters': -1, 'eval_interval': 100, 'test_mode': False, 'skip_train': False, 'data_path': None, 'split': None, 'train_data_path': None, 'valid_data_path': None, 'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 16384, 'encoder_seq_length': 16384, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 32, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': False, 'create_attention_mask_in_dataloader': True, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': None, 'padded_vocab_size': 151936, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': None, 'tokenizer_model': None, 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 1, 'expert_tensor_parallel_size': 1, 'num_experts': None, 'moe_layer_freq': 1, 'moe_ffn_hidden_size': None, 'moe_shared_expert_intermediate_size': None, 'moe_shared_expert_overlap': False, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'moe_router_fusion': False, 'moe_router_score_function': 'softmax', 'moe_router_topk': 2, 'moe_router_pre_softmax': False, 'moe_router_num_groups': None, 'moe_router_group_topk': None, 'moe_router_topk_scaling_factor': None, 'moe_router_enable_expert_bias': False, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': None, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': None, 'moe_pad_expert_input_to_capacity': False, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'overlap_moe_expert_parallel_comm': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 1.0, 'mscale': 1.0, 'mscale_all_dim': 0.0, 'cache_mla_latents': False, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': False, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 50, 'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': False, 'wandb_project': 'plt', 'wandb_exp_name': 'plt_1', 'wandb_save_dir': '', 'logging_level': 20, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'inference_dynamic_batching_num_cuda_graphs': 16, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1024, 'fp8_amax_compute_algo': 'max', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': True, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'validate_results', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 8, 'model_dir': '/workspace/.hf_home/hub/models--Qwen--Qwen3-0.6B-Base/snapshots/da87bfb608c14b7cf20ba1ce41287e8de496c0cd', 'is_multimodal': False, 'hf_model_type': 'qwen3_plt', 'use_ray': False, 'ray_exp_name': None, 'device_groups': None, 'model': 'Qwen/Qwen3-0.6B-Base', 'model_type': 'qwen3_plt', 'model_revision': None, 'task_type': 'causal_lm', 'torch_dtype': torch.bfloat16, 'attn_impl': None, 'new_special_tokens': [], 'num_labels': None, 'problem_type': None, 'rope_scaling': None, 'device_map': None, 'max_memory': {}, 'max_model_len': None, 'local_repo_path': None, 'init_strategy': None, 'template': 'qwen3', 'system': None, 'max_length': 16384, 'truncation_strategy': 'right', 'max_pixels': None, 'agent_template': None, 'norm_bbox': None, 'use_chat_template': False, 'padding_free': True, 'padding_side': 'right', 'sequence_parallel_size': 1, 'response_prefix': None, 'template_backend': 'swift', 'dataset': [], 'val_dataset': [], 'cached_dataset': ['/workspace/2of3'], 'cached_val_dataset': [], 'split_dataset_ratio': 0.0, 'data_seed': 42, 'dataset_num_proc': 32, 'load_from_cache_file': False, 'dataset_shuffle': True, 'val_dataset_shuffle': False, 'streaming': False, 'interleave_prob': None, 'stopping_strategy': 'first_exhausted', 'shuffle_buffer_size': 1000, 'download_mode': 'reuse_dataset_if_exists', 'columns': {}, 'strict': False, 'remove_unused_columns': True, 'model_name': None, 'model_author': None, 'custom_dataset_info': [], 'quant_method': None, 'quant_bits': None, 'hqq_axis': None, 'bnb_4bit_compute_dtype': torch.bfloat16, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_quant_storage': None, 'max_new_tokens': None, 'temperature': None, 'top_k': 50, 'top_p': 0.9, 'repetition_penalty': 1.0, 'num_beams': 1, 'stream': False, 'stop_words': [], 'logprobs': False, 'top_logprobs': None, 'ckpt_dir': '/workspace/halcyon-recipe2/Qwen3-0.6B-Base_PLT', 'lora_modules': [], 'tuner_backend': 'peft', 'train_type': 'full', 'adapters': [], 'external_plugins': [], 'model_kwargs': {}, 'load_args': False, 'load_data_args': False, 'packing': True, 'packing_length': 16384, 'packing_num_proc': 1, 'lazy_tokenize': False, 'custom_register_path': ['custom_model/custom_register.py'], 'use_hf': True, 'hub_token': None, 'ddp_timeout': 18000000, 'ddp_backend': None, 'ignore_args_error': False, 'use_swift_lora': False, 'freeze_llm': False, 'freeze_vit': True, 'freeze_aligner': True, 'freeze_parameters': [], 'freeze_parameters_regex': None, 'freeze_parameters_ratio': 0.0, 'trainable_parameters': [], 'trainable_parameters_regex': None, 'adapter_load': None, 'target_modules': ['all-linear'], 'target_regex': None, 'modules_to_save': [], 'lora_rank': 8, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_bias': 'none', 'lora_dtype': None, 'use_rslora': False, 'rlhf_type': None, 'ref_load': None, 'ref_adapter_load': None, 'beta': 0.1, 'rpo_alpha': None, 'reference_free': False, 'label_smoothing': 0.0, 'f_divergence_type': 'reverse_kl', 'loss_type': None, 'desirable_weight': 1.0, 'undesirable_weight': 1.0, 'calculate_KL': None, 'center_rewards_coefficient': None, 'generation_batch_size': None, 'steps_per_generation': None, 'num_generations': 8, 'max_completion_length': 512, 'importance_sampling_level': 'token', 'tau_pos': 1.0, 'tau_neg': 1.05, 'epsilon': 0.2, 'epsilon_high': None, 'delta': None, 'use_vllm': True, 'vllm_mode': None, 'vllm_enable_prefix_caching': True, 'vllm_gpu_memory_utilization': 0.9, 'vllm_tensor_parallel_size': 1, 'vllm_max_model_len': None, 'vllm_enforce_eager': False, 'vllm_limit_mm_per_prompt': None, 'vllm_disable_cascade_attn': False, 'vllm_max_num_seqs': None, 'vllm_mm_processor_cache_gb': None, 'vllm_engine_kwargs': None, 'sleep_level': 0, 'offload_optimizer': False, 'offload_model': False, 'offload_bridge': False, 'vllm_server_base_url': None, 'vllm_server_host': None, 'vllm_server_port': [8000], 'vllm_server_timeout': 240.0, 'vllm_server_group_port': None, 'reward_funcs': [], 'reward_weights': None, 'cosine_min_len_value_wrong': -0.5, 'cosine_max_len_value_wrong': 0.0, 'cosine_min_len_value_correct': 1.0, 'cosine_max_len_value_correct': 0.5, 'cosine_max_len': None, 'repetition_n_grams': 3, 'repetition_max_penalty': -1.0, 'soft_max_length': None, 'soft_cache_length': None, 'dynamic_sample': False, 'max_resample_times': 3, 'overlong_filter': False, 'scale_rewards': 'group', 'advantage_estimator': 'grpo', 'kl_in_reward': False, 'wandb_log_unique_prompts': None, 'log_completions': False, 'rollout_importance_sampling_mode': None, 'rollout_importance_sampling_threshold': 2.0, 'log_rollout_offpolicy_metrics': False, 'off_policy_sequence_mask_delta': None, 'reward_model': None, 'reward_model_plugin': None, 'sync_ref_model': False, 'ref_model_sync_steps': 512, 'ref_model_mixup_alpha': 0.6, 'async_generate': False, 'move_model_batches': None, 'multi_turn_scheduler': None, 'max_turns': None, 'completion_length_limit_scope': 'per_round', 'vllm_server_pass_dataset': False, 'log_entropy': False, 'top_entropy_quantile': 1.0, 'num_iterations': 1, 'check_model': True, 'initialize_embedding': False, 'mlp_padding_free': False, 'load_safetensors': False, 'save_safetensors': False, 'ref_model': None, 'ref_adapters': [], 'merge_lora': False, 'max_shard_size': '5GB', 'train_dataloader_shuffle': True, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': True, 'dataloader_prefetch_factor': 10, 'architectures': 'Qwen3ForCausalLM', 'llm_architectures': 'Qwen3ForCausalLM', 'max_epochs': 1, 'enable_dft_loss': False, 'enable_channel_loss': False, 'patch_size': 4, 'save_strategy': 'steps', 'original_max_position_embeddings': None, 'partial_rotary_factor': None, 'use_shared_expert_gate': False, 'vit_gradient_checkpointing': True, 'vit_lr': None, 'aligner_lr': None, 'gradient_checkpointing_kwargs': None, 'linear_num_value_heads': None, 'linear_num_key_heads': None, 'linear_key_head_dim': None, 'linear_value_head_dim': None, 'linear_conv_kernel_dim': None, 'layer_types': None, 'mrope_interleaved': False, 'add_version': True, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, '_wandb': {}}
11
+ 2025-12-23 14:50:18,908 INFO MainThread:40598 [wandb_init.py:init():889] starting backend
12
+ 2025-12-23 14:50:19,151 INFO MainThread:40598 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-23 14:50:19,154 INFO MainThread:40598 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-23 14:50:19,157 INFO MainThread:40598 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-23 14:50:19,163 INFO MainThread:40598 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-23 14:50:19,679 INFO MainThread:40598 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-23 14:50:19,771 INFO MainThread:40598 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-23 14:50:19,775 INFO MainThread:40598 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-24 02:30:17,368 INFO MainThread:40598 [wandb_run.py:_finish():2287] finishing run tepic/plt/ogu6y2pr
23
+ 2025-12-24 02:30:17,369 INFO MainThread:40598 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
24
+ 2025-12-24 02:30:17,369 INFO MainThread:40598 [wandb_run.py:_restore():2468] restore
25
+ 2025-12-24 02:30:17,369 INFO MainThread:40598 [wandb_run.py:_restore():2474] restore done
26
+ 2025-12-24 02:30:18,154 INFO MainThread:40598 [wandb_run.py:_footer_sync_info():3862] logging synced files
wandb/wandb/run-20251223_145018-ogu6y2pr/run-ogu6y2pr.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1acb2ccfb0ccb9fef1cf1852986b85de853724bd3b3d05dc491971340f10a04e
3
+ size 16674565