MegaTronX committed on
Commit
646fb97
·
verified ·
1 Parent(s): efe5ed1

Upload flux_train_utils.py

Browse files
Files changed (1) hide show
  1. library/flux_train_utils.py +619 -0
library/flux_train_utils.py ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import math
3
+ import os
4
+ import numpy as np
5
+ import toml
6
+ import json
7
+ import time
8
+ from typing import Callable, Dict, List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from accelerate import Accelerator, PartialState
12
+ from transformers import CLIPTextModel
13
+ from tqdm import tqdm
14
+ from PIL import Image
15
+ from safetensors.torch import save_file
16
+
17
+ from library import flux_models, flux_utils, strategy_base, train_util
18
+ from library.device_utils import init_ipex, clean_memory_on_device
19
+
20
+ init_ipex()
21
+
22
+ from .utils import setup_logging, mem_eff_save_file
23
+
24
+ setup_logging()
25
+ import logging
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ # region sample images
31
+
32
+
33
def sample_images(
    accelerator: Accelerator,
    args: argparse.Namespace,
    epoch,
    steps,
    flux,
    ae,
    text_encoders,
    sample_prompts_te_outputs,
    prompt_replacement=None,
    controlnet=None
):
    """Generate sample images for all configured prompts when sampling is due.

    Sampling happens at step 0 only if ``args.sample_at_first`` is set. Otherwise
    it is driven by ``args.sample_every_n_epochs`` (which takes precedence) or
    ``args.sample_every_n_steps``. Prompts are loaded from ``args.sample_prompts``
    and images are written to ``<args.output_dir>/sample``. With more than one
    process the prompts are split round-robin between the processes.

    The torch CPU RNG state (and the CUDA RNG state, when it can be read) is saved
    before sampling and restored afterwards.
    """
    # --- decide whether this call should sample at all ---
    if steps == 0:
        if not args.sample_at_first:
            return
    elif args.sample_every_n_epochs is not None:
        # epoch-based sampling wins; sample_every_n_steps is ignored
        if epoch is None or epoch % args.sample_every_n_epochs != 0:
            return
    elif args.sample_every_n_steps is None:
        # neither interval is configured
        return
    elif steps % args.sample_every_n_steps != 0 or epoch is not None:
        # steps is not divisible, or this is an end-of-epoch call
        return

    logger.info("")
    logger.info(f"generating sample images at step / サンプル画像生成 ステップ: {steps}")
    if not os.path.isfile(args.sample_prompts) and sample_prompts_te_outputs is None:
        logger.error(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}")
        return

    # for multi gpu distributed inference. this is a singleton, so it's safe to use it here
    distributed_state = PartialState()

    # unwrap the models that accelerate may have wrapped
    flux = accelerator.unwrap_model(flux)
    if text_encoders is not None:
        text_encoders = [accelerator.unwrap_model(encoder) for encoder in text_encoders]
    if controlnet is not None:
        controlnet = accelerator.unwrap_model(controlnet)

    prompts = train_util.load_prompts(args.sample_prompts)

    save_dir = args.output_dir + "/sample"
    os.makedirs(save_dir, exist_ok=True)

    # remember the RNG state so that sampling does not disturb training randomness
    rng_state = torch.get_rng_state()
    cuda_rng_state = None
    try:
        if torch.cuda.is_available():
            cuda_rng_state = torch.cuda.get_rng_state()
    except Exception:
        pass

    def render(prompt_dict):
        # one image for one prompt entry
        sample_image_inference(
            accelerator,
            args,
            flux,
            text_encoders,
            ae,
            save_dir,
            prompt_dict,
            epoch,
            steps,
            sample_prompts_te_outputs,
            prompt_replacement,
            controlnet
        )

    num_processes = distributed_state.num_processes
    if num_processes <= 1:
        # Single process: just walk the original prompt list in order.
        with torch.no_grad(), accelerator.autocast():
            for prompt_dict in prompts:
                render(prompt_dict)
    else:
        # One list of prompt dicts per process, assigned round-robin in process order so that
        # image creation roughly follows the enum order. Probably only works when steps and
        # sampler are identical.
        per_process_prompts = [prompts[rank::num_processes] for rank in range(num_processes)]

        with torch.no_grad():
            with distributed_state.split_between_processes(per_process_prompts) as prompt_dict_lists:
                for prompt_dict in prompt_dict_lists[0]:
                    render(prompt_dict)

    # restore the RNG state captured above
    torch.set_rng_state(rng_state)
    if cuda_rng_state is not None:
        torch.cuda.set_rng_state(cuda_rng_state)

    clean_memory_on_device(accelerator.device)
136
+
137
+
138
def sample_image_inference(
    accelerator: Accelerator,
    args: argparse.Namespace,
    flux: flux_models.Flux,
    text_encoders: Optional[List[CLIPTextModel]],
    ae: flux_models.AutoEncoder,
    save_dir,
    prompt_dict,
    epoch,
    steps,
    sample_prompts_te_outputs,
    prompt_replacement,
    controlnet
):
    """Generate, save and optionally log one sample image for a single prompt entry.

    ``prompt_dict`` supplies the prompt text and optional ``sample_steps``, ``width``,
    ``height``, ``scale``, ``seed`` and ``controlnet_image`` settings, plus the
    ``enum`` index used in the output file name. Text conditioning comes from
    ``sample_prompts_te_outputs`` (keyed by prompt text) and/or from encoding the
    prompt with ``text_encoders``. Freshly encoded entries that are not ``None``
    overwrite the cached ones in place.

    The PNG is written into ``save_dir``. If a "wandb" tracker is registered on the
    accelerator, the image is also logged there without committing the step.
    """
    assert isinstance(prompt_dict, dict)

    # per-prompt settings, falling back to defaults for anything not given
    prompt: str = prompt_dict.get("prompt", "")
    seed = prompt_dict.get("seed")
    scale = prompt_dict.get("scale", 3.5)
    sample_steps = prompt_dict.get("sample_steps", 20)
    width = prompt_dict.get("width", 512)
    height = prompt_dict.get("height", 512)
    controlnet_image = prompt_dict.get("controlnet_image")

    if prompt_replacement is not None:
        prompt = prompt.replace(prompt_replacement[0], prompt_replacement[1])

    if seed is None:
        # no seed requested: reseed for a truly random sample
        torch.seed()
        torch.cuda.seed()
    else:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    # round down to a multiple of 16, but never below 64
    height = max(64, height - height % 16)
    width = max(64, width - width % 16)

    logger.info(f"prompt: {prompt}")
    logger.info(f"height: {height}")
    logger.info(f"width: {width}")
    logger.info(f"sample_steps: {sample_steps}")
    logger.info(f"scale: {scale}")
    if seed is not None:
        logger.info(f"seed: {seed}")

    # --- text conditioning ---
    tokenize_strategy = strategy_base.TokenizeStrategy.get_strategy()
    encoding_strategy = strategy_base.TextEncodingStrategy.get_strategy()

    text_encoder_conds = []
    if sample_prompts_te_outputs and prompt in sample_prompts_te_outputs:
        text_encoder_conds = sample_prompts_te_outputs[prompt]
        print(f"Using cached text encoder outputs for prompt: {prompt}")
    if text_encoders is not None:
        print(f"Encoding prompt: {prompt}")
        tokens_and_masks = tokenize_strategy.tokenize(prompt)
        # strategy has apply_t5_attn_mask option
        fresh_conds = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, tokens_and_masks)

        if len(text_encoder_conds) == 0:
            # nothing cached: use the freshly encoded outputs as they are
            text_encoder_conds = fresh_conds
        else:
            # overwrite cached entries with whatever was actually encoded
            for idx, cond in enumerate(fresh_conds):
                if cond is not None:
                    text_encoder_conds[idx] = cond

    l_pooled, t5_out, txt_ids, t5_attn_mask = text_encoder_conds

    # --- sample latents ---
    weight_dtype = ae.dtype  # TODO give dtype as argument
    packed_latent_height = height // 16
    packed_latent_width = width // 16

    noise_generator = None
    if seed is not None:
        noise_generator = torch.Generator(device=accelerator.device).manual_seed(seed)
    noise = torch.randn(
        1,
        packed_latent_height * packed_latent_width,
        16 * 2 * 2,
        device=accelerator.device,
        dtype=weight_dtype,
        generator=noise_generator,
    )
    timesteps = get_schedule(sample_steps, noise.shape[1], shift=True)  # FLUX.1 dev -> shift=True
    img_ids = flux_utils.prepare_img_ids(1, packed_latent_height, packed_latent_width).to(accelerator.device, weight_dtype)
    if args.apply_t5_attn_mask:
        t5_attn_mask = t5_attn_mask.to(accelerator.device)
    else:
        t5_attn_mask = None

    if controlnet_image is not None:
        # load the conditioning image, match the output size and scale pixels to [-1, 1]
        controlnet_image = Image.open(controlnet_image).convert("RGB")
        controlnet_image = controlnet_image.resize((width, height), Image.LANCZOS)
        controlnet_image = torch.from_numpy((np.array(controlnet_image) / 127.5) - 1)
        controlnet_image = controlnet_image.permute(2, 0, 1).unsqueeze(0).to(weight_dtype).to(accelerator.device)

    with accelerator.autocast(), torch.no_grad():
        x = denoise(
            flux,
            noise,
            img_ids,
            t5_out,
            txt_ids,
            l_pooled,
            timesteps=timesteps,
            guidance=scale,
            t5_attn_mask=t5_attn_mask,
            controlnet=controlnet,
            controlnet_img=controlnet_image,
        )

    x = flux_utils.unpack_latents(x, packed_latent_height, packed_latent_width)

    # --- latent to image: move the AE to the compute device only while decoding ---
    clean_memory_on_device(accelerator.device)
    org_vae_device = ae.device  # presumably cpu at this point
    ae.to(accelerator.device)  # distributed_state.device is same as accelerator.device
    with accelerator.autocast(), torch.no_grad():
        x = ae.decode(x)
    ae.to(org_vae_device)
    clean_memory_on_device(accelerator.device)

    # [-1, 1] float, channels-first -> uint8, channels-last
    x = x.clamp(-1, 1)
    x = x.permute(0, 2, 3, 1)
    image = Image.fromarray((127.5 * (x + 1.0)).float().cpu().numpy().astype(np.uint8)[0])

    # adding accelerator.wait_for_everyone() here should sync up and ensure that sample images are
    # saved in the same order as the original prompt list, but adding 'enum' to the filename
    # should be enough

    ts_str = time.strftime("%Y%m%d%H%M%S", time.localtime())
    num_suffix = f"e{epoch:06d}" if epoch is not None else f"{steps:06d}"
    seed_suffix = "" if seed is None else f"_{seed}"
    i: int = prompt_dict["enum"]
    name_prefix = "" if args.output_name is None else args.output_name + "_"
    img_filename = f"{name_prefix}{num_suffix}_{i:02d}_{ts_str}{seed_suffix}.png"
    image.save(os.path.join(save_dir, img_filename))

    # send images to wandb if enabled
    if any(tracker.name == "wandb" for tracker in accelerator.trackers):
        wandb_tracker = accelerator.get_tracker("wandb")

        import wandb

        # not to commit images to avoid inconsistency between training and logging steps
        wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False)  # positive prompt as a caption
273
+
274
+
275
def time_shift(mu: float, sigma: float, t: torch.Tensor):
    """Return ``exp(mu) / (exp(mu) + (1 / t - 1) ** sigma)`` evaluated element-wise on ``t``."""
    exp_mu = math.exp(mu)
    return exp_mu / (exp_mu + (1 / t - 1) ** sigma)
277
+
278
+
279
def get_lin_function(x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15) -> Callable[[float], float]:
    """Return the straight line through ``(x1, y1)`` and ``(x2, y2)`` as a callable ``x -> y``."""
    slope = (y2 - y1) / (x2 - x1)
    intercept = y1 - slope * x1

    def line(x):
        return slope * x + intercept

    return line
283
+
284
+
285
def get_schedule(
    num_steps: int,
    image_seq_len: int,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
    shift: bool = True,
) -> list[float]:
    """Build the list of ``num_steps + 1`` sampling timesteps running from 1 down to 0.

    With ``shift`` enabled, the evenly spaced values are warped by ``time_shift``
    using a ``mu`` that is linearly interpolated from ``image_seq_len`` between
    ``base_shift`` and ``max_shift``.
    """
    # one extra entry so the schedule ends exactly at zero
    schedule = torch.linspace(1, 0, num_steps + 1)
    if not shift:
        return schedule.tolist()

    # shifting the schedule to favor high timesteps for higher signal images:
    # estimate mu by linear interpolation between two points
    mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
    return time_shift(mu, 1.0, schedule).tolist()
302
+
303
+
304
def denoise(
    model: flux_models.Flux,
    img: torch.Tensor,
    img_ids: torch.Tensor,
    txt: torch.Tensor,
    txt_ids: torch.Tensor,
    vec: torch.Tensor,
    timesteps: list[float],
    guidance: float = 4.0,
    t5_attn_mask: Optional[torch.Tensor] = None,
    controlnet: Optional[flux_models.ControlNetFlux] = None,
    controlnet_img: Optional[torch.Tensor] = None,
):
    """Run the sampling loop over ``timesteps`` and return the final ``img``.

    For each consecutive pair ``(t_curr, t_prev)`` the model prediction is applied
    as an explicit Euler update ``img += (t_prev - t_curr) * pred``. When a
    ``controlnet`` is given it is evaluated first and its block outputs are passed
    into the model. ``model.prepare_block_swap_before_forward()`` is called before
    every forward pass and once more after the loop.
    """
    # this is ignored for schnell
    guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)

    step_pairs = zip(tqdm(timesteps[:-1]), timesteps[1:])
    for t_curr, t_prev in step_pairs:
        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
        model.prepare_block_swap_before_forward()

        # without a controlnet there are no extra hidden states to inject
        block_samples = None
        block_single_samples = None
        if controlnet is not None:
            block_samples, block_single_samples = controlnet(
                img=img,
                img_ids=img_ids,
                controlnet_cond=controlnet_img,
                txt=txt,
                txt_ids=txt_ids,
                y=vec,
                timesteps=t_vec,
                guidance=guidance_vec,
                txt_attention_mask=t5_attn_mask,
            )

        pred = model(
            img=img,
            img_ids=img_ids,
            txt=txt,
            txt_ids=txt_ids,
            y=vec,
            block_controlnet_hidden_states=block_samples,
            block_controlnet_single_hidden_states=block_single_samples,
            timesteps=t_vec,
            guidance=guidance_vec,
            txt_attention_mask=t5_attn_mask,
        )

        step_size = t_prev - t_curr
        img = img + step_size * pred

    model.prepare_block_swap_before_forward()
    return img
356
+
357
+
358
+ # endregion
359
+
360
+
361
+ # region train
362
def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
    """Look up the scheduler sigma for each timestep and pad it with trailing unit dims.

    Every entry of ``timesteps`` must match exactly one entry of
    ``noise_scheduler.timesteps``. The result has one leading dimension, one
    element per timestep, followed by singleton dimensions up to ``n_dim``
    dimensions in total.
    """
    all_sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
    schedule_timesteps = noise_scheduler.timesteps.to(device)
    timesteps = timesteps.to(device)

    # position of each requested timestep inside the scheduler's table
    step_indices = []
    for t in timesteps:
        step_indices.append((schedule_timesteps == t).nonzero().item())

    sigma = all_sigmas[step_indices].flatten()
    # append singleton dimensions so the result broadcasts against an n_dim tensor
    trailing = (None,) * (n_dim - sigma.dim())
    return sigma[(slice(None),) + trailing]
372
+
373
+
374
def compute_density_for_timestep_sampling(
    weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
):
    """Draw ``batch_size`` values used to pick timesteps for SD3 training.

    ``"logit_normal"`` returns the sigmoid of normal samples with the given mean and
    std. ``"mode"`` reshapes uniform samples using ``mode_scale``. Any other scheme
    returns plain uniform samples. All sampling is done on the CPU.

    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
    """
    shape = (batch_size,)

    if weighting_scheme == "logit_normal":
        # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
        normal_draw = torch.normal(mean=logit_mean, std=logit_std, size=shape, device="cpu")
        return torch.sigmoid(normal_draw)

    u = torch.rand(size=shape, device="cpu")
    if weighting_scheme == "mode":
        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
    return u
393
+
394
+
395
def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
    """Return the per-sample loss weights for SD3 training.

    ``"sigma_sqrt"`` gives ``sigmas ** -2`` as float32, ``"cosmap"`` gives
    ``2 / (pi * (1 - 2 * sigmas + 2 * sigmas ** 2))``, and every other scheme gives
    a tensor of ones shaped like ``sigmas``.

    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.

    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
    """
    if weighting_scheme == "sigma_sqrt":
        return (sigmas**-2.0).float()
    if weighting_scheme == "cosmap":
        denominator = 1 - 2 * sigmas + 2 * sigmas**2
        return 2 / (math.pi * denominator)
    return torch.ones_like(sigmas)
410
+
411
+
412
def get_noisy_model_input_and_timesteps(
    args, noise_scheduler, latents, noise, device, dtype
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Sample training timesteps and blend ``latents`` with ``noise`` accordingly.

    ``args.timestep_sampling`` selects how the blend factor is drawn:

    * ``"uniform"``: uniform random in [0, 1).
    * ``"sigmoid"``: sigmoid of ``args.sigmoid_scale`` times a standard normal.
    * ``"shift"``: as ``"sigmoid"``, then shifted with ``args.discrete_flow_shift``.
    * ``"flux_shift"``: as ``"sigmoid"``, then warped by ``time_shift`` with a ``mu``
      derived from the latent height and width.
    * anything else (``"sigma"``): timesteps and sigmas are taken from
      ``noise_scheduler`` using ``compute_density_for_timestep_sampling``.

    Args:
        args: namespace providing ``timestep_sampling`` and the scheme-specific options.
        noise_scheduler: only used by the scheduler-based branch.
        latents: 4-dimensional tensor; the first dimension is the batch size.
        noise: tensor blended with ``latents``; must broadcast against it.
        device: device on which the random blend factors are created.
        dtype: dtype of the returned tensors.

    Returns:
        ``(noisy_model_input, timesteps, sigmas)``. ``sigmas`` is the blend factor
        used, shaped to broadcast against ``latents``. It is returned for every
        sampling method, so ``apply_model_prediction_type`` with ``"sigma_scaled"``
        no longer receives ``None`` for the non-scheduler methods.
    """
    bsz, _, h, w = latents.shape
    sampling = args.timestep_sampling

    if sampling in ("uniform", "sigmoid", "shift", "flux_shift"):
        if sampling == "uniform":
            # Simple random t-based noise sampling
            t = torch.rand((bsz,), device=device)
        else:
            # https://github.com/XLabs-AI/x-flux/tree/main
            # a larger sigmoid_scale gives more uniform sampling
            t = torch.sigmoid(args.sigmoid_scale * torch.randn((bsz,), device=device))
            if sampling == "shift":
                shift = args.discrete_flow_shift
                t = (t * shift) / (1 + (shift - 1) * t)
            elif sampling == "flux_shift":
                mu = get_lin_function(y1=0.5, y2=1.15)((h // 2) * (w // 2))
                t = time_shift(mu, 1.0, t)

        timesteps = t * 1000.0
        t = t.view(-1, 1, 1, 1)
        noisy_model_input = (1 - t) * latents + t * noise
        # The blend factor plays the role of sigma here (same formula as the scheduler
        # branch below), so expose it instead of returning None.
        sigmas = t.to(dtype)
    else:
        # Sample a random timestep for each image
        # for weighting schemes where we sample timesteps non-uniformly
        u = compute_density_for_timestep_sampling(
            weighting_scheme=args.weighting_scheme,
            batch_size=bsz,
            logit_mean=args.logit_mean,
            logit_std=args.logit_std,
            mode_scale=args.mode_scale,
        )
        indices = (u * noise_scheduler.config.num_train_timesteps).long()
        timesteps = noise_scheduler.timesteps[indices].to(device=device)

        # Add noise according to flow matching.
        sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
        noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents

    return noisy_model_input.to(dtype), timesteps.to(dtype), sigmas
467
+
468
+
469
def apply_model_prediction_type(args, model_pred, noisy_model_input, sigmas):
    """Post-process the model prediction according to ``args.model_prediction_type``.

    * ``"raw"`` (and any unrecognised value): prediction is returned unchanged.
    * ``"additive"``: ``noisy_model_input`` is added to the prediction.
    * ``"sigma_scaled"``: prediction is scaled by ``-sigmas`` and added to
      ``noisy_model_input``; a loss weighting is computed from ``args.weighting_scheme``.

    Returns:
        ``(model_pred, weighting)`` where ``weighting`` is ``None`` except for
        ``"sigma_scaled"``.
    """
    prediction_type = args.model_prediction_type

    if prediction_type == "additive":
        # add the model_pred to the noisy_model_input
        return model_pred + noisy_model_input, None

    if prediction_type == "sigma_scaled":
        # apply sigma scaling
        scaled_pred = model_pred * (-sigmas) + noisy_model_input

        # these weighting schemes use a uniform timestep sampling
        # and instead post-weight the loss
        loss_weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
        return scaled_pred, loss_weighting

    return model_pred, None
485
+
486
+
487
def save_models(
    ckpt_path: str,
    flux: flux_models.Flux,
    sai_metadata: Optional[dict],
    save_dtype: Optional[torch.dtype] = None,
    use_mem_eff_save: bool = False,
):
    """Write the state dict of ``flux`` to ``ckpt_path`` with ``sai_metadata`` attached.

    When ``save_dtype`` is given, every tensor whose dtype differs is copied to the
    CPU and converted; tensors that already match are stored as they are.
    ``use_mem_eff_save`` selects ``mem_eff_save_file`` instead of ``save_file``.
    """

    def as_saved(tensor):
        # convert a detached CPU copy only when a different dtype was requested
        if save_dtype is None or tensor.dtype == save_dtype:
            return tensor
        return tensor.detach().clone().to("cpu").to(save_dtype)

    state_dict = {name: as_saved(tensor) for name, tensor in flux.state_dict().items()}

    writer = mem_eff_save_file if use_mem_eff_save else save_file
    writer(state_dict, ckpt_path, metadata=sai_metadata)
509
+
510
+
511
def save_flux_model_on_train_end(
    args: argparse.Namespace, save_dtype: torch.dtype, epoch: int, global_step: int, flux: flux_models.Flux
):
    """Save the final FLUX checkpoint through ``train_util.save_sd_model_on_train_end_common``.

    The checkpoint itself is written by ``save_models`` with metadata obtained from
    ``train_util.get_sai_model_spec``.
    """

    def write_checkpoint(ckpt_file, epoch_no, global_step):
        # epoch_no / global_step are part of the callback signature but are not needed here
        save_models(
            ckpt_file,
            flux,
            train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev"),
            save_dtype,
            args.mem_eff_save,
        )

    train_util.save_sd_model_on_train_end_common(args, True, True, epoch, global_step, write_checkpoint, None)
519
+
520
+
521
# Saving at epoch end and saving after a step are handled by one function, because the
# metadata contains epoch/step and the arguments are the same in both cases.
# on_epoch_end: True when called at the end of an epoch, False when called after a step
def save_flux_model_on_epoch_end_or_stepwise(
    args: argparse.Namespace,
    on_epoch_end: bool,
    accelerator,
    save_dtype: torch.dtype,
    epoch: int,
    num_train_epochs: int,
    global_step: int,
    flux: flux_models.Flux,
):
    """Save an intermediate FLUX checkpoint via ``train_util.save_sd_model_on_epoch_end_or_stepwise_common``.

    The checkpoint itself is written by ``save_models`` with metadata obtained from
    ``train_util.get_sai_model_spec``.
    """

    def write_checkpoint(ckpt_file, epoch_no, global_step):
        # epoch_no / global_step are part of the callback signature but are not needed here
        save_models(
            ckpt_file,
            flux,
            train_util.get_sai_model_spec(None, args, False, False, False, is_stable_diffusion_ckpt=True, flux="dev"),
            save_dtype,
            args.mem_eff_save,
        )

    train_util.save_sd_model_on_epoch_end_or_stepwise_common(
        args, on_epoch_end, accelerator, True, True, epoch, num_train_epochs, global_step, write_checkpoint, None
    )
549
+
550
+
551
+ # endregion
552
+
553
+
554
def add_flux_train_arguments(parser: argparse.ArgumentParser):
    """Register the FLUX-specific training options on ``parser``.

    Adds model paths (``--clip_l``, ``--t5xxl``, ``--ae``,
    ``--controlnet_model_name_or_path``), T5 options (``--t5xxl_max_token_length``,
    ``--apply_t5_attn_mask``), ``--guidance_scale``, and the timestep sampling /
    prediction options (``--timestep_sampling``, ``--sigmoid_scale``,
    ``--model_prediction_type``, ``--discrete_flow_shift``).
    """
    # --- model files ---
    parser.add_argument(
        "--clip_l",
        type=str,
        help="path to clip_l (*.sft or *.safetensors), should be float16 / clip_lのパス(*.sftまたは*.safetensors)、float16が前提",
    )
    parser.add_argument(
        "--t5xxl",
        type=str,
        help="path to t5xxl (*.sft or *.safetensors), should be float16 / t5xxlのパス(*.sftまたは*.safetensors)、float16が前提",
    )
    parser.add_argument("--ae", type=str, help="path to ae (*.sft or *.safetensors) / aeのパス(*.sftまたは*.safetensors)")
    parser.add_argument(
        "--controlnet_model_name_or_path",
        type=str,
        default=None,
        help="path to controlnet (*.sft or *.safetensors) / controlnetのパス(*.sftまたは*.safetensors)",
    )

    # --- text encoder options ---
    parser.add_argument(
        "--t5xxl_max_token_length",
        type=int,
        default=None,
        help="maximum token length for T5-XXL. if omitted, 256 for schnell and 512 for dev"
        " / T5-XXLの最大トークン長。省略された場合、schnellの場合は256、devの場合は512",
    )
    parser.add_argument(
        "--apply_t5_attn_mask",
        action="store_true",
        help="apply attention mask to T5-XXL encode and FLUX double blocks / T5-XXLエンコードとFLUXダブルブロックにアテンションマスクを適用する",
    )

    parser.add_argument(
        "--guidance_scale",
        type=float,
        default=3.5,
        help="the FLUX.1 dev variant is a guidance distilled model",
    )

    # --- timestep sampling and prediction handling ---
    parser.add_argument(
        "--timestep_sampling",
        choices=["sigma", "uniform", "sigmoid", "shift", "flux_shift"],
        default="sigma",
        help="Method to sample timesteps: sigma-based, uniform random, sigmoid of random normal, shift of sigmoid and FLUX.1 shifting."
        " / タイムステップをサンプリングする方法:sigma、random uniform、random normalのsigmoid、sigmoidのシフト、FLUX.1のシフト。",
    )
    # The scale is applied before the sigmoid by the "sigmoid", "shift" and "flux_shift"
    # sampling methods, so the help text names all three.
    parser.add_argument(
        "--sigmoid_scale",
        type=float,
        default=1.0,
        help='Scale factor for sigmoid timestep sampling (used when timestep-sampling is "sigmoid", "shift" or "flux_shift").'
        ' / sigmoidタイムステップサンプリングの倍率(timestep-samplingが"sigmoid"、"shift"、"flux_shift"の場合に有効)。',
    )
    parser.add_argument(
        "--model_prediction_type",
        choices=["raw", "additive", "sigma_scaled"],
        default="sigma_scaled",
        help="How to interpret and process the model prediction: "
        "raw (use as is), additive (add to noisy input), sigma_scaled (apply sigma scaling)."
        " / モデル予測の解釈と処理方法:"
        "raw(そのまま使用)、additive(ノイズ入力に加算)、sigma_scaled(シグマスケーリングを適用)。",
    )
    parser.add_argument(
        "--discrete_flow_shift",
        type=float,
        default=3.0,
        help="Discrete flow shift for the Euler Discrete Scheduler, default is 3.0. / Euler Discrete Schedulerの離散フローシフト、デフォルトは3.0。",
    )