# Spaces: toninio19 / keysync-demo (Running)
# File: keysync-demo / interpolation.yaml
# Last commit: "add checkpoint download" by Antoni Bigata (13f5b7a, 6 months ago)
# Size: 4.71 kB

	model:
	target: sgm.models.diffusion.DiffusionEngine
	params:
	scale_factor: 0.18215
	disable_first_stage_autocast: True
	ckpt_path:

	denoiser_config:
	target: sgm.modules.diffusionmodules.denoiser.DenoiserDub
	params:
	scaling_config:
	target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

	network_wrapper:
	target: sgm.modules.diffusionmodules.wrappers.InterpolationWrapper
	params:
	im_size: [512, 512] # USER: adapt this to your dataset
	n_channels: 4
	starting_mask_method: zeros
	add_mask: True

	network_config:
	target: sgm.modules.diffusionmodules.video_model.VideoUNet
	params:
	adm_in_channels: 0
	num_classes: sequential
	use_checkpoint: True
	in_channels: 9
	out_channels: 4
	model_channels: 320
	attention_resolutions: [4, 2, 1]
	num_res_blocks: 2
	channel_mult: [1, 2, 4, 4]
	num_head_channels: 64
	use_linear_in_transformer: True
	transformer_depth: 1
	context_dim: 1024
	spatial_transformer_attn_type: softmax-xformers
	extra_ff_mix_layer: True
	use_spatial_context: True
	merge_strategy: learned_with_images
	video_kernel_size: [3, 1, 1]
	fine_tuning_method: null
	audio_cond_method: both_keyframes
	additional_audio_frames: 0
	audio_dim: 1024
	unfreeze_blocks: ["input"]

	conditioner_config:
	target: sgm.modules.GeneralConditioner
	params:
	emb_models:

	- input_key: cond_frames
	is_trainable: False
	target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
	params:
	disable_encoder_autocast: True
	n_cond_frames: 2
	n_copies: 1
	is_ae: True
	encoder_config:
	target: sgm.models.autoencoder.AutoencoderKLModeOnly
	params:
	embed_dim: 4
	monitor: val/rec_loss
	ddconfig:
	attn_type: vanilla-xformers
	double_z: True
	z_channels: 4
	resolution: 256
	in_channels: 3
	out_ch: 3
	ch: 128
	ch_mult: [1, 2, 4, 4]
	num_res_blocks: 2
	attn_resolutions: []
	dropout: 0.0
	lossconfig:
	target: torch.nn.Identity

	- input_key: gt # allows to use the ground truth as a condition
	is_trainable: False
	target: sgm.modules.encoders.modules.IdentityEncoder
	params:
	cond_type: gt

	- input_key: audio_emb
	is_trainable: True
	target: sgm.modules.encoders.modules.WhisperAudioEmbedder
	params:
	merge_method: mean
	linear_dim: 1024
	cond_type: crossattn
	audio_dim: 768

	- input_key: masks
	is_trainable: False
	target: sgm.modules.encoders.modules.IdentityEncoder
	params:
	cond_type: masks

	first_stage_config:
	target: sgm.models.autoencoder.AutoencodingEngine
	params:
	loss_config:
	target: torch.nn.Identity
	regularizer_config:
	target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
	encoder_config:
	target: sgm.modules.diffusionmodules.model.Encoder
	params:
	attn_type: vanilla
	double_z: True
	z_channels: 4
	resolution: 256
	in_channels: 3
	out_ch: 3
	ch: 128
	ch_mult: [1, 2, 4, 4]
	num_res_blocks: 2
	attn_resolutions: []
	dropout: 0.0
	decoder_config:
	target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
	params:
	attn_type: vanilla
	double_z: True
	z_channels: 4
	resolution: 256
	in_channels: 3
	out_ch: 3
	ch: 128
	ch_mult: [1, 2, 4, 4]
	num_res_blocks: 2
	attn_resolutions: []
	dropout: 0.0
	video_kernel_size: [3, 1, 1]

	sampler_config:
	target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
	params:
	num_steps: 10
	discretization_config:
	target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization

	guider_config:
	target: sgm.modules.diffusionmodules.guiders.AudioRefMultiCondGuider
	params:
	audio_ratio: 5.0
	ref_ratio: 2.0