Fabrice-TIERCELIN committed (verified)
Commit 7d1e42c · 1 Parent(s): c812274

Useless test folder

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. diffusers/tests/__init__.py +0 -0
  2. diffusers/tests/conftest.py +0 -44
  3. diffusers/tests/fixtures/custom_pipeline/pipeline.py +0 -101
  4. diffusers/tests/fixtures/custom_pipeline/what_ever.py +0 -101
  5. diffusers/tests/fixtures/elise_format0.mid +0 -0
  6. diffusers/tests/models/__init__.py +0 -0
  7. diffusers/tests/models/test_models_unet_1d.py +0 -284
  8. diffusers/tests/models/test_models_unet_2d.py +0 -297
  9. diffusers/tests/models/test_models_unet_2d_condition.py +0 -944
  10. diffusers/tests/models/test_models_unet_2d_flax.py +0 -104
  11. diffusers/tests/models/test_models_unet_3d_condition.py +0 -241
  12. diffusers/tests/models/test_models_vae.py +0 -345
  13. diffusers/tests/models/test_models_vae_flax.py +0 -39
  14. diffusers/tests/models/test_models_vq.py +0 -94
  15. diffusers/tests/pipeline_params.py +0 -121
  16. diffusers/tests/pipelines/__init__.py +0 -0
  17. diffusers/tests/pipelines/altdiffusion/__init__.py +0 -0
  18. diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py +0 -244
  19. diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +0 -299
  20. diffusers/tests/pipelines/audio_diffusion/__init__.py +0 -0
  21. diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py +0 -191
  22. diffusers/tests/pipelines/audioldm/__init__.py +0 -0
  23. diffusers/tests/pipelines/audioldm/test_audioldm.py +0 -416
  24. diffusers/tests/pipelines/dance_diffusion/__init__.py +0 -0
  25. diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py +0 -160
  26. diffusers/tests/pipelines/ddim/__init__.py +0 -0
  27. diffusers/tests/pipelines/ddim/test_ddim.py +0 -132
  28. diffusers/tests/pipelines/ddpm/__init__.py +0 -0
  29. diffusers/tests/pipelines/ddpm/test_ddpm.py +0 -111
  30. diffusers/tests/pipelines/dit/__init__.py +0 -0
  31. diffusers/tests/pipelines/dit/test_dit.py +0 -152
  32. diffusers/tests/pipelines/karras_ve/__init__.py +0 -0
  33. diffusers/tests/pipelines/karras_ve/test_karras_ve.py +0 -86
  34. diffusers/tests/pipelines/latent_diffusion/__init__.py +0 -0
  35. diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py +0 -202
  36. diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +0 -131
  37. diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +0 -116
  38. diffusers/tests/pipelines/paint_by_example/__init__.py +0 -0
  39. diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py +0 -210
  40. diffusers/tests/pipelines/pndm/__init__.py +0 -0
  41. diffusers/tests/pipelines/pndm/test_pndm.py +0 -87
  42. diffusers/tests/pipelines/repaint/__init__.py +0 -0
  43. diffusers/tests/pipelines/repaint/test_repaint.py +0 -162
  44. diffusers/tests/pipelines/score_sde_ve/__init__.py +0 -0
  45. diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py +0 -91
  46. diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py +0 -0
  47. diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +0 -601
  48. diffusers/tests/pipelines/spectrogram_diffusion/__init__.py +0 -0
  49. diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +0 -235
  50. diffusers/tests/pipelines/stable_diffusion/__init__.py +0 -0
diffusers/tests/__init__.py DELETED
File without changes
diffusers/tests/conftest.py DELETED
@@ -1,44 +0,0 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- # tests directory-specific settings - this file is run automatically
- # by pytest before any tests are run
-
- import sys
- import warnings
- from os.path import abspath, dirname, join
-
-
- # allow having multiple repository checkouts and not needing to remember to rerun
- # 'pip install -e .[dev]' when switching between checkouts and running tests.
- git_repo_path = abspath(join(dirname(dirname(__file__)), "src"))
- sys.path.insert(1, git_repo_path)
-
- # silence FutureWarning warnings in tests since often we can't act on them until
- # they become normal warnings - i.e. the tests still need to test the current functionality
- warnings.simplefilter(action="ignore", category=FutureWarning)
-
-
- def pytest_addoption(parser):
-     from diffusers.utils.testing_utils import pytest_addoption_shared
-
-     pytest_addoption_shared(parser)
-
-
- def pytest_terminal_summary(terminalreporter):
-     from diffusers.utils.testing_utils import pytest_terminal_summary_main
-
-     make_reports = terminalreporter.config.getoption("--make-reports")
-     if make_reports:
-         pytest_terminal_summary_main(terminalreporter, id=make_reports)

diffusers/tests/fixtures/custom_pipeline/pipeline.py DELETED
@@ -1,101 +0,0 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
-
- # limitations under the License.
-
-
- from typing import Optional, Tuple, Union
-
- import torch
-
- from diffusers import DiffusionPipeline, ImagePipelineOutput
-
-
- class CustomLocalPipeline(DiffusionPipeline):
-     r"""
-     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-     Parameters:
-         unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
-         scheduler ([`SchedulerMixin`]):
-             A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
-             [`DDPMScheduler`], or [`DDIMScheduler`].
-     """
-
-     def __init__(self, unet, scheduler):
-         super().__init__()
-         self.register_modules(unet=unet, scheduler=scheduler)
-
-     @torch.no_grad()
-     def __call__(
-         self,
-         batch_size: int = 1,
-         generator: Optional[torch.Generator] = None,
-         num_inference_steps: int = 50,
-         output_type: Optional[str] = "pil",
-         return_dict: bool = True,
-         **kwargs,
-     ) -> Union[ImagePipelineOutput, Tuple]:
-         r"""
-         Args:
-             batch_size (`int`, *optional*, defaults to 1):
-                 The number of images to generate.
-             generator (`torch.Generator`, *optional*):
-                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                 deterministic.
-             eta (`float`, *optional*, defaults to 0.0):
-                 The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
-             num_inference_steps (`int`, *optional*, defaults to 50):
-                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                 expense of slower inference.
-             output_type (`str`, *optional*, defaults to `"pil"`):
-                 The output format of the generate image. Choose between
-                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-             return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
-
-         Returns:
-             [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-             `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-             generated images.
-         """
-
-         # Sample gaussian noise to begin loop
-         image = torch.randn(
-             (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
-             generator=generator,
-         )
-         image = image.to(self.device)
-
-         # set step values
-         self.scheduler.set_timesteps(num_inference_steps)
-
-         for t in self.progress_bar(self.scheduler.timesteps):
-             # 1. predict noise model_output
-             model_output = self.unet(image, t).sample
-
-             # 2. predict previous mean of image x_t-1 and add variance depending on eta
-             # eta corresponds to η in paper and should be between [0, 1]
-             # do x_t -> x_t-1
-             image = self.scheduler.step(model_output, t, image).prev_sample
-
-         image = (image / 2 + 0.5).clamp(0, 1)
-         image = image.cpu().permute(0, 2, 3, 1).numpy()
-         if output_type == "pil":
-             image = self.numpy_to_pil(image)
-
-         if not return_dict:
-             return (image,), "This is a local test"
-
-         return ImagePipelineOutput(images=image), "This is a local test"

diffusers/tests/fixtures/custom_pipeline/what_ever.py DELETED
@@ -1,101 +0,0 @@
- # Copyright 2023 The HuggingFace Team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
-
- # limitations under the License.
-
-
- from typing import Optional, Tuple, Union
-
- import torch
-
- from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-
-
- class CustomLocalPipeline(DiffusionPipeline):
-     r"""
-     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-     Parameters:
-         unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
-         scheduler ([`SchedulerMixin`]):
-             A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
-             [`DDPMScheduler`], or [`DDIMScheduler`].
-     """
-
-     def __init__(self, unet, scheduler):
-         super().__init__()
-         self.register_modules(unet=unet, scheduler=scheduler)
-
-     @torch.no_grad()
-     def __call__(
-         self,
-         batch_size: int = 1,
-         generator: Optional[torch.Generator] = None,
-         num_inference_steps: int = 50,
-         output_type: Optional[str] = "pil",
-         return_dict: bool = True,
-         **kwargs,
-     ) -> Union[ImagePipelineOutput, Tuple]:
-         r"""
-         Args:
-             batch_size (`int`, *optional*, defaults to 1):
-                 The number of images to generate.
-             generator (`torch.Generator`, *optional*):
-                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                 deterministic.
-             eta (`float`, *optional*, defaults to 0.0):
-                 The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
-             num_inference_steps (`int`, *optional*, defaults to 50):
-                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                 expense of slower inference.
-             output_type (`str`, *optional*, defaults to `"pil"`):
-                 The output format of the generate image. Choose between
-                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-             return_dict (`bool`, *optional*, defaults to `True`):
-                 Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
-
-         Returns:
-             [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-             `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-             generated images.
-         """
-
-         # Sample gaussian noise to begin loop
-         image = torch.randn(
-             (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
-             generator=generator,
-         )
-         image = image.to(self.device)
-
-         # set step values
-         self.scheduler.set_timesteps(num_inference_steps)
-
-         for t in self.progress_bar(self.scheduler.timesteps):
-             # 1. predict noise model_output
-             model_output = self.unet(image, t).sample
-
-             # 2. predict previous mean of image x_t-1 and add variance depending on eta
-             # eta corresponds to η in paper and should be between [0, 1]
-             # do x_t -> x_t-1
-             image = self.scheduler.step(model_output, t, image).prev_sample
-
-         image = (image / 2 + 0.5).clamp(0, 1)
-         image = image.cpu().permute(0, 2, 3, 1).numpy()
-         if output_type == "pil":
-             image = self.numpy_to_pil(image)
-
-         if not return_dict:
-             return (image,), "This is a local test"
-
-         return ImagePipelineOutput(images=image), "This is a local test"

diffusers/tests/fixtures/elise_format0.mid DELETED
Binary file (14.2 kB)
 
diffusers/tests/models/__init__.py DELETED
File without changes
diffusers/tests/models/test_models_unet_1d.py DELETED
@@ -1,284 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import torch
19
-
20
- from diffusers import UNet1DModel
21
- from diffusers.utils import floats_tensor, slow, torch_device
22
-
23
- from ..test_modeling_common import ModelTesterMixin
24
-
25
-
26
- torch.backends.cuda.matmul.allow_tf32 = False
27
-
28
-
29
- class UNet1DModelTests(ModelTesterMixin, unittest.TestCase):
30
- model_class = UNet1DModel
31
-
32
- @property
33
- def dummy_input(self):
34
- batch_size = 4
35
- num_features = 14
36
- seq_len = 16
37
-
38
- noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
39
- time_step = torch.tensor([10] * batch_size).to(torch_device)
40
-
41
- return {"sample": noise, "timestep": time_step}
42
-
43
- @property
44
- def input_shape(self):
45
- return (4, 14, 16)
46
-
47
- @property
48
- def output_shape(self):
49
- return (4, 14, 16)
50
-
51
- def test_ema_training(self):
52
- pass
53
-
54
- def test_training(self):
55
- pass
56
-
57
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
58
- def test_determinism(self):
59
- super().test_determinism()
60
-
61
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
62
- def test_outputs_equivalence(self):
63
- super().test_outputs_equivalence()
64
-
65
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
66
- def test_from_save_pretrained(self):
67
- super().test_from_save_pretrained()
68
-
69
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
70
- def test_from_save_pretrained_variant(self):
71
- super().test_from_save_pretrained_variant()
72
-
73
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
74
- def test_model_from_pretrained(self):
75
- super().test_model_from_pretrained()
76
-
77
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
78
- def test_output(self):
79
- super().test_output()
80
-
81
- def prepare_init_args_and_inputs_for_common(self):
82
- init_dict = {
83
- "block_out_channels": (32, 64, 128, 256),
84
- "in_channels": 14,
85
- "out_channels": 14,
86
- "time_embedding_type": "positional",
87
- "use_timestep_embedding": True,
88
- "flip_sin_to_cos": False,
89
- "freq_shift": 1.0,
90
- "out_block_type": "OutConv1DBlock",
91
- "mid_block_type": "MidResTemporalBlock1D",
92
- "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
93
- "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"),
94
- "act_fn": "mish",
95
- }
96
- inputs_dict = self.dummy_input
97
- return init_dict, inputs_dict
98
-
99
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
100
- def test_from_pretrained_hub(self):
101
- model, loading_info = UNet1DModel.from_pretrained(
102
- "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="unet"
103
- )
104
- self.assertIsNotNone(model)
105
- self.assertEqual(len(loading_info["missing_keys"]), 0)
106
-
107
- model.to(torch_device)
108
- image = model(**self.dummy_input)
109
-
110
- assert image is not None, "Make sure output is not None"
111
-
112
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
113
- def test_output_pretrained(self):
114
- model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet")
115
- torch.manual_seed(0)
116
- if torch.cuda.is_available():
117
- torch.cuda.manual_seed_all(0)
118
-
119
- num_features = model.in_channels
120
- seq_len = 16
121
- noise = torch.randn((1, seq_len, num_features)).permute(
122
- 0, 2, 1
123
- ) # match original, we can update values and remove
124
- time_step = torch.full((num_features,), 0)
125
-
126
- with torch.no_grad():
127
- output = model(noise, time_step).sample.permute(0, 2, 1)
128
-
129
- output_slice = output[0, -3:, -3:].flatten()
130
- # fmt: off
131
- expected_output_slice = torch.tensor([-2.137172, 1.1426016, 0.3688687, -0.766922, 0.7303146, 0.11038864, -0.4760633, 0.13270172, 0.02591348])
132
- # fmt: on
133
- self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))
134
-
135
- def test_forward_with_norm_groups(self):
136
- # Not implemented yet for this UNet
137
- pass
138
-
139
- @slow
140
- def test_unet_1d_maestro(self):
141
- model_id = "harmonai/maestro-150k"
142
- model = UNet1DModel.from_pretrained(model_id, subfolder="unet")
143
- model.to(torch_device)
144
-
145
- sample_size = 65536
146
- noise = torch.sin(torch.arange(sample_size)[None, None, :].repeat(1, 2, 1)).to(torch_device)
147
- timestep = torch.tensor([1]).to(torch_device)
148
-
149
- with torch.no_grad():
150
- output = model(noise, timestep).sample
151
-
152
- output_sum = output.abs().sum()
153
- output_max = output.abs().max()
154
-
155
- assert (output_sum - 224.0896).abs() < 4e-2
156
- assert (output_max - 0.0607).abs() < 4e-4
157
-
158
-
159
- class UNetRLModelTests(ModelTesterMixin, unittest.TestCase):
160
- model_class = UNet1DModel
161
-
162
- @property
163
- def dummy_input(self):
164
- batch_size = 4
165
- num_features = 14
166
- seq_len = 16
167
-
168
- noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
169
- time_step = torch.tensor([10] * batch_size).to(torch_device)
170
-
171
- return {"sample": noise, "timestep": time_step}
172
-
173
- @property
174
- def input_shape(self):
175
- return (4, 14, 16)
176
-
177
- @property
178
- def output_shape(self):
179
- return (4, 14, 1)
180
-
181
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
182
- def test_determinism(self):
183
- super().test_determinism()
184
-
185
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
186
- def test_outputs_equivalence(self):
187
- super().test_outputs_equivalence()
188
-
189
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
190
- def test_from_save_pretrained(self):
191
- super().test_from_save_pretrained()
192
-
193
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
194
- def test_from_save_pretrained_variant(self):
195
- super().test_from_save_pretrained_variant()
196
-
197
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
198
- def test_model_from_pretrained(self):
199
- super().test_model_from_pretrained()
200
-
201
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
202
- def test_output(self):
203
- # UNetRL is a value-function is different output shape
204
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
205
- model = self.model_class(**init_dict)
206
- model.to(torch_device)
207
- model.eval()
208
-
209
- with torch.no_grad():
210
- output = model(**inputs_dict)
211
-
212
- if isinstance(output, dict):
213
- output = output.sample
214
-
215
- self.assertIsNotNone(output)
216
- expected_shape = torch.Size((inputs_dict["sample"].shape[0], 1))
217
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
218
-
219
- def test_ema_training(self):
220
- pass
221
-
222
- def test_training(self):
223
- pass
224
-
225
- def prepare_init_args_and_inputs_for_common(self):
226
- init_dict = {
227
- "in_channels": 14,
228
- "out_channels": 14,
229
- "down_block_types": ["DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"],
230
- "up_block_types": [],
231
- "out_block_type": "ValueFunction",
232
- "mid_block_type": "ValueFunctionMidBlock1D",
233
- "block_out_channels": [32, 64, 128, 256],
234
- "layers_per_block": 1,
235
- "downsample_each_block": True,
236
- "use_timestep_embedding": True,
237
- "freq_shift": 1.0,
238
- "flip_sin_to_cos": False,
239
- "time_embedding_type": "positional",
240
- "act_fn": "mish",
241
- }
242
- inputs_dict = self.dummy_input
243
- return init_dict, inputs_dict
244
-
245
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
246
- def test_from_pretrained_hub(self):
247
- value_function, vf_loading_info = UNet1DModel.from_pretrained(
248
- "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function"
249
- )
250
- self.assertIsNotNone(value_function)
251
- self.assertEqual(len(vf_loading_info["missing_keys"]), 0)
252
-
253
- value_function.to(torch_device)
254
- image = value_function(**self.dummy_input)
255
-
256
- assert image is not None, "Make sure output is not None"
257
-
258
- @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
259
- def test_output_pretrained(self):
260
- value_function, vf_loading_info = UNet1DModel.from_pretrained(
261
- "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function"
262
- )
263
- torch.manual_seed(0)
264
- if torch.cuda.is_available():
265
- torch.cuda.manual_seed_all(0)
266
-
267
- num_features = value_function.in_channels
268
- seq_len = 14
269
- noise = torch.randn((1, seq_len, num_features)).permute(
270
- 0, 2, 1
271
- ) # match original, we can update values and remove
272
- time_step = torch.full((num_features,), 0)
273
-
274
- with torch.no_grad():
275
- output = value_function(noise, time_step).sample
276
-
277
- # fmt: off
278
- expected_output_slice = torch.tensor([165.25] * seq_len)
279
- # fmt: on
280
- self.assertTrue(torch.allclose(output, expected_output_slice, rtol=1e-3))
281
-
282
- def test_forward_with_norm_groups(self):
283
- # Not implemented yet for this UNet
284
- pass

diffusers/tests/models/test_models_unet_2d.py DELETED
@@ -1,297 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import math
18
- import unittest
19
-
20
- import torch
21
-
22
- from diffusers import UNet2DModel
23
- from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device
24
-
25
- from ..test_modeling_common import ModelTesterMixin
26
-
27
-
28
- logger = logging.get_logger(__name__)
29
- torch.backends.cuda.matmul.allow_tf32 = False
30
-
31
-
32
- class Unet2DModelTests(ModelTesterMixin, unittest.TestCase):
33
- model_class = UNet2DModel
34
-
35
- @property
36
- def dummy_input(self):
37
- batch_size = 4
38
- num_channels = 3
39
- sizes = (32, 32)
40
-
41
- noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
42
- time_step = torch.tensor([10]).to(torch_device)
43
-
44
- return {"sample": noise, "timestep": time_step}
45
-
46
- @property
47
- def input_shape(self):
48
- return (3, 32, 32)
49
-
50
- @property
51
- def output_shape(self):
52
- return (3, 32, 32)
53
-
54
- def prepare_init_args_and_inputs_for_common(self):
55
- init_dict = {
56
- "block_out_channels": (32, 64),
57
- "down_block_types": ("DownBlock2D", "AttnDownBlock2D"),
58
- "up_block_types": ("AttnUpBlock2D", "UpBlock2D"),
59
- "attention_head_dim": None,
60
- "out_channels": 3,
61
- "in_channels": 3,
62
- "layers_per_block": 2,
63
- "sample_size": 32,
64
- }
65
- inputs_dict = self.dummy_input
66
- return init_dict, inputs_dict
67
-
68
-
69
- class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase):
70
- model_class = UNet2DModel
71
-
72
- @property
73
- def dummy_input(self):
74
- batch_size = 4
75
- num_channels = 4
76
- sizes = (32, 32)
77
-
78
- noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
79
- time_step = torch.tensor([10]).to(torch_device)
80
-
81
- return {"sample": noise, "timestep": time_step}
82
-
83
- @property
84
- def input_shape(self):
85
- return (4, 32, 32)
86
-
87
- @property
88
- def output_shape(self):
89
- return (4, 32, 32)
90
-
91
- def prepare_init_args_and_inputs_for_common(self):
92
- init_dict = {
93
- "sample_size": 32,
94
- "in_channels": 4,
95
- "out_channels": 4,
96
- "layers_per_block": 2,
97
- "block_out_channels": (32, 64),
98
- "attention_head_dim": 32,
99
- "down_block_types": ("DownBlock2D", "DownBlock2D"),
100
- "up_block_types": ("UpBlock2D", "UpBlock2D"),
101
- }
102
- inputs_dict = self.dummy_input
103
- return init_dict, inputs_dict
104
-
105
- def test_from_pretrained_hub(self):
106
- model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
107
-
108
- self.assertIsNotNone(model)
109
- self.assertEqual(len(loading_info["missing_keys"]), 0)
110
-
111
- model.to(torch_device)
112
- image = model(**self.dummy_input).sample
113
-
114
- assert image is not None, "Make sure output is not None"
115
-
116
- @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
117
- def test_from_pretrained_accelerate(self):
118
- model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
119
- model.to(torch_device)
120
- image = model(**self.dummy_input).sample
121
-
122
- assert image is not None, "Make sure output is not None"
123
-
124
- @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
125
- def test_from_pretrained_accelerate_wont_change_results(self):
126
- # by defautl model loading will use accelerate as `low_cpu_mem_usage=True`
127
- model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
128
- model_accelerate.to(torch_device)
129
- model_accelerate.eval()
130
-
131
- noise = torch.randn(
132
- 1,
133
- model_accelerate.config.in_channels,
134
- model_accelerate.config.sample_size,
135
- model_accelerate.config.sample_size,
136
- generator=torch.manual_seed(0),
137
- )
138
- noise = noise.to(torch_device)
139
- time_step = torch.tensor([10] * noise.shape[0]).to(torch_device)
140
-
141
- arr_accelerate = model_accelerate(noise, time_step)["sample"]
142
-
143
- # two models don't need to stay in the device at the same time
144
- del model_accelerate
145
- torch.cuda.empty_cache()
146
- gc.collect()
147
-
148
- model_normal_load, _ = UNet2DModel.from_pretrained(
149
- "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False
150
- )
151
- model_normal_load.to(torch_device)
152
- model_normal_load.eval()
153
- arr_normal_load = model_normal_load(noise, time_step)["sample"]
154
-
155
- assert torch_all_close(arr_accelerate, arr_normal_load, rtol=1e-3)
156
-
157
- def test_output_pretrained(self):
158
- model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update")
159
- model.eval()
160
- model.to(torch_device)
161
-
162
- noise = torch.randn(
163
- 1,
164
- model.config.in_channels,
165
- model.config.sample_size,
166
- model.config.sample_size,
167
- generator=torch.manual_seed(0),
168
- )
169
- noise = noise.to(torch_device)
170
- time_step = torch.tensor([10] * noise.shape[0]).to(torch_device)
171
-
172
- with torch.no_grad():
173
- output = model(noise, time_step).sample
174
-
175
- output_slice = output[0, -1, -3:, -3:].flatten().cpu()
176
- # fmt: off
177
- expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800])
178
- # fmt: on
179
-
180
- self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-3))
181
-
182
-
183
- class NCSNppModelTests(ModelTesterMixin, unittest.TestCase):
184
- model_class = UNet2DModel
185
-
186
- @property
187
- def dummy_input(self, sizes=(32, 32)):
188
- batch_size = 4
189
- num_channels = 3
190
-
191
- noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
192
- time_step = torch.tensor(batch_size * [10]).to(dtype=torch.int32, device=torch_device)
193
-
194
- return {"sample": noise, "timestep": time_step}
195
-
196
- @property
197
- def input_shape(self):
198
- return (3, 32, 32)
199
-
200
- @property
201
- def output_shape(self):
202
- return (3, 32, 32)
203
-
204
- def prepare_init_args_and_inputs_for_common(self):
205
- init_dict = {
206
- "block_out_channels": [32, 64, 64, 64],
207
- "in_channels": 3,
208
- "layers_per_block": 1,
209
- "out_channels": 3,
210
- "time_embedding_type": "fourier",
211
- "norm_eps": 1e-6,
212
- "mid_block_scale_factor": math.sqrt(2.0),
213
- "norm_num_groups": None,
214
- "down_block_types": [
215
- "SkipDownBlock2D",
216
- "AttnSkipDownBlock2D",
217
- "SkipDownBlock2D",
218
- "SkipDownBlock2D",
219
- ],
220
- "up_block_types": [
221
- "SkipUpBlock2D",
222
- "SkipUpBlock2D",
223
- "AttnSkipUpBlock2D",
224
- "SkipUpBlock2D",
225
- ],
226
- }
227
- inputs_dict = self.dummy_input
228
- return init_dict, inputs_dict
229
-
230
- @slow
231
- def test_from_pretrained_hub(self):
232
- model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True)
233
- self.assertIsNotNone(model)
234
- self.assertEqual(len(loading_info["missing_keys"]), 0)
235
-
236
- model.to(torch_device)
237
- inputs = self.dummy_input
238
- noise = floats_tensor((4, 3) + (256, 256)).to(torch_device)
239
- inputs["sample"] = noise
240
- image = model(**inputs)
241
-
242
- assert image is not None, "Make sure output is not None"
243
-
244
- @slow
245
- def test_output_pretrained_ve_mid(self):
246
- model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256")
247
- model.to(torch_device)
248
-
249
- torch.manual_seed(0)
250
- if torch.cuda.is_available():
251
- torch.cuda.manual_seed_all(0)
252
-
253
- batch_size = 4
254
- num_channels = 3
255
- sizes = (256, 256)
256
-
257
- noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device)
258
- time_step = torch.tensor(batch_size * [1e-4]).to(torch_device)
259
-
260
- with torch.no_grad():
261
- output = model(noise, time_step).sample
262
-
263
- output_slice = output[0, -3:, -3:, -1].flatten().cpu()
264
- # fmt: off
265
- expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253, -10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114])
266
- # fmt: on
267
-
268
- self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
269
-
270
- def test_output_pretrained_ve_large(self):
271
- model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update")
272
- model.to(torch_device)
273
-
274
- torch.manual_seed(0)
275
- if torch.cuda.is_available():
276
- torch.cuda.manual_seed_all(0)
277
-
278
- batch_size = 4
279
- num_channels = 3
280
- sizes = (32, 32)
281
-
282
- noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device)
283
- time_step = torch.tensor(batch_size * [1e-4]).to(torch_device)
284
-
285
- with torch.no_grad():
286
- output = model(noise, time_step).sample
287
-
288
- output_slice = output[0, -3:, -3:, -1].flatten().cpu()
289
- # fmt: off
290
- expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256])
291
- # fmt: on
292
-
293
- self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
294
-
295
- def test_forward_with_norm_groups(self):
296
- # not required for this model
297
- pass

diffusers/tests/models/test_models_unet_2d_condition.py DELETED
@@ -1,944 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import os
18
- import tempfile
19
- import unittest
20
-
21
- import torch
22
- from parameterized import parameterized
23
-
24
- from diffusers import UNet2DConditionModel
25
- from diffusers.models.attention_processor import LoRAAttnProcessor
26
- from diffusers.utils import (
27
- floats_tensor,
28
- load_hf_numpy,
29
- logging,
30
- require_torch_gpu,
31
- slow,
32
- torch_all_close,
33
- torch_device,
34
- )
35
- from diffusers.utils.import_utils import is_xformers_available
36
-
37
- from ..test_modeling_common import ModelTesterMixin
38
-
39
-
40
- logger = logging.get_logger(__name__)
41
- torch.backends.cuda.matmul.allow_tf32 = False
42
-
43
-
44
- def create_lora_layers(model):
45
- lora_attn_procs = {}
46
- for name in model.attn_processors.keys():
47
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
48
- if name.startswith("mid_block"):
49
- hidden_size = model.config.block_out_channels[-1]
50
- elif name.startswith("up_blocks"):
51
- block_id = int(name[len("up_blocks.")])
52
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
53
- elif name.startswith("down_blocks"):
54
- block_id = int(name[len("down_blocks.")])
55
- hidden_size = model.config.block_out_channels[block_id]
56
-
57
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
58
- lora_attn_procs[name] = lora_attn_procs[name].to(model.device)
59
-
60
- # add 1 to weights to mock trained weights
61
- with torch.no_grad():
62
- lora_attn_procs[name].to_q_lora.up.weight += 1
63
- lora_attn_procs[name].to_k_lora.up.weight += 1
64
- lora_attn_procs[name].to_v_lora.up.weight += 1
65
- lora_attn_procs[name].to_out_lora.up.weight += 1
66
-
67
- return lora_attn_procs
68
-
69
-
70
- class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase):
71
- model_class = UNet2DConditionModel
72
-
73
- @property
74
- def dummy_input(self):
75
- batch_size = 4
76
- num_channels = 4
77
- sizes = (32, 32)
78
-
79
- noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
80
- time_step = torch.tensor([10]).to(torch_device)
81
- encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device)
82
-
83
- return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
84
-
85
- @property
86
- def input_shape(self):
87
- return (4, 32, 32)
88
-
89
- @property
90
- def output_shape(self):
91
- return (4, 32, 32)
92
-
93
- def prepare_init_args_and_inputs_for_common(self):
94
- init_dict = {
95
- "block_out_channels": (32, 64),
96
- "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
97
- "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
98
- "cross_attention_dim": 32,
99
- "attention_head_dim": 8,
100
- "out_channels": 4,
101
- "in_channels": 4,
102
- "layers_per_block": 2,
103
- "sample_size": 32,
104
- }
105
- inputs_dict = self.dummy_input
106
- return init_dict, inputs_dict
107
-
108
- @unittest.skipIf(
109
- torch_device != "cuda" or not is_xformers_available(),
110
- reason="XFormers attention is only available with CUDA and `xformers` installed",
111
- )
112
- def test_xformers_enable_works(self):
113
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
114
- model = self.model_class(**init_dict)
115
-
116
- model.enable_xformers_memory_efficient_attention()
117
-
118
- assert (
119
- model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
120
- == "XFormersAttnProcessor"
121
- ), "xformers is not enabled"
122
-
123
- @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS")
124
- def test_gradient_checkpointing(self):
125
- # enable deterministic behavior for gradient checkpointing
126
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
127
- model = self.model_class(**init_dict)
128
- model.to(torch_device)
129
-
130
- assert not model.is_gradient_checkpointing and model.training
131
-
132
- out = model(**inputs_dict).sample
133
- # run the backwards pass on the model. For backwards pass, for simplicity purpose,
134
- # we won't calculate the loss and rather backprop on out.sum()
135
- model.zero_grad()
136
-
137
- labels = torch.randn_like(out)
138
- loss = (out - labels).mean()
139
- loss.backward()
140
-
141
- # re-instantiate the model now enabling gradient checkpointing
142
- model_2 = self.model_class(**init_dict)
143
- # clone model
144
- model_2.load_state_dict(model.state_dict())
145
- model_2.to(torch_device)
146
- model_2.enable_gradient_checkpointing()
147
-
148
- assert model_2.is_gradient_checkpointing and model_2.training
149
-
150
- out_2 = model_2(**inputs_dict).sample
151
- # run the backwards pass on the model. For backwards pass, for simplicity purpose,
152
- # we won't calculate the loss and rather backprop on out.sum()
153
- model_2.zero_grad()
154
- loss_2 = (out_2 - labels).mean()
155
- loss_2.backward()
156
-
157
- # compare the output and parameters gradients
158
- self.assertTrue((loss - loss_2).abs() < 1e-5)
159
- named_params = dict(model.named_parameters())
160
- named_params_2 = dict(model_2.named_parameters())
161
- for name, param in named_params.items():
162
- self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
163
-
164
- def test_model_with_attention_head_dim_tuple(self):
165
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
166
-
167
- init_dict["attention_head_dim"] = (8, 16)
168
-
169
- model = self.model_class(**init_dict)
170
- model.to(torch_device)
171
- model.eval()
172
-
173
- with torch.no_grad():
174
- output = model(**inputs_dict)
175
-
176
- if isinstance(output, dict):
177
- output = output.sample
178
-
179
- self.assertIsNotNone(output)
180
- expected_shape = inputs_dict["sample"].shape
181
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
182
-
183
- def test_model_with_use_linear_projection(self):
184
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
185
-
186
- init_dict["use_linear_projection"] = True
187
-
188
- model = self.model_class(**init_dict)
189
- model.to(torch_device)
190
- model.eval()
191
-
192
- with torch.no_grad():
193
- output = model(**inputs_dict)
194
-
195
- if isinstance(output, dict):
196
- output = output.sample
197
-
198
- self.assertIsNotNone(output)
199
- expected_shape = inputs_dict["sample"].shape
200
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
201
-
202
- def test_model_with_cross_attention_dim_tuple(self):
203
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
204
-
205
- init_dict["cross_attention_dim"] = (32, 32)
206
-
207
- model = self.model_class(**init_dict)
208
- model.to(torch_device)
209
- model.eval()
210
-
211
- with torch.no_grad():
212
- output = model(**inputs_dict)
213
-
214
- if isinstance(output, dict):
215
- output = output.sample
216
-
217
- self.assertIsNotNone(output)
218
- expected_shape = inputs_dict["sample"].shape
219
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
220
-
221
- def test_model_with_simple_projection(self):
222
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
223
-
224
- batch_size, _, _, sample_size = inputs_dict["sample"].shape
225
-
226
- init_dict["class_embed_type"] = "simple_projection"
227
- init_dict["projection_class_embeddings_input_dim"] = sample_size
228
-
229
- inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device)
230
-
231
- model = self.model_class(**init_dict)
232
- model.to(torch_device)
233
- model.eval()
234
-
235
- with torch.no_grad():
236
- output = model(**inputs_dict)
237
-
238
- if isinstance(output, dict):
239
- output = output.sample
240
-
241
- self.assertIsNotNone(output)
242
- expected_shape = inputs_dict["sample"].shape
243
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
244
-
245
- def test_model_with_class_embeddings_concat(self):
246
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
247
-
248
- batch_size, _, _, sample_size = inputs_dict["sample"].shape
249
-
250
- init_dict["class_embed_type"] = "simple_projection"
251
- init_dict["projection_class_embeddings_input_dim"] = sample_size
252
- init_dict["class_embeddings_concat"] = True
253
-
254
- inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device)
255
-
256
- model = self.model_class(**init_dict)
257
- model.to(torch_device)
258
- model.eval()
259
-
260
- with torch.no_grad():
261
- output = model(**inputs_dict)
262
-
263
- if isinstance(output, dict):
264
- output = output.sample
265
-
266
- self.assertIsNotNone(output)
267
- expected_shape = inputs_dict["sample"].shape
268
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
269
-
270
- def test_model_attention_slicing(self):
271
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
272
-
273
- init_dict["attention_head_dim"] = (8, 16)
274
-
275
- model = self.model_class(**init_dict)
276
- model.to(torch_device)
277
- model.eval()
278
-
279
- model.set_attention_slice("auto")
280
- with torch.no_grad():
281
- output = model(**inputs_dict)
282
- assert output is not None
283
-
284
- model.set_attention_slice("max")
285
- with torch.no_grad():
286
- output = model(**inputs_dict)
287
- assert output is not None
288
-
289
- model.set_attention_slice(2)
290
- with torch.no_grad():
291
- output = model(**inputs_dict)
292
- assert output is not None
293
-
294
- def test_model_sliceable_head_dim(self):
295
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
296
-
297
- init_dict["attention_head_dim"] = (8, 16)
298
-
299
- model = self.model_class(**init_dict)
300
-
301
- def check_sliceable_dim_attr(module: torch.nn.Module):
302
- if hasattr(module, "set_attention_slice"):
303
- assert isinstance(module.sliceable_head_dim, int)
304
-
305
- for child in module.children():
306
- check_sliceable_dim_attr(child)
307
-
308
- # retrieve number of attention layers
309
- for module in model.children():
310
- check_sliceable_dim_attr(module)
311
-
312
- def test_special_attn_proc(self):
313
- class AttnEasyProc(torch.nn.Module):
314
- def __init__(self, num):
315
- super().__init__()
316
- self.weight = torch.nn.Parameter(torch.tensor(num))
317
- self.is_run = False
318
- self.number = 0
319
- self.counter = 0
320
-
321
- def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, number=None):
322
- batch_size, sequence_length, _ = hidden_states.shape
323
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
324
-
325
- query = attn.to_q(hidden_states)
326
-
327
- encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
328
- key = attn.to_k(encoder_hidden_states)
329
- value = attn.to_v(encoder_hidden_states)
330
-
331
- query = attn.head_to_batch_dim(query)
332
- key = attn.head_to_batch_dim(key)
333
- value = attn.head_to_batch_dim(value)
334
-
335
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
336
- hidden_states = torch.bmm(attention_probs, value)
337
- hidden_states = attn.batch_to_head_dim(hidden_states)
338
-
339
- # linear proj
340
- hidden_states = attn.to_out[0](hidden_states)
341
- # dropout
342
- hidden_states = attn.to_out[1](hidden_states)
343
-
344
- hidden_states += self.weight
345
-
346
- self.is_run = True
347
- self.counter += 1
348
- self.number = number
349
-
350
- return hidden_states
351
-
352
- # enable deterministic behavior for gradient checkpointing
353
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
354
-
355
- init_dict["attention_head_dim"] = (8, 16)
356
-
357
- model = self.model_class(**init_dict)
358
- model.to(torch_device)
359
-
360
- processor = AttnEasyProc(5.0)
361
-
362
- model.set_attn_processor(processor)
363
- model(**inputs_dict, cross_attention_kwargs={"number": 123}).sample
364
-
365
- assert processor.counter == 12
366
- assert processor.is_run
367
- assert processor.number == 123
368
-
369
- def test_lora_processors(self):
370
- # enable deterministic behavior for gradient checkpointing
371
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
372
-
373
- init_dict["attention_head_dim"] = (8, 16)
374
-
375
- model = self.model_class(**init_dict)
376
- model.to(torch_device)
377
-
378
- with torch.no_grad():
379
- sample1 = model(**inputs_dict).sample
380
-
381
- lora_attn_procs = {}
382
- for name in model.attn_processors.keys():
383
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
384
- if name.startswith("mid_block"):
385
- hidden_size = model.config.block_out_channels[-1]
386
- elif name.startswith("up_blocks"):
387
- block_id = int(name[len("up_blocks.")])
388
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
389
- elif name.startswith("down_blocks"):
390
- block_id = int(name[len("down_blocks.")])
391
- hidden_size = model.config.block_out_channels[block_id]
392
-
393
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
394
-
395
- # add 1 to weights to mock trained weights
396
- with torch.no_grad():
397
- lora_attn_procs[name].to_q_lora.up.weight += 1
398
- lora_attn_procs[name].to_k_lora.up.weight += 1
399
- lora_attn_procs[name].to_v_lora.up.weight += 1
400
- lora_attn_procs[name].to_out_lora.up.weight += 1
401
-
402
- # make sure we can set a list of attention processors
403
- model.set_attn_processor(lora_attn_procs)
404
- model.to(torch_device)
405
-
406
- # test that attn processors can be set to itself
407
- model.set_attn_processor(model.attn_processors)
408
-
409
- with torch.no_grad():
410
- sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
411
- sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
412
- sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
413
-
414
- assert (sample1 - sample2).abs().max() < 1e-4
415
- assert (sample3 - sample4).abs().max() < 1e-4
416
-
417
- # sample 2 and sample 3 should be different
418
- assert (sample2 - sample3).abs().max() > 1e-4
419
-
420
- def test_lora_save_load(self):
421
- # enable deterministic behavior for gradient checkpointing
422
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
423
-
424
- init_dict["attention_head_dim"] = (8, 16)
425
-
426
- torch.manual_seed(0)
427
- model = self.model_class(**init_dict)
428
- model.to(torch_device)
429
-
430
- with torch.no_grad():
431
- old_sample = model(**inputs_dict).sample
432
-
433
- lora_attn_procs = create_lora_layers(model)
434
- model.set_attn_processor(lora_attn_procs)
435
-
436
- with torch.no_grad():
437
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
438
-
439
- with tempfile.TemporaryDirectory() as tmpdirname:
440
- model.save_attn_procs(tmpdirname)
441
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
442
- torch.manual_seed(0)
443
- new_model = self.model_class(**init_dict)
444
- new_model.to(torch_device)
445
- new_model.load_attn_procs(tmpdirname)
446
-
447
- with torch.no_grad():
448
- new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
449
-
450
- assert (sample - new_sample).abs().max() < 1e-4
451
-
452
- # LoRA and no LoRA should NOT be the same
453
- assert (sample - old_sample).abs().max() > 1e-4
454
-
455
- def test_lora_save_load_safetensors(self):
456
- # enable deterministic behavior for gradient checkpointing
457
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
458
-
459
- init_dict["attention_head_dim"] = (8, 16)
460
-
461
- torch.manual_seed(0)
462
- model = self.model_class(**init_dict)
463
- model.to(torch_device)
464
-
465
- with torch.no_grad():
466
- old_sample = model(**inputs_dict).sample
467
-
468
- lora_attn_procs = {}
469
- for name in model.attn_processors.keys():
470
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
471
- if name.startswith("mid_block"):
472
- hidden_size = model.config.block_out_channels[-1]
473
- elif name.startswith("up_blocks"):
474
- block_id = int(name[len("up_blocks.")])
475
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
476
- elif name.startswith("down_blocks"):
477
- block_id = int(name[len("down_blocks.")])
478
- hidden_size = model.config.block_out_channels[block_id]
479
-
480
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
481
- lora_attn_procs[name] = lora_attn_procs[name].to(model.device)
482
-
483
- # add 1 to weights to mock trained weights
484
- with torch.no_grad():
485
- lora_attn_procs[name].to_q_lora.up.weight += 1
486
- lora_attn_procs[name].to_k_lora.up.weight += 1
487
- lora_attn_procs[name].to_v_lora.up.weight += 1
488
- lora_attn_procs[name].to_out_lora.up.weight += 1
489
-
490
- model.set_attn_processor(lora_attn_procs)
491
-
492
- with torch.no_grad():
493
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
494
-
495
- with tempfile.TemporaryDirectory() as tmpdirname:
496
- model.save_attn_procs(tmpdirname, safe_serialization=True)
497
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
498
- torch.manual_seed(0)
499
- new_model = self.model_class(**init_dict)
500
- new_model.to(torch_device)
501
- new_model.load_attn_procs(tmpdirname)
502
-
503
- with torch.no_grad():
504
- new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
505
-
506
- assert (sample - new_sample).abs().max() < 1e-4
507
-
508
- # LoRA and no LoRA should NOT be the same
509
- assert (sample - old_sample).abs().max() > 1e-4
510
-
511
- def test_lora_save_safetensors_load_torch(self):
512
- # enable deterministic behavior for gradient checkpointing
513
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
514
-
515
- init_dict["attention_head_dim"] = (8, 16)
516
-
517
- torch.manual_seed(0)
518
- model = self.model_class(**init_dict)
519
- model.to(torch_device)
520
-
521
- lora_attn_procs = {}
522
- for name in model.attn_processors.keys():
523
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
524
- if name.startswith("mid_block"):
525
- hidden_size = model.config.block_out_channels[-1]
526
- elif name.startswith("up_blocks"):
527
- block_id = int(name[len("up_blocks.")])
528
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
529
- elif name.startswith("down_blocks"):
530
- block_id = int(name[len("down_blocks.")])
531
- hidden_size = model.config.block_out_channels[block_id]
532
-
533
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
534
- lora_attn_procs[name] = lora_attn_procs[name].to(model.device)
535
-
536
- model.set_attn_processor(lora_attn_procs)
537
- # Saving as torch, properly reloads with directly filename
538
- with tempfile.TemporaryDirectory() as tmpdirname:
539
- model.save_attn_procs(tmpdirname)
540
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
541
- torch.manual_seed(0)
542
- new_model = self.model_class(**init_dict)
543
- new_model.to(torch_device)
544
- new_model.load_attn_procs(tmpdirname, weight_name="pytorch_lora_weights.bin")
545
-
546
- def test_lora_save_torch_force_load_safetensors_error(self):
547
- # enable deterministic behavior for gradient checkpointing
548
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
549
-
550
- init_dict["attention_head_dim"] = (8, 16)
551
-
552
- torch.manual_seed(0)
553
- model = self.model_class(**init_dict)
554
- model.to(torch_device)
555
-
556
- lora_attn_procs = {}
557
- for name in model.attn_processors.keys():
558
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
559
- if name.startswith("mid_block"):
560
- hidden_size = model.config.block_out_channels[-1]
561
- elif name.startswith("up_blocks"):
562
- block_id = int(name[len("up_blocks.")])
563
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
564
- elif name.startswith("down_blocks"):
565
- block_id = int(name[len("down_blocks.")])
566
- hidden_size = model.config.block_out_channels[block_id]
567
-
568
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
569
- lora_attn_procs[name] = lora_attn_procs[name].to(model.device)
570
-
571
- model.set_attn_processor(lora_attn_procs)
572
- # Saving as torch, properly reloads directly with the filename
573
- with tempfile.TemporaryDirectory() as tmpdirname:
574
- model.save_attn_procs(tmpdirname)
575
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
576
- torch.manual_seed(0)
577
- new_model = self.model_class(**init_dict)
578
- new_model.to(torch_device)
579
- with self.assertRaises(IOError) as e:
580
- new_model.load_attn_procs(tmpdirname, use_safetensors=True)
581
- self.assertIn("Error no file named pytorch_lora_weights.safetensors", str(e.exception))
582
-
583
- def test_lora_on_off(self):
584
- # enable deterministic behavior for gradient checkpointing
585
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
586
-
587
- init_dict["attention_head_dim"] = (8, 16)
588
-
589
- torch.manual_seed(0)
590
- model = self.model_class(**init_dict)
591
- model.to(torch_device)
592
-
593
- with torch.no_grad():
594
- old_sample = model(**inputs_dict).sample
595
-
596
- lora_attn_procs = create_lora_layers(model)
597
- model.set_attn_processor(lora_attn_procs)
598
-
599
- with torch.no_grad():
600
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
601
-
602
- model.set_default_attn_processor()
603
-
604
- with torch.no_grad():
605
- new_sample = model(**inputs_dict).sample
606
-
607
- assert (sample - new_sample).abs().max() < 1e-4
608
- assert (sample - old_sample).abs().max() < 1e-4
609
-
610
- @unittest.skipIf(
611
- torch_device != "cuda" or not is_xformers_available(),
612
- reason="XFormers attention is only available with CUDA and `xformers` installed",
613
- )
614
- def test_lora_xformers_on_off(self):
615
- # enable deterministic behavior for gradient checkpointing
616
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
617
-
618
- init_dict["attention_head_dim"] = (8, 16)
619
-
620
- torch.manual_seed(0)
621
- model = self.model_class(**init_dict)
622
- model.to(torch_device)
623
- lora_attn_procs = create_lora_layers(model)
624
- model.set_attn_processor(lora_attn_procs)
625
-
626
- # default
627
- with torch.no_grad():
628
- sample = model(**inputs_dict).sample
629
-
630
- model.enable_xformers_memory_efficient_attention()
631
- on_sample = model(**inputs_dict).sample
632
-
633
- model.disable_xformers_memory_efficient_attention()
634
- off_sample = model(**inputs_dict).sample
635
-
636
- assert (sample - on_sample).abs().max() < 1e-4
637
- assert (sample - off_sample).abs().max() < 1e-4
638
-
639
-
640
- @slow
641
- class UNet2DConditionModelIntegrationTests(unittest.TestCase):
642
- def get_file_format(self, seed, shape):
643
- return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
644
-
645
- def tearDown(self):
646
- # clean up the VRAM after each test
647
- super().tearDown()
648
- gc.collect()
649
- torch.cuda.empty_cache()
650
-
651
- def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
652
- dtype = torch.float16 if fp16 else torch.float32
653
- image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
654
- return image
655
-
656
- def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
657
- revision = "fp16" if fp16 else None
658
- torch_dtype = torch.float16 if fp16 else torch.float32
659
-
660
- model = UNet2DConditionModel.from_pretrained(
661
- model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision
662
- )
663
- model.to(torch_device).eval()
664
-
665
- return model
666
-
667
- def test_set_attention_slice_auto(self):
668
- torch.cuda.empty_cache()
669
- torch.cuda.reset_max_memory_allocated()
670
- torch.cuda.reset_peak_memory_stats()
671
-
672
- unet = self.get_unet_model()
673
- unet.set_attention_slice("auto")
674
-
675
- latents = self.get_latents(33)
676
- encoder_hidden_states = self.get_encoder_hidden_states(33)
677
- timestep = 1
678
-
679
- with torch.no_grad():
680
- _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
681
-
682
- mem_bytes = torch.cuda.max_memory_allocated()
683
-
684
- assert mem_bytes < 5 * 10**9
685
-
686
- def test_set_attention_slice_max(self):
687
- torch.cuda.empty_cache()
688
- torch.cuda.reset_max_memory_allocated()
689
- torch.cuda.reset_peak_memory_stats()
690
-
691
- unet = self.get_unet_model()
692
- unet.set_attention_slice("max")
693
-
694
- latents = self.get_latents(33)
695
- encoder_hidden_states = self.get_encoder_hidden_states(33)
696
- timestep = 1
697
-
698
- with torch.no_grad():
699
- _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
700
-
701
- mem_bytes = torch.cuda.max_memory_allocated()
702
-
703
- assert mem_bytes < 5 * 10**9
704
-
705
- def test_set_attention_slice_int(self):
706
- torch.cuda.empty_cache()
707
- torch.cuda.reset_max_memory_allocated()
708
- torch.cuda.reset_peak_memory_stats()
709
-
710
- unet = self.get_unet_model()
711
- unet.set_attention_slice(2)
712
-
713
- latents = self.get_latents(33)
714
- encoder_hidden_states = self.get_encoder_hidden_states(33)
715
- timestep = 1
716
-
717
- with torch.no_grad():
718
- _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
719
-
720
- mem_bytes = torch.cuda.max_memory_allocated()
721
-
722
- assert mem_bytes < 5 * 10**9
723
-
724
- def test_set_attention_slice_list(self):
725
- torch.cuda.empty_cache()
726
- torch.cuda.reset_max_memory_allocated()
727
- torch.cuda.reset_peak_memory_stats()
728
-
729
- # there are 32 sliceable layers
730
- slice_list = 16 * [2, 3]
731
- unet = self.get_unet_model()
732
- unet.set_attention_slice(slice_list)
733
-
734
- latents = self.get_latents(33)
735
- encoder_hidden_states = self.get_encoder_hidden_states(33)
736
- timestep = 1
737
-
738
- with torch.no_grad():
739
- _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
740
-
741
- mem_bytes = torch.cuda.max_memory_allocated()
742
-
743
- assert mem_bytes < 5 * 10**9
744
-
745
- def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
746
- dtype = torch.float16 if fp16 else torch.float32
747
- hidden_states = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
748
- return hidden_states
749
-
750
- @parameterized.expand(
751
- [
752
- # fmt: off
753
- [33, 4, [-0.4424, 0.1510, -0.1937, 0.2118, 0.3746, -0.3957, 0.0160, -0.0435]],
754
- [47, 0.55, [-0.1508, 0.0379, -0.3075, 0.2540, 0.3633, -0.0821, 0.1719, -0.0207]],
755
- [21, 0.89, [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778]],
756
- [9, 1000, [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424]],
757
- # fmt: on
758
- ]
759
- )
760
- @require_torch_gpu
761
- def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
762
- model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4")
763
- latents = self.get_latents(seed)
764
- encoder_hidden_states = self.get_encoder_hidden_states(seed)
765
-
766
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
767
-
768
- with torch.no_grad():
769
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
770
-
771
- assert sample.shape == latents.shape
772
-
773
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
774
- expected_output_slice = torch.tensor(expected_slice)
775
-
776
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
777
-
778
- @parameterized.expand(
779
- [
780
- # fmt: off
781
- [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]],
782
- [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]],
783
- [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]],
784
- [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]],
785
- # fmt: on
786
- ]
787
- )
788
- @require_torch_gpu
789
- def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
790
- model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True)
791
- latents = self.get_latents(seed, fp16=True)
792
- encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
793
-
794
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
795
-
796
- with torch.no_grad():
797
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
798
-
799
- assert sample.shape == latents.shape
800
-
801
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
802
- expected_output_slice = torch.tensor(expected_slice)
803
-
804
- assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
805
-
806
- @parameterized.expand(
807
- [
808
- # fmt: off
809
- [33, 4, [-0.4430, 0.1570, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]],
810
- [47, 0.55, [-0.1415, 0.0129, -0.3136, 0.2257, 0.3430, -0.0536, 0.2114, -0.0436]],
811
- [21, 0.89, [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.1750]],
812
- [9, 1000, [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423]],
813
- # fmt: on
814
- ]
815
- )
816
- @require_torch_gpu
817
- def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
818
- model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5")
819
- latents = self.get_latents(seed)
820
- encoder_hidden_states = self.get_encoder_hidden_states(seed)
821
-
822
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
823
-
824
- with torch.no_grad():
825
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
826
-
827
- assert sample.shape == latents.shape
828
-
829
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
830
- expected_output_slice = torch.tensor(expected_slice)
831
-
832
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
833
-
834
- @parameterized.expand(
835
- [
836
- # fmt: off
837
- [83, 4, [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972]],
838
- [17, 0.55, [-0.1290, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322]],
839
- [8, 0.89, [-0.5283, 0.1198, 0.0870, -0.1141, 0.9189, -0.0150, 0.5474, 0.4319]],
840
- [3, 1000, [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.0280, -1.0020]],
841
- # fmt: on
842
- ]
843
- )
844
- @require_torch_gpu
845
- def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
846
- model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True)
847
- latents = self.get_latents(seed, fp16=True)
848
- encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
849
-
850
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
851
-
852
- with torch.no_grad():
853
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
854
-
855
- assert sample.shape == latents.shape
856
-
857
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
858
- expected_output_slice = torch.tensor(expected_slice)
859
-
860
- assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
861
-
862
- @parameterized.expand(
863
- [
864
- # fmt: off
865
- [33, 4, [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858]],
866
- [47, 0.55, [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073]],
867
- [21, 0.89, [0.0327, 0.4399, -0.6358, 0.3417, 0.4120, -0.5621, -0.0397, -1.0430]],
868
- [9, 1000, [0.1600, 0.7303, -1.0556, -0.3515, -0.7440, -1.2037, -1.8149, -1.8931]],
869
- # fmt: on
870
- ]
871
- )
872
- @require_torch_gpu
873
- def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
874
- model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting")
875
- latents = self.get_latents(seed, shape=(4, 9, 64, 64))
876
- encoder_hidden_states = self.get_encoder_hidden_states(seed)
877
-
878
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
879
-
880
- with torch.no_grad():
881
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
882
-
883
- assert sample.shape == (4, 4, 64, 64)
884
-
885
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
886
- expected_output_slice = torch.tensor(expected_slice)
887
-
888
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
889
-
890
- @parameterized.expand(
891
- [
892
- # fmt: off
893
- [83, 4, [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387]],
894
- [17, 0.55, [0.0975, -0.2856, -0.3508, -0.4600, 0.3376, 0.2930, -0.2747, -0.7026]],
895
- [8, 0.89, [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486]],
896
- [3, 1000, [0.4790, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741]],
897
- # fmt: on
898
- ]
899
- )
900
- @require_torch_gpu
901
- def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice):
902
- model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True)
903
- latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True)
904
- encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
905
-
906
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
907
-
908
- with torch.no_grad():
909
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
910
-
911
- assert sample.shape == (4, 4, 64, 64)
912
-
913
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
914
- expected_output_slice = torch.tensor(expected_slice)
915
-
916
- assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
917
-
918
- @parameterized.expand(
919
- [
920
- # fmt: off
921
- [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]],
922
- [17, 0.55, [0.1164, -0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]],
923
- [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]],
924
- [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]],
925
- # fmt: on
926
- ]
927
- )
928
- @require_torch_gpu
929
- def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice):
930
- model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True)
931
- latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True)
932
- encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True)
933
-
934
- timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device)
935
-
936
- with torch.no_grad():
937
- sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
938
-
939
- assert sample.shape == latents.shape
940
-
941
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
942
- expected_output_slice = torch.tensor(expected_slice)
943
-
944
- assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
diffusers/tests/models/test_models_unet_2d_flax.py DELETED
@@ -1,104 +0,0 @@
1
- import gc
2
- import unittest
3
-
4
- from parameterized import parameterized
5
-
6
- from diffusers import FlaxUNet2DConditionModel
7
- from diffusers.utils import is_flax_available
8
- from diffusers.utils.testing_utils import load_hf_numpy, require_flax, slow
9
-
10
-
11
- if is_flax_available():
12
- import jax
13
- import jax.numpy as jnp
14
-
15
-
16
- @slow
17
- @require_flax
18
- class FlaxUNet2DConditionModelIntegrationTests(unittest.TestCase):
19
- def get_file_format(self, seed, shape):
20
- return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
21
-
22
- def tearDown(self):
23
- # clean up the VRAM after each test
24
- super().tearDown()
25
- gc.collect()
26
-
27
- def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
28
- dtype = jnp.bfloat16 if fp16 else jnp.float32
29
- image = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype)
30
- return image
31
-
32
- def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
33
- dtype = jnp.bfloat16 if fp16 else jnp.float32
34
- revision = "bf16" if fp16 else None
35
-
36
- model, params = FlaxUNet2DConditionModel.from_pretrained(
37
- model_id, subfolder="unet", dtype=dtype, revision=revision
38
- )
39
- return model, params
40
-
41
- def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
42
- dtype = jnp.bfloat16 if fp16 else jnp.float32
43
- hidden_states = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype)
44
- return hidden_states
45
-
46
- @parameterized.expand(
47
- [
48
- # fmt: off
49
- [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]],
50
- [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]],
51
- [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]],
52
- [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]],
53
- # fmt: on
54
- ]
55
- )
56
- def test_compvis_sd_v1_4_flax_vs_torch_fp16(self, seed, timestep, expected_slice):
57
- model, params = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True)
58
- latents = self.get_latents(seed, fp16=True)
59
- encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
60
-
61
- sample = model.apply(
62
- {"params": params},
63
- latents,
64
- jnp.array(timestep, dtype=jnp.int32),
65
- encoder_hidden_states=encoder_hidden_states,
66
- ).sample
67
-
68
- assert sample.shape == latents.shape
69
-
70
- output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32)
71
- expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32)
72
-
73
- # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, on the same hardware
74
- assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2)
75
-
76
- @parameterized.expand(
77
- [
78
- # fmt: off
79
- [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]],
80
- [17, 0.55, [0.1164, -0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]],
81
- [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]],
82
- [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]],
83
- # fmt: on
84
- ]
85
- )
86
- def test_stabilityai_sd_v2_flax_vs_torch_fp16(self, seed, timestep, expected_slice):
87
- model, params = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True)
88
- latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True)
89
- encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True)
90
-
91
- sample = model.apply(
92
- {"params": params},
93
- latents,
94
- jnp.array(timestep, dtype=jnp.int32),
95
- encoder_hidden_states=encoder_hidden_states,
96
- ).sample
97
-
98
- assert sample.shape == latents.shape
99
-
100
- output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32)
101
- expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32)
102
-
103
- # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, on the same hardware
104
- assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2)
diffusers/tests/models/test_models_unet_3d_condition.py DELETED
@@ -1,241 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
-
21
- from diffusers.models import ModelMixin, UNet3DConditionModel
22
- from diffusers.models.attention_processor import LoRAAttnProcessor
23
- from diffusers.utils import (
24
- floats_tensor,
25
- logging,
26
- skip_mps,
27
- torch_device,
28
- )
29
- from diffusers.utils.import_utils import is_xformers_available
30
-
31
- from ..test_modeling_common import ModelTesterMixin
32
-
33
-
34
- logger = logging.get_logger(__name__)
35
- torch.backends.cuda.matmul.allow_tf32 = False
36
-
37
-
38
- def create_lora_layers(model):
39
- lora_attn_procs = {}
40
- for name in model.attn_processors.keys():
41
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
42
- if name.startswith("mid_block"):
43
- hidden_size = model.config.block_out_channels[-1]
44
- elif name.startswith("up_blocks"):
45
- block_id = int(name[len("up_blocks.")])
46
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
47
- elif name.startswith("down_blocks"):
48
- block_id = int(name[len("down_blocks.")])
49
- hidden_size = model.config.block_out_channels[block_id]
50
-
51
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
52
- lora_attn_procs[name] = lora_attn_procs[name].to(model.device)
53
-
54
- # add 1 to weights to mock trained weights
55
- with torch.no_grad():
56
- lora_attn_procs[name].to_q_lora.up.weight += 1
57
- lora_attn_procs[name].to_k_lora.up.weight += 1
58
- lora_attn_procs[name].to_v_lora.up.weight += 1
59
- lora_attn_procs[name].to_out_lora.up.weight += 1
60
-
61
- return lora_attn_procs
62
-
63
-
64
- @skip_mps
65
- class UNet3DConditionModelTests(ModelTesterMixin, unittest.TestCase):
66
- model_class = UNet3DConditionModel
67
-
68
- @property
69
- def dummy_input(self):
70
- batch_size = 4
71
- num_channels = 4
72
- num_frames = 4
73
- sizes = (32, 32)
74
-
75
- noise = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device)
76
- time_step = torch.tensor([10]).to(torch_device)
77
- encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device)
78
-
79
- return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
80
-
81
- @property
82
- def input_shape(self):
83
- return (4, 4, 32, 32)
84
-
85
- @property
86
- def output_shape(self):
87
- return (4, 4, 32, 32)
88
-
89
- def prepare_init_args_and_inputs_for_common(self):
90
- init_dict = {
91
- "block_out_channels": (32, 64),
92
- "down_block_types": (
93
- "CrossAttnDownBlock3D",
94
- "DownBlock3D",
95
- ),
96
- "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"),
97
- "cross_attention_dim": 32,
98
- "attention_head_dim": 8,
99
- "out_channels": 4,
100
- "in_channels": 4,
101
- "layers_per_block": 1,
102
- "sample_size": 32,
103
- }
104
- inputs_dict = self.dummy_input
105
- return init_dict, inputs_dict
106
-
107
- @unittest.skipIf(
108
- torch_device != "cuda" or not is_xformers_available(),
109
- reason="XFormers attention is only available with CUDA and `xformers` installed",
110
- )
111
- def test_xformers_enable_works(self):
112
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
113
- model = self.model_class(**init_dict)
114
-
115
- model.enable_xformers_memory_efficient_attention()
116
-
117
- assert (
118
- model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
119
- == "XFormersAttnProcessor"
120
- ), "xformers is not enabled"
121
-
122
- # Overriding to set `norm_num_groups` needs to be different for this model.
123
- def test_forward_with_norm_groups(self):
124
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
125
-
126
- init_dict["norm_num_groups"] = 32
127
-
128
- model = self.model_class(**init_dict)
129
- model.to(torch_device)
130
- model.eval()
131
-
132
- with torch.no_grad():
133
- output = model(**inputs_dict)
134
-
135
- if isinstance(output, dict):
136
- output = output.sample
137
-
138
- self.assertIsNotNone(output)
139
- expected_shape = inputs_dict["sample"].shape
140
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
141
-
142
- # Overriding since the UNet3D outputs a different structure.
143
- def test_determinism(self):
144
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
145
- model = self.model_class(**init_dict)
146
- model.to(torch_device)
147
- model.eval()
148
-
149
- with torch.no_grad():
150
- # Warmup pass when using mps (see #372)
151
- if torch_device == "mps" and isinstance(model, ModelMixin):
152
- model(**self.dummy_input)
153
-
154
- first = model(**inputs_dict)
155
- if isinstance(first, dict):
156
- first = first.sample
157
-
158
- second = model(**inputs_dict)
159
- if isinstance(second, dict):
160
- second = second.sample
161
-
162
- out_1 = first.cpu().numpy()
163
- out_2 = second.cpu().numpy()
164
- out_1 = out_1[~np.isnan(out_1)]
165
- out_2 = out_2[~np.isnan(out_2)]
166
- max_diff = np.amax(np.abs(out_1 - out_2))
167
- self.assertLessEqual(max_diff, 1e-5)
168
-
169
- def test_model_attention_slicing(self):
170
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
171
-
172
- init_dict["attention_head_dim"] = 8
173
-
174
- model = self.model_class(**init_dict)
175
- model.to(torch_device)
176
- model.eval()
177
-
178
- model.set_attention_slice("auto")
179
- with torch.no_grad():
180
- output = model(**inputs_dict)
181
- assert output is not None
182
-
183
- model.set_attention_slice("max")
184
- with torch.no_grad():
185
- output = model(**inputs_dict)
186
- assert output is not None
187
-
188
- model.set_attention_slice(2)
189
- with torch.no_grad():
190
- output = model(**inputs_dict)
191
- assert output is not None
192
-
193
- # (`attn_processors`) needs to be implemented in this model for this test.
194
- # def test_lora_processors(self):
195
-
196
- # (`attn_processors`) needs to be implemented in this model for this test.
197
- # def test_lora_save_load(self):
198
-
199
- # (`attn_processors`) needs to be implemented for this test in the model.
200
- # def test_lora_save_load_safetensors(self):
201
-
202
- # (`attn_processors`) needs to be implemented for this test in the model.
203
- # def test_lora_save_safetensors_load_torch(self):
204
-
205
- # (`attn_processors`) needs to be implemented for this test.
206
- # def test_lora_save_torch_force_load_safetensors_error(self):
207
-
208
- # (`attn_processors`) needs to be added for this test.
209
- # def test_lora_on_off(self):
210
-
211
- @unittest.skipIf(
212
- torch_device != "cuda" or not is_xformers_available(),
213
- reason="XFormers attention is only available with CUDA and `xformers` installed",
214
- )
215
- def test_lora_xformers_on_off(self):
216
- # enable deterministic behavior for gradient checkpointing
217
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
218
-
219
- init_dict["attention_head_dim"] = 4
220
-
221
- torch.manual_seed(0)
222
- model = self.model_class(**init_dict)
223
- model.to(torch_device)
224
- lora_attn_procs = create_lora_layers(model)
225
- model.set_attn_processor(lora_attn_procs)
226
-
227
- # default
228
- with torch.no_grad():
229
- sample = model(**inputs_dict).sample
230
-
231
- model.enable_xformers_memory_efficient_attention()
232
- on_sample = model(**inputs_dict).sample
233
-
234
- model.disable_xformers_memory_efficient_attention()
235
- off_sample = model(**inputs_dict).sample
236
-
237
- assert (sample - on_sample).abs().max() < 1e-4
238
- assert (sample - off_sample).abs().max() < 1e-4
239
-
240
-
241
- # (todo: sayakpaul) implement SLOW tests.
diffusers/tests/models/test_models_vae.py DELETED
@@ -1,345 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import torch
20
- from parameterized import parameterized
21
-
22
- from diffusers import AutoencoderKL
23
- from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device
24
-
25
- from ..test_modeling_common import ModelTesterMixin
26
-
27
-
28
- torch.backends.cuda.matmul.allow_tf32 = False
29
-
30
-
31
- class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase):
32
- model_class = AutoencoderKL
33
-
34
- @property
35
- def dummy_input(self):
36
- batch_size = 4
37
- num_channels = 3
38
- sizes = (32, 32)
39
-
40
- image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
41
-
42
- return {"sample": image}
43
-
44
- @property
45
- def input_shape(self):
46
- return (3, 32, 32)
47
-
48
- @property
49
- def output_shape(self):
50
- return (3, 32, 32)
51
-
52
- def prepare_init_args_and_inputs_for_common(self):
53
- init_dict = {
54
- "block_out_channels": [32, 64],
55
- "in_channels": 3,
56
- "out_channels": 3,
57
- "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
58
- "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
59
- "latent_channels": 4,
60
- }
61
- inputs_dict = self.dummy_input
62
- return init_dict, inputs_dict
63
-
64
- def test_forward_signature(self):
65
- pass
66
-
67
- def test_training(self):
68
- pass
69
-
70
- @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS")
71
- def test_gradient_checkpointing(self):
72
- # enable deterministic behavior for gradient checkpointing
73
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
74
- model = self.model_class(**init_dict)
75
- model.to(torch_device)
76
-
77
- assert not model.is_gradient_checkpointing and model.training
78
-
79
- out = model(**inputs_dict).sample
80
- # run the backward pass on the model. For simplicity, we don't compute a real loss
81
- # and instead backprop on the mean difference between the output and random labels
82
- model.zero_grad()
83
-
84
- labels = torch.randn_like(out)
85
- loss = (out - labels).mean()
86
- loss.backward()
87
-
88
- # re-instantiate the model now enabling gradient checkpointing
89
- model_2 = self.model_class(**init_dict)
90
- # clone model
91
- model_2.load_state_dict(model.state_dict())
92
- model_2.to(torch_device)
93
- model_2.enable_gradient_checkpointing()
94
-
95
- assert model_2.is_gradient_checkpointing and model_2.training
96
-
97
- out_2 = model_2(**inputs_dict).sample
98
- # run the backward pass on the model. For simplicity, we don't compute a real loss
99
- # and instead backprop on the mean difference between the output and random labels
100
- model_2.zero_grad()
101
- loss_2 = (out_2 - labels).mean()
102
- loss_2.backward()
103
-
104
- # compare the output and parameters gradients
105
- self.assertTrue((loss - loss_2).abs() < 1e-5)
106
- named_params = dict(model.named_parameters())
107
- named_params_2 = dict(model_2.named_parameters())
108
- for name, param in named_params.items():
109
- self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
110
-
111
- def test_from_pretrained_hub(self):
112
- model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True)
113
- self.assertIsNotNone(model)
114
- self.assertEqual(len(loading_info["missing_keys"]), 0)
115
-
116
- model.to(torch_device)
117
- image = model(**self.dummy_input)
118
-
119
- assert image is not None, "Make sure output is not None"
120
-
121
- def test_output_pretrained(self):
122
- model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy")
123
- model = model.to(torch_device)
124
- model.eval()
125
-
126
- if torch_device == "mps":
127
- generator = torch.manual_seed(0)
128
- else:
129
- generator = torch.Generator(device=torch_device).manual_seed(0)
130
-
131
- image = torch.randn(
132
- 1,
133
- model.config.in_channels,
134
- model.config.sample_size,
135
- model.config.sample_size,
136
- generator=torch.manual_seed(0),
137
- )
138
- image = image.to(torch_device)
139
- with torch.no_grad():
140
- output = model(image, sample_posterior=True, generator=generator).sample
141
-
142
- output_slice = output[0, -1, -3:, -3:].flatten().cpu()
143
-
144
- # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
145
- # the expected output slices are not the same for CPU and GPU.
146
- if torch_device == "mps":
147
- expected_output_slice = torch.tensor(
148
- [
149
- -4.0078e-01,
150
- -3.8323e-04,
151
- -1.2681e-01,
152
- -1.1462e-01,
153
- 2.0095e-01,
154
- 1.0893e-01,
155
- -8.8247e-02,
156
- -3.0361e-01,
157
- -9.8644e-03,
158
- ]
159
- )
160
- elif torch_device == "cpu":
161
- expected_output_slice = torch.tensor(
162
- [-0.1352, 0.0878, 0.0419, -0.0818, -0.1069, 0.0688, -0.1458, -0.4446, -0.0026]
163
- )
164
- else:
165
- expected_output_slice = torch.tensor(
166
- [-0.2421, 0.4642, 0.2507, -0.0438, 0.0682, 0.3160, -0.2018, -0.0727, 0.2485]
167
- )
168
-
169
- self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
170
-
171
-
172
- @slow
173
- class AutoencoderKLIntegrationTests(unittest.TestCase):
174
- def get_file_format(self, seed, shape):
175
- return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
176
-
177
- def tearDown(self):
178
- # clean up the VRAM after each test
179
- super().tearDown()
180
- gc.collect()
181
- torch.cuda.empty_cache()
182
-
183
- def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
184
- dtype = torch.float16 if fp16 else torch.float32
185
- image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
186
- return image
187
-
188
- def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):
189
- revision = "fp16" if fp16 else None
190
- torch_dtype = torch.float16 if fp16 else torch.float32
191
-
192
- model = AutoencoderKL.from_pretrained(
193
- model_id,
194
- subfolder="vae",
195
- torch_dtype=torch_dtype,
196
- revision=revision,
197
- )
198
- model.to(torch_device).eval()
199
-
200
- return model
201
-
202
- def get_generator(self, seed=0):
203
- if torch_device == "mps":
204
- return torch.manual_seed(seed)
205
- return torch.Generator(device=torch_device).manual_seed(seed)
206
-
207
- @parameterized.expand(
208
- [
209
- # fmt: off
210
- [33, [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824]],
211
- [47, [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089], [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131]],
212
- # fmt: on
213
- ]
214
- )
215
- def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps):
216
- model = self.get_sd_vae_model()
217
- image = self.get_sd_image(seed)
218
- generator = self.get_generator(seed)
219
-
220
- with torch.no_grad():
221
- sample = model(image, generator=generator, sample_posterior=True).sample
222
-
223
- assert sample.shape == image.shape
224
-
225
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
226
- expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
227
-
228
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
229
-
230
- @parameterized.expand(
231
- [
232
- # fmt: off
233
- [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]],
234
- [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]],
235
- # fmt: on
236
- ]
237
- )
238
- @require_torch_gpu
239
- def test_stable_diffusion_fp16(self, seed, expected_slice):
240
- model = self.get_sd_vae_model(fp16=True)
241
- image = self.get_sd_image(seed, fp16=True)
242
- generator = self.get_generator(seed)
243
-
244
- with torch.no_grad():
245
- sample = model(image, generator=generator, sample_posterior=True).sample
246
-
247
- assert sample.shape == image.shape
248
-
249
- output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
250
- expected_output_slice = torch.tensor(expected_slice)
251
-
252
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-2)
253
-
254
- @parameterized.expand(
255
- [
256
- # fmt: off
257
- [33, [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824]],
258
- [47, [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131]],
259
- # fmt: on
260
- ]
261
- )
262
- def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps):
263
- model = self.get_sd_vae_model()
264
- image = self.get_sd_image(seed)
265
-
266
- with torch.no_grad():
267
- sample = model(image).sample
268
-
269
- assert sample.shape == image.shape
270
-
271
- output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
272
- expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice)
273
-
274
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
275
-
276
- @parameterized.expand(
277
- [
278
- # fmt: off
279
- [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]],
280
- [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]],
281
- # fmt: on
282
- ]
283
- )
284
- @require_torch_gpu
285
- def test_stable_diffusion_decode(self, seed, expected_slice):
286
- model = self.get_sd_vae_model()
287
- encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
288
-
289
- with torch.no_grad():
290
- sample = model.decode(encoding).sample
291
-
292
- assert list(sample.shape) == [3, 3, 512, 512]
293
-
294
- output_slice = sample[-1, -2:, :2, -2:].flatten().cpu()
295
- expected_output_slice = torch.tensor(expected_slice)
296
-
297
- assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
298
-
299
- @parameterized.expand(
300
- [
301
- # fmt: off
302
- [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]],
303
- [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]],
304
- # fmt: on
305
- ]
306
- )
307
- @require_torch_gpu
308
- def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
309
- model = self.get_sd_vae_model(fp16=True)
310
- encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True)
311
-
312
- with torch.no_grad():
313
- sample = model.decode(encoding).sample
314
-
315
- assert list(sample.shape) == [3, 3, 512, 512]
316
-
317
- output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
318
- expected_output_slice = torch.tensor(expected_slice)
319
-
320
- assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
321
-
322
- @parameterized.expand(
323
- [
324
- # fmt: off
325
- [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]],
326
- [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]],
327
- # fmt: on
328
- ]
329
- )
330
- def test_stable_diffusion_encode_sample(self, seed, expected_slice):
331
- model = self.get_sd_vae_model()
332
- image = self.get_sd_image(seed)
333
- generator = self.get_generator(seed)
334
-
335
- with torch.no_grad():
336
- dist = model.encode(image).latent_dist
337
- sample = dist.sample(generator=generator)
338
-
339
- assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]]
340
-
341
- output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
342
- expected_output_slice = torch.tensor(expected_slice)
343
-
344
- tolerance = 1e-3 if torch_device != "mps" else 1e-2
345
- assert torch_all_close(output_slice, expected_output_slice, atol=tolerance)
diffusers/tests/models/test_models_vae_flax.py DELETED
@@ -1,39 +0,0 @@
1
- import unittest
2
-
3
- from diffusers import FlaxAutoencoderKL
4
- from diffusers.utils import is_flax_available
5
- from diffusers.utils.testing_utils import require_flax
6
-
7
- from ..test_modeling_common_flax import FlaxModelTesterMixin
8
-
9
-
10
- if is_flax_available():
11
- import jax
12
-
13
-
14
- @require_flax
15
- class FlaxAutoencoderKLTests(FlaxModelTesterMixin, unittest.TestCase):
16
- model_class = FlaxAutoencoderKL
17
-
18
- @property
19
- def dummy_input(self):
20
- batch_size = 4
21
- num_channels = 3
22
- sizes = (32, 32)
23
-
24
- prng_key = jax.random.PRNGKey(0)
25
- image = jax.random.uniform(prng_key, ((batch_size, num_channels) + sizes))
26
-
27
- return {"sample": image, "prng_key": prng_key}
28
-
29
- def prepare_init_args_and_inputs_for_common(self):
30
- init_dict = {
31
- "block_out_channels": [32, 64],
32
- "in_channels": 3,
33
- "out_channels": 3,
34
- "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
35
- "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
36
- "latent_channels": 4,
37
- }
38
- inputs_dict = self.dummy_input
39
- return init_dict, inputs_dict
diffusers/tests/models/test_models_vq.py DELETED
@@ -1,94 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import torch
19
-
20
- from diffusers import VQModel
21
- from diffusers.utils import floats_tensor, torch_device
22
-
23
- from ..test_modeling_common import ModelTesterMixin
24
-
25
-
26
- torch.backends.cuda.matmul.allow_tf32 = False
27
-
28
-
29
- class VQModelTests(ModelTesterMixin, unittest.TestCase):
30
- model_class = VQModel
31
-
32
- @property
33
- def dummy_input(self, sizes=(32, 32)):
34
- batch_size = 4
35
- num_channels = 3
36
-
37
- image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
38
-
39
- return {"sample": image}
40
-
41
- @property
42
- def input_shape(self):
43
- return (3, 32, 32)
44
-
45
- @property
46
- def output_shape(self):
47
- return (3, 32, 32)
48
-
49
- def prepare_init_args_and_inputs_for_common(self):
50
- init_dict = {
51
- "block_out_channels": [32, 64],
52
- "in_channels": 3,
53
- "out_channels": 3,
54
- "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
55
- "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
56
- "latent_channels": 3,
57
- }
58
- inputs_dict = self.dummy_input
59
- return init_dict, inputs_dict
60
-
61
- def test_forward_signature(self):
62
- pass
63
-
64
- def test_training(self):
65
- pass
66
-
67
- def test_from_pretrained_hub(self):
68
- model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True)
69
- self.assertIsNotNone(model)
70
- self.assertEqual(len(loading_info["missing_keys"]), 0)
71
-
72
- model.to(torch_device)
73
- image = model(**self.dummy_input)
74
-
75
- assert image is not None, "Make sure output is not None"
76
-
77
- def test_output_pretrained(self):
78
- model = VQModel.from_pretrained("fusing/vqgan-dummy")
79
- model.to(torch_device).eval()
80
-
81
- torch.manual_seed(0)
82
- if torch.cuda.is_available():
83
- torch.cuda.manual_seed_all(0)
84
-
85
- image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
86
- image = image.to(torch_device)
87
- with torch.no_grad():
88
- output = model(image).sample
89
-
90
- output_slice = output[0, -1, -3:, -3:].flatten().cpu()
91
- # fmt: off
92
- expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143])
93
- # fmt: on
94
- self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
diffusers/tests/pipeline_params.py DELETED
@@ -1,121 +0,0 @@
1
- # These are canonical sets of parameters for different types of pipelines.
2
- # They are set on subclasses of `PipelineTesterMixin` as `params` and
3
- # `batch_params`.
4
- #
5
- # If a pipeline's set of arguments has minor changes from one of the common sets
6
- # of arguments, do not make modifications to the existing common sets of arguments.
7
- # I.e. a text to image pipeline with non-configurable height and width arguments
8
- # should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`.
9
-
10
- TEXT_TO_IMAGE_PARAMS = frozenset(
11
- [
12
- "prompt",
13
- "height",
14
- "width",
15
- "guidance_scale",
16
- "negative_prompt",
17
- "prompt_embeds",
18
- "negative_prompt_embeds",
19
- "cross_attention_kwargs",
20
- ]
21
- )
22
-
23
- TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
24
-
25
- IMAGE_VARIATION_PARAMS = frozenset(
26
- [
27
- "image",
28
- "height",
29
- "width",
30
- "guidance_scale",
31
- ]
32
- )
33
-
34
- IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"])
35
-
36
- TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset(
37
- [
38
- "prompt",
39
- "image",
40
- "height",
41
- "width",
42
- "guidance_scale",
43
- "negative_prompt",
44
- "prompt_embeds",
45
- "negative_prompt_embeds",
46
- ]
47
- )
48
-
49
- TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"])
50
-
51
- TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
52
- [
53
- # Text guided image variation with an image mask
54
- "prompt",
55
- "image",
56
- "mask_image",
57
- "height",
58
- "width",
59
- "guidance_scale",
60
- "negative_prompt",
61
- "prompt_embeds",
62
- "negative_prompt_embeds",
63
- ]
64
- )
65
-
66
- TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"])
67
-
68
- IMAGE_INPAINTING_PARAMS = frozenset(
69
- [
70
- # image variation with an image mask
71
- "image",
72
- "mask_image",
73
- "height",
74
- "width",
75
- "guidance_scale",
76
- ]
77
- )
78
-
79
- IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"])
80
-
81
- IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset(
82
- [
83
- "example_image",
84
- "image",
85
- "mask_image",
86
- "height",
87
- "width",
88
- "guidance_scale",
89
- ]
90
- )
91
-
92
- IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"])
93
-
94
- CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"])
95
-
96
- CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS = frozenset(["class_labels"])
97
-
98
- UNCONDITIONAL_IMAGE_GENERATION_PARAMS = frozenset(["batch_size"])
99
-
100
- UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS = frozenset([])
101
-
102
- UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"])
103
-
104
- UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([])
105
-
106
- TEXT_TO_AUDIO_PARAMS = frozenset(
107
- [
108
- "prompt",
109
- "audio_length_in_s",
110
- "guidance_scale",
111
- "negative_prompt",
112
- "prompt_embeds",
113
- "negative_prompt_embeds",
114
- "cross_attention_kwargs",
115
- ]
116
- )
117
-
118
- TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"])
119
- TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"])
120
-
121
- TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"])
diffusers/tests/pipelines/__init__.py DELETED
File without changes
diffusers/tests/pipelines/altdiffusion/__init__.py DELETED
File without changes
diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py DELETED
@@ -1,244 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
- from transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer
22
-
23
- from diffusers import AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, PNDMScheduler, UNet2DConditionModel
24
- from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
25
- RobertaSeriesConfig,
26
- RobertaSeriesModelWithTransformation,
27
- )
28
- from diffusers.utils import slow, torch_device
29
- from diffusers.utils.testing_utils import require_torch_gpu
30
-
31
- from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
32
- from ...test_pipelines_common import PipelineTesterMixin
33
-
34
-
35
- torch.backends.cuda.matmul.allow_tf32 = False
36
-
37
-
38
- class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
39
- pipeline_class = AltDiffusionPipeline
40
- params = TEXT_TO_IMAGE_PARAMS
41
- batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
42
-
43
- def get_dummy_components(self):
44
- torch.manual_seed(0)
45
- unet = UNet2DConditionModel(
46
- block_out_channels=(32, 64),
47
- layers_per_block=2,
48
- sample_size=32,
49
- in_channels=4,
50
- out_channels=4,
51
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
52
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
53
- cross_attention_dim=32,
54
- )
55
- scheduler = DDIMScheduler(
56
- beta_start=0.00085,
57
- beta_end=0.012,
58
- beta_schedule="scaled_linear",
59
- clip_sample=False,
60
- set_alpha_to_one=False,
61
- )
62
- torch.manual_seed(0)
63
- vae = AutoencoderKL(
64
- block_out_channels=[32, 64],
65
- in_channels=3,
66
- out_channels=3,
67
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
68
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
69
- latent_channels=4,
70
- )
71
-
72
- # TODO: address the non-deterministic text encoder (fails for save-load tests)
73
- # torch.manual_seed(0)
74
- # text_encoder_config = RobertaSeriesConfig(
75
- # hidden_size=32,
76
- # project_dim=32,
77
- # intermediate_size=37,
78
- # layer_norm_eps=1e-05,
79
- # num_attention_heads=4,
80
- # num_hidden_layers=5,
81
- # vocab_size=5002,
82
- # )
83
- # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
84
-
85
- torch.manual_seed(0)
86
- text_encoder_config = CLIPTextConfig(
87
- bos_token_id=0,
88
- eos_token_id=2,
89
- hidden_size=32,
90
- projection_dim=32,
91
- intermediate_size=37,
92
- layer_norm_eps=1e-05,
93
- num_attention_heads=4,
94
- num_hidden_layers=5,
95
- pad_token_id=1,
96
- vocab_size=5002,
97
- )
98
- text_encoder = CLIPTextModel(text_encoder_config)
99
-
100
- tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
101
- tokenizer.model_max_length = 77
102
-
103
- components = {
104
- "unet": unet,
105
- "scheduler": scheduler,
106
- "vae": vae,
107
- "text_encoder": text_encoder,
108
- "tokenizer": tokenizer,
109
- "safety_checker": None,
110
- "feature_extractor": None,
111
- }
112
- return components
113
-
114
- def get_dummy_inputs(self, device, seed=0):
115
- if str(device).startswith("mps"):
116
- generator = torch.manual_seed(seed)
117
- else:
118
- generator = torch.Generator(device=device).manual_seed(seed)
119
- inputs = {
120
- "prompt": "A painting of a squirrel eating a burger",
121
- "generator": generator,
122
- "num_inference_steps": 2,
123
- "guidance_scale": 6.0,
124
- "output_type": "numpy",
125
- }
126
- return inputs
127
-
128
- def test_alt_diffusion_ddim(self):
129
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
130
-
131
- components = self.get_dummy_components()
132
- torch.manual_seed(0)
133
- text_encoder_config = RobertaSeriesConfig(
134
- hidden_size=32,
135
- project_dim=32,
136
- intermediate_size=37,
137
- layer_norm_eps=1e-05,
138
- num_attention_heads=4,
139
- num_hidden_layers=5,
140
- vocab_size=5002,
141
- )
142
- # TODO: remove after fixing the non-deterministic text encoder
143
- text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
144
- components["text_encoder"] = text_encoder
145
-
146
- alt_pipe = AltDiffusionPipeline(**components)
147
- alt_pipe = alt_pipe.to(device)
148
- alt_pipe.set_progress_bar_config(disable=None)
149
-
150
- inputs = self.get_dummy_inputs(device)
151
- inputs["prompt"] = "A photo of an astronaut"
152
- output = alt_pipe(**inputs)
153
- image = output.images
154
- image_slice = image[0, -3:, -3:, -1]
155
-
156
- assert image.shape == (1, 64, 64, 3)
157
- expected_slice = np.array(
158
- [0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093]
159
- )
160
-
161
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
162
-
163
- def test_alt_diffusion_pndm(self):
164
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
165
-
166
- components = self.get_dummy_components()
167
- components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
168
- torch.manual_seed(0)
169
- text_encoder_config = RobertaSeriesConfig(
170
- hidden_size=32,
171
- project_dim=32,
172
- intermediate_size=37,
173
- layer_norm_eps=1e-05,
174
- num_attention_heads=4,
175
- num_hidden_layers=5,
176
- vocab_size=5002,
177
- )
178
- # TODO: remove after fixing the non-deterministic text encoder
179
- text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config)
180
- components["text_encoder"] = text_encoder
181
- alt_pipe = AltDiffusionPipeline(**components)
182
- alt_pipe = alt_pipe.to(device)
183
- alt_pipe.set_progress_bar_config(disable=None)
184
-
185
- inputs = self.get_dummy_inputs(device)
186
- output = alt_pipe(**inputs)
187
- image = output.images
188
- image_slice = image[0, -3:, -3:, -1]
189
-
190
- assert image.shape == (1, 64, 64, 3)
191
- expected_slice = np.array(
192
- [0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237]
193
- )
194
-
195
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
196
-
197
-
198
- @slow
199
- @require_torch_gpu
200
- class AltDiffusionPipelineIntegrationTests(unittest.TestCase):
201
- def tearDown(self):
202
- # clean up the VRAM after each test
203
- super().tearDown()
204
- gc.collect()
205
- torch.cuda.empty_cache()
206
-
207
- def test_alt_diffusion(self):
208
- # make sure here that pndm scheduler skips prk
209
- alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None)
210
- alt_pipe = alt_pipe.to(torch_device)
211
- alt_pipe.set_progress_bar_config(disable=None)
212
-
213
- prompt = "A painting of a squirrel eating a burger"
214
- generator = torch.manual_seed(0)
215
- output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np")
216
-
217
- image = output.images
218
-
219
- image_slice = image[0, -3:, -3:, -1]
220
-
221
- assert image.shape == (1, 512, 512, 3)
222
- expected_slice = np.array([0.1010, 0.0800, 0.0794, 0.0885, 0.0843, 0.0762, 0.0769, 0.0729, 0.0586])
223
-
224
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
225
-
226
- def test_alt_diffusion_fast_ddim(self):
227
- scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler")
228
-
229
- alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None)
230
- alt_pipe = alt_pipe.to(torch_device)
231
- alt_pipe.set_progress_bar_config(disable=None)
232
-
233
- prompt = "A painting of a squirrel eating a burger"
234
- generator = torch.manual_seed(0)
235
-
236
- output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
237
- image = output.images
238
-
239
- image_slice = image[0, -3:, -3:, -1]
240
-
241
- assert image.shape == (1, 512, 512, 3)
242
- expected_slice = np.array([0.4019, 0.4052, 0.3810, 0.4119, 0.3916, 0.3982, 0.4651, 0.4195, 0.5323])
243
-
244
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py DELETED
@@ -1,299 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import random
18
- import unittest
19
-
20
- import numpy as np
21
- import torch
22
- from transformers import XLMRobertaTokenizer
23
-
24
- from diffusers import (
25
- AltDiffusionImg2ImgPipeline,
26
- AutoencoderKL,
27
- PNDMScheduler,
28
- UNet2DConditionModel,
29
- )
30
- from diffusers.image_processor import VaeImageProcessor
31
- from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
32
- RobertaSeriesConfig,
33
- RobertaSeriesModelWithTransformation,
34
- )
35
- from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
36
- from diffusers.utils.testing_utils import require_torch_gpu
37
-
38
-
39
- torch.backends.cuda.matmul.allow_tf32 = False
40
-
41
-
42
- class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
43
- def tearDown(self):
44
- # clean up the VRAM after each test
45
- super().tearDown()
46
- gc.collect()
47
- torch.cuda.empty_cache()
48
-
49
- @property
50
- def dummy_image(self):
51
- batch_size = 1
52
- num_channels = 3
53
- sizes = (32, 32)
54
-
55
- image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
56
- return image
57
-
58
- @property
59
- def dummy_cond_unet(self):
60
- torch.manual_seed(0)
61
- model = UNet2DConditionModel(
62
- block_out_channels=(32, 64),
63
- layers_per_block=2,
64
- sample_size=32,
65
- in_channels=4,
66
- out_channels=4,
67
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
68
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
69
- cross_attention_dim=32,
70
- )
71
- return model
72
-
73
- @property
74
- def dummy_vae(self):
75
- torch.manual_seed(0)
76
- model = AutoencoderKL(
77
- block_out_channels=[32, 64],
78
- in_channels=3,
79
- out_channels=3,
80
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
81
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
82
- latent_channels=4,
83
- )
84
- return model
85
-
86
- @property
87
- def dummy_text_encoder(self):
88
- torch.manual_seed(0)
89
- config = RobertaSeriesConfig(
90
- hidden_size=32,
91
- project_dim=32,
92
- intermediate_size=37,
93
- layer_norm_eps=1e-05,
94
- num_attention_heads=4,
95
- num_hidden_layers=5,
96
- pad_token_id=1,
97
- vocab_size=5006,
98
- )
99
- return RobertaSeriesModelWithTransformation(config)
100
-
101
- @property
102
- def dummy_extractor(self):
103
- def extract(*args, **kwargs):
104
- class Out:
105
- def __init__(self):
106
- self.pixel_values = torch.ones([0])
107
-
108
- def to(self, device):
109
- self.pixel_values.to(device)
110
- return self
111
-
112
- return Out()
113
-
114
- return extract
115
-
116
- def test_stable_diffusion_img2img_default_case(self):
117
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
118
- unet = self.dummy_cond_unet
119
- scheduler = PNDMScheduler(skip_prk_steps=True)
120
- vae = self.dummy_vae
121
- bert = self.dummy_text_encoder
122
- tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
123
- tokenizer.model_max_length = 77
124
-
125
- init_image = self.dummy_image.to(device)
126
-
127
- # make sure here that pndm scheduler skips prk
128
- alt_pipe = AltDiffusionImg2ImgPipeline(
129
- unet=unet,
130
- scheduler=scheduler,
131
- vae=vae,
132
- text_encoder=bert,
133
- tokenizer=tokenizer,
134
- safety_checker=None,
135
- feature_extractor=self.dummy_extractor,
136
- )
137
- alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
138
- alt_pipe = alt_pipe.to(device)
139
- alt_pipe.set_progress_bar_config(disable=None)
140
-
141
- prompt = "A painting of a squirrel eating a burger"
142
- generator = torch.Generator(device=device).manual_seed(0)
143
- output = alt_pipe(
144
- [prompt],
145
- generator=generator,
146
- guidance_scale=6.0,
147
- num_inference_steps=2,
148
- output_type="np",
149
- image=init_image,
150
- )
151
-
152
- image = output.images
153
-
154
- generator = torch.Generator(device=device).manual_seed(0)
155
- image_from_tuple = alt_pipe(
156
- [prompt],
157
- generator=generator,
158
- guidance_scale=6.0,
159
- num_inference_steps=2,
160
- output_type="np",
161
- image=init_image,
162
- return_dict=False,
163
- )[0]
164
-
165
- image_slice = image[0, -3:, -3:, -1]
166
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
167
-
168
- assert image.shape == (1, 32, 32, 3)
169
- expected_slice = np.array([0.4115, 0.3870, 0.4089, 0.4807, 0.4668, 0.4144, 0.4151, 0.4721, 0.4569])
170
-
171
- assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
172
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3
173
-
174
- @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
175
- def test_stable_diffusion_img2img_fp16(self):
176
- """Test that stable diffusion img2img works with fp16"""
177
- unet = self.dummy_cond_unet
178
- scheduler = PNDMScheduler(skip_prk_steps=True)
179
- vae = self.dummy_vae
180
- bert = self.dummy_text_encoder
181
- tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
182
- tokenizer.model_max_length = 77
183
-
184
- init_image = self.dummy_image.to(torch_device)
185
-
186
- # put models in fp16
187
- unet = unet.half()
188
- vae = vae.half()
189
- bert = bert.half()
190
-
191
- # make sure here that pndm scheduler skips prk
192
- alt_pipe = AltDiffusionImg2ImgPipeline(
193
- unet=unet,
194
- scheduler=scheduler,
195
- vae=vae,
196
- text_encoder=bert,
197
- tokenizer=tokenizer,
198
- safety_checker=None,
199
- feature_extractor=self.dummy_extractor,
200
- )
201
- alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
202
- alt_pipe = alt_pipe.to(torch_device)
203
- alt_pipe.set_progress_bar_config(disable=None)
204
-
205
- prompt = "A painting of a squirrel eating a burger"
206
- generator = torch.manual_seed(0)
207
- image = alt_pipe(
208
- [prompt],
209
- generator=generator,
210
- num_inference_steps=2,
211
- output_type="np",
212
- image=init_image,
213
- ).images
214
-
215
- assert image.shape == (1, 32, 32, 3)
216
-
217
- @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
218
- def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
219
- init_image = load_image(
220
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
221
- "/img2img/sketch-mountains-input.jpg"
222
- )
223
- # resize to resolution that is divisible by 8 but not 16 or 32
224
- init_image = init_image.resize((760, 504))
225
-
226
- model_id = "BAAI/AltDiffusion"
227
- pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
228
- model_id,
229
- safety_checker=None,
230
- )
231
- pipe.to(torch_device)
232
- pipe.set_progress_bar_config(disable=None)
233
- pipe.enable_attention_slicing()
234
-
235
- prompt = "A fantasy landscape, trending on artstation"
236
-
237
- generator = torch.manual_seed(0)
238
- output = pipe(
239
- prompt=prompt,
240
- image=init_image,
241
- strength=0.75,
242
- guidance_scale=7.5,
243
- generator=generator,
244
- output_type="np",
245
- )
246
- image = output.images[0]
247
-
248
- image_slice = image[255:258, 383:386, -1]
249
-
250
- assert image.shape == (504, 760, 3)
251
- expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000])
252
-
253
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
254
-
255
-
256
- @slow
257
- @require_torch_gpu
258
- class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
259
- def tearDown(self):
260
- # clean up the VRAM after each test
261
- super().tearDown()
262
- gc.collect()
263
- torch.cuda.empty_cache()
264
-
265
- def test_stable_diffusion_img2img_pipeline_default(self):
266
- init_image = load_image(
267
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
268
- "/img2img/sketch-mountains-input.jpg"
269
- )
270
- init_image = init_image.resize((768, 512))
271
- expected_image = load_numpy(
272
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
273
- )
274
-
275
- model_id = "BAAI/AltDiffusion"
276
- pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
277
- model_id,
278
- safety_checker=None,
279
- )
280
- pipe.to(torch_device)
281
- pipe.set_progress_bar_config(disable=None)
282
- pipe.enable_attention_slicing()
283
-
284
- prompt = "A fantasy landscape, trending on artstation"
285
-
286
- generator = torch.manual_seed(0)
287
- output = pipe(
288
- prompt=prompt,
289
- image=init_image,
290
- strength=0.75,
291
- guidance_scale=7.5,
292
- generator=generator,
293
- output_type="np",
294
- )
295
- image = output.images[0]
296
-
297
- assert image.shape == (512, 768, 3)
298
- # img2img is flaky across GPUs even in fp32, so using MAE here
299
- assert np.abs(expected_image - image).max() < 1e-3
diffusers/tests/pipelines/audio_diffusion/__init__.py DELETED
File without changes
diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py DELETED
@@ -1,191 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
-
22
- from diffusers import (
23
- AudioDiffusionPipeline,
24
- AutoencoderKL,
25
- DDIMScheduler,
26
- DDPMScheduler,
27
- DiffusionPipeline,
28
- Mel,
29
- UNet2DConditionModel,
30
- UNet2DModel,
31
- )
32
- from diffusers.utils import slow, torch_device
33
- from diffusers.utils.testing_utils import require_torch_gpu
34
-
35
-
36
- torch.backends.cuda.matmul.allow_tf32 = False
37
-
38
-
39
- class PipelineFastTests(unittest.TestCase):
40
- def tearDown(self):
41
- # clean up the VRAM after each test
42
- super().tearDown()
43
- gc.collect()
44
- torch.cuda.empty_cache()
45
-
46
- @property
47
- def dummy_unet(self):
48
- torch.manual_seed(0)
49
- model = UNet2DModel(
50
- sample_size=(32, 64),
51
- in_channels=1,
52
- out_channels=1,
53
- layers_per_block=2,
54
- block_out_channels=(128, 128),
55
- down_block_types=("AttnDownBlock2D", "DownBlock2D"),
56
- up_block_types=("UpBlock2D", "AttnUpBlock2D"),
57
- )
58
- return model
59
-
60
- @property
61
- def dummy_unet_condition(self):
62
- torch.manual_seed(0)
63
- model = UNet2DConditionModel(
64
- sample_size=(64, 32),
65
- in_channels=1,
66
- out_channels=1,
67
- layers_per_block=2,
68
- block_out_channels=(128, 128),
69
- down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
70
- up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
71
- cross_attention_dim=10,
72
- )
73
- return model
74
-
75
- @property
76
- def dummy_vqvae_and_unet(self):
77
- torch.manual_seed(0)
78
- vqvae = AutoencoderKL(
79
- sample_size=(128, 64),
80
- in_channels=1,
81
- out_channels=1,
82
- latent_channels=1,
83
- layers_per_block=2,
84
- block_out_channels=(128, 128),
85
- down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
86
- up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
87
- )
88
- unet = UNet2DModel(
89
- sample_size=(64, 32),
90
- in_channels=1,
91
- out_channels=1,
92
- layers_per_block=2,
93
- block_out_channels=(128, 128),
94
- down_block_types=("AttnDownBlock2D", "DownBlock2D"),
95
- up_block_types=("UpBlock2D", "AttnUpBlock2D"),
96
- )
97
- return vqvae, unet
98
-
99
- @slow
100
- def test_audio_diffusion(self):
101
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
102
- mel = Mel()
103
-
104
- scheduler = DDPMScheduler()
105
- pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler)
106
- pipe = pipe.to(device)
107
- pipe.set_progress_bar_config(disable=None)
108
-
109
- generator = torch.Generator(device=device).manual_seed(42)
110
- output = pipe(generator=generator, steps=4)
111
- audio = output.audios[0]
112
- image = output.images[0]
113
-
114
- generator = torch.Generator(device=device).manual_seed(42)
115
- output = pipe(generator=generator, steps=4, return_dict=False)
116
- image_from_tuple = output[0][0]
117
-
118
- assert audio.shape == (1, (self.dummy_unet.sample_size[1] - 1) * mel.hop_length)
119
- assert image.height == self.dummy_unet.sample_size[0] and image.width == self.dummy_unet.sample_size[1]
120
- image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
121
- image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10]
122
- expected_slice = np.array([69, 255, 255, 255, 0, 0, 77, 181, 12, 127])
123
-
124
- assert np.abs(image_slice.flatten() - expected_slice).max() == 0
125
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0
126
-
127
- scheduler = DDIMScheduler()
128
- dummy_vqvae_and_unet = self.dummy_vqvae_and_unet
129
- pipe = AudioDiffusionPipeline(
130
- vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler
131
- )
132
- pipe = pipe.to(device)
133
- pipe.set_progress_bar_config(disable=None)
134
-
135
- np.random.seed(0)
136
- raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].sample_size[1] - 1) * mel.hop_length,))
137
- generator = torch.Generator(device=device).manual_seed(42)
138
- output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10)
139
- image = output.images[0]
140
-
141
- assert (
142
- image.height == self.dummy_vqvae_and_unet[0].sample_size[0]
143
- and image.width == self.dummy_vqvae_and_unet[0].sample_size[1]
144
- )
145
- image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
146
- expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121])
147
-
148
- assert np.abs(image_slice.flatten() - expected_slice).max() == 0
149
-
150
- dummy_unet_condition = self.dummy_unet_condition
151
- pipe = AudioDiffusionPipeline(
152
- vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler
153
- )
154
-
155
- np.random.seed(0)
156
- encoding = torch.rand((1, 1, 10))
157
- output = pipe(generator=generator, encoding=encoding)
158
- image = output.images[0]
159
- image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
160
- expected_slice = np.array([120, 139, 147, 123, 124, 96, 115, 121, 126, 144])
161
-
162
- assert np.abs(image_slice.flatten() - expected_slice).max() == 0
163
-
164
-
165
- @slow
166
- @require_torch_gpu
167
- class PipelineIntegrationTests(unittest.TestCase):
168
- def tearDown(self):
169
- # clean up the VRAM after each test
170
- super().tearDown()
171
- gc.collect()
172
- torch.cuda.empty_cache()
173
-
174
- def test_audio_diffusion(self):
175
- device = torch_device
176
-
177
- pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
178
- pipe = pipe.to(device)
179
- pipe.set_progress_bar_config(disable=None)
180
-
181
- generator = torch.Generator(device=device).manual_seed(42)
182
- output = pipe(generator=generator)
183
- audio = output.audios[0]
184
- image = output.images[0]
185
-
186
- assert audio.shape == (1, (pipe.unet.sample_size[1] - 1) * pipe.mel.hop_length)
187
- assert image.height == pipe.unet.sample_size[0] and image.width == pipe.unet.sample_size[1]
188
- image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
189
- expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26])
190
-
191
- assert np.abs(image_slice.flatten() - expected_slice).max() == 0
diffusers/tests/pipelines/audioldm/__init__.py DELETED
File without changes
diffusers/tests/pipelines/audioldm/test_audioldm.py DELETED
@@ -1,416 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
-
17
- import gc
18
- import unittest
19
-
20
- import numpy as np
21
- import torch
22
- import torch.nn.functional as F
23
- from transformers import (
24
- ClapTextConfig,
25
- ClapTextModelWithProjection,
26
- RobertaTokenizer,
27
- SpeechT5HifiGan,
28
- SpeechT5HifiGanConfig,
29
- )
30
-
31
- from diffusers import (
32
- AudioLDMPipeline,
33
- AutoencoderKL,
34
- DDIMScheduler,
35
- LMSDiscreteScheduler,
36
- PNDMScheduler,
37
- UNet2DConditionModel,
38
- )
39
- from diffusers.utils import slow, torch_device
40
-
41
- from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
42
- from ...test_pipelines_common import PipelineTesterMixin
43
-
44
-
45
- class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
46
- pipeline_class = AudioLDMPipeline
47
- params = TEXT_TO_AUDIO_PARAMS
48
- batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
49
- required_optional_params = frozenset(
50
- [
51
- "num_inference_steps",
52
- "num_waveforms_per_prompt",
53
- "generator",
54
- "latents",
55
- "output_type",
56
- "return_dict",
57
- "callback",
58
- "callback_steps",
59
- ]
60
- )
61
-
62
- def get_dummy_components(self):
63
- torch.manual_seed(0)
64
- unet = UNet2DConditionModel(
65
- block_out_channels=(32, 64),
66
- layers_per_block=2,
67
- sample_size=32,
68
- in_channels=4,
69
- out_channels=4,
70
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
71
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
72
- cross_attention_dim=(32, 64),
73
- class_embed_type="simple_projection",
74
- projection_class_embeddings_input_dim=32,
75
- class_embeddings_concat=True,
76
- )
77
- scheduler = DDIMScheduler(
78
- beta_start=0.00085,
79
- beta_end=0.012,
80
- beta_schedule="scaled_linear",
81
- clip_sample=False,
82
- set_alpha_to_one=False,
83
- )
84
- torch.manual_seed(0)
85
- vae = AutoencoderKL(
86
- block_out_channels=[32, 64],
87
- in_channels=1,
88
- out_channels=1,
89
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
90
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
91
- latent_channels=4,
92
- )
93
- torch.manual_seed(0)
94
- text_encoder_config = ClapTextConfig(
95
- bos_token_id=0,
96
- eos_token_id=2,
97
- hidden_size=32,
98
- intermediate_size=37,
99
- layer_norm_eps=1e-05,
100
- num_attention_heads=4,
101
- num_hidden_layers=5,
102
- pad_token_id=1,
103
- vocab_size=1000,
104
- projection_dim=32,
105
- )
106
- text_encoder = ClapTextModelWithProjection(text_encoder_config)
107
- tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
108
-
109
- vocoder_config = SpeechT5HifiGanConfig(
110
- model_in_dim=8,
111
- sampling_rate=16000,
112
- upsample_initial_channel=16,
113
- upsample_rates=[2, 2],
114
- upsample_kernel_sizes=[4, 4],
115
- resblock_kernel_sizes=[3, 7],
116
- resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
117
- normalize_before=False,
118
- )
119
-
120
- vocoder = SpeechT5HifiGan(vocoder_config)
121
-
122
- components = {
123
- "unet": unet,
124
- "scheduler": scheduler,
125
- "vae": vae,
126
- "text_encoder": text_encoder,
127
- "tokenizer": tokenizer,
128
- "vocoder": vocoder,
129
- }
130
- return components
131
-
132
- def get_dummy_inputs(self, device, seed=0):
133
- if str(device).startswith("mps"):
134
- generator = torch.manual_seed(seed)
135
- else:
136
- generator = torch.Generator(device=device).manual_seed(seed)
137
- inputs = {
138
- "prompt": "A hammer hitting a wooden surface",
139
- "generator": generator,
140
- "num_inference_steps": 2,
141
- "guidance_scale": 6.0,
142
- }
143
- return inputs
144
-
145
- def test_audioldm_ddim(self):
146
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
147
-
148
- components = self.get_dummy_components()
149
- audioldm_pipe = AudioLDMPipeline(**components)
150
- audioldm_pipe = audioldm_pipe.to(torch_device)
151
- audioldm_pipe.set_progress_bar_config(disable=None)
152
-
153
- inputs = self.get_dummy_inputs(device)
154
- output = audioldm_pipe(**inputs)
155
- audio = output.audios[0]
156
-
157
- assert audio.ndim == 1
158
- assert len(audio) == 256
159
-
160
- audio_slice = audio[:10]
161
- expected_slice = np.array(
162
- [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]
163
- )
164
-
165
- assert np.abs(audio_slice - expected_slice).max() < 1e-2
166
-
167
- def test_audioldm_prompt_embeds(self):
168
- components = self.get_dummy_components()
169
- audioldm_pipe = AudioLDMPipeline(**components)
170
- audioldm_pipe = audioldm_pipe.to(torch_device)
171
- audioldm_pipe = audioldm_pipe.to(torch_device)
172
- audioldm_pipe.set_progress_bar_config(disable=None)
173
-
174
- inputs = self.get_dummy_inputs(torch_device)
175
- inputs["prompt"] = 3 * [inputs["prompt"]]
176
-
177
- # forward
178
- output = audioldm_pipe(**inputs)
179
- audio_1 = output.audios[0]
180
-
181
- inputs = self.get_dummy_inputs(torch_device)
182
- prompt = 3 * [inputs.pop("prompt")]
183
-
184
- text_inputs = audioldm_pipe.tokenizer(
185
- prompt,
186
- padding="max_length",
187
- max_length=audioldm_pipe.tokenizer.model_max_length,
188
- truncation=True,
189
- return_tensors="pt",
190
- )
191
- text_inputs = text_inputs["input_ids"].to(torch_device)
192
-
193
- prompt_embeds = audioldm_pipe.text_encoder(
194
- text_inputs,
195
- )
196
- prompt_embeds = prompt_embeds.text_embeds
197
- # additional L_2 normalization over each hidden-state
198
- prompt_embeds = F.normalize(prompt_embeds, dim=-1)
199
-
200
- inputs["prompt_embeds"] = prompt_embeds
201
-
202
- # forward
203
- output = audioldm_pipe(**inputs)
204
- audio_2 = output.audios[0]
205
-
206
- assert np.abs(audio_1 - audio_2).max() < 1e-2
207
-
208
- def test_audioldm_negative_prompt_embeds(self):
209
- components = self.get_dummy_components()
210
- audioldm_pipe = AudioLDMPipeline(**components)
211
- audioldm_pipe = audioldm_pipe.to(torch_device)
212
- audioldm_pipe = audioldm_pipe.to(torch_device)
213
- audioldm_pipe.set_progress_bar_config(disable=None)
214
-
215
- inputs = self.get_dummy_inputs(torch_device)
216
- negative_prompt = 3 * ["this is a negative prompt"]
217
- inputs["negative_prompt"] = negative_prompt
218
- inputs["prompt"] = 3 * [inputs["prompt"]]
219
-
220
- # forward
221
- output = audioldm_pipe(**inputs)
222
- audio_1 = output.audios[0]
223
-
224
- inputs = self.get_dummy_inputs(torch_device)
225
- prompt = 3 * [inputs.pop("prompt")]
226
-
227
- embeds = []
228
- for p in [prompt, negative_prompt]:
229
- text_inputs = audioldm_pipe.tokenizer(
230
- p,
231
- padding="max_length",
232
- max_length=audioldm_pipe.tokenizer.model_max_length,
233
- truncation=True,
234
- return_tensors="pt",
235
- )
236
- text_inputs = text_inputs["input_ids"].to(torch_device)
237
-
238
- text_embeds = audioldm_pipe.text_encoder(
239
- text_inputs,
240
- )
241
- text_embeds = text_embeds.text_embeds
242
- # additional L_2 normalization over each hidden-state
243
- text_embeds = F.normalize(text_embeds, dim=-1)
244
-
245
- embeds.append(text_embeds)
246
-
247
- inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
248
-
249
- # forward
250
- output = audioldm_pipe(**inputs)
251
- audio_2 = output.audios[0]
252
-
253
- assert np.abs(audio_1 - audio_2).max() < 1e-2
254
-
255
- def test_audioldm_negative_prompt(self):
256
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
257
- components = self.get_dummy_components()
258
- components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
259
- audioldm_pipe = AudioLDMPipeline(**components)
260
- audioldm_pipe = audioldm_pipe.to(device)
261
- audioldm_pipe.set_progress_bar_config(disable=None)
262
-
263
- inputs = self.get_dummy_inputs(device)
264
- negative_prompt = "egg cracking"
265
- output = audioldm_pipe(**inputs, negative_prompt=negative_prompt)
266
- audio = output.audios[0]
267
-
268
- assert audio.ndim == 1
269
- assert len(audio) == 256
270
-
271
- audio_slice = audio[:10]
272
- expected_slice = np.array(
273
- [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]
274
- )
275
-
276
- assert np.abs(audio_slice - expected_slice).max() < 1e-2
277
-
278
- def test_audioldm_num_waveforms_per_prompt(self):
279
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
280
- components = self.get_dummy_components()
281
- components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
282
- audioldm_pipe = AudioLDMPipeline(**components)
283
- audioldm_pipe = audioldm_pipe.to(device)
284
- audioldm_pipe.set_progress_bar_config(disable=None)
285
-
286
- prompt = "A hammer hitting a wooden surface"
287
-
288
- # test num_waveforms_per_prompt=1 (default)
289
- audios = audioldm_pipe(prompt, num_inference_steps=2).audios
290
-
291
- assert audios.shape == (1, 256)
292
-
293
- # test num_waveforms_per_prompt=1 (default) for batch of prompts
294
- batch_size = 2
295
- audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
296
-
297
- assert audios.shape == (batch_size, 256)
298
-
299
- # test num_waveforms_per_prompt for single prompt
300
- num_waveforms_per_prompt = 2
301
- audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
302
-
303
- assert audios.shape == (num_waveforms_per_prompt, 256)
304
-
305
- # test num_waveforms_per_prompt for batch of prompts
306
- batch_size = 2
307
- audios = audioldm_pipe(
308
- [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
309
- ).audios
310
-
311
- assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
312
-
313
- def test_audioldm_audio_length_in_s(self):
314
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
315
- components = self.get_dummy_components()
316
- audioldm_pipe = AudioLDMPipeline(**components)
317
- audioldm_pipe = audioldm_pipe.to(torch_device)
318
- audioldm_pipe.set_progress_bar_config(disable=None)
319
- vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate
320
-
321
- inputs = self.get_dummy_inputs(device)
322
- output = audioldm_pipe(audio_length_in_s=0.016, **inputs)
323
- audio = output.audios[0]
324
-
325
- assert audio.ndim == 1
326
- assert len(audio) / vocoder_sampling_rate == 0.016
327
-
328
- output = audioldm_pipe(audio_length_in_s=0.032, **inputs)
329
- audio = output.audios[0]
330
-
331
- assert audio.ndim == 1
332
- assert len(audio) / vocoder_sampling_rate == 0.032
333
-
334
- def test_audioldm_vocoder_model_in_dim(self):
335
- components = self.get_dummy_components()
336
- audioldm_pipe = AudioLDMPipeline(**components)
337
- audioldm_pipe = audioldm_pipe.to(torch_device)
338
- audioldm_pipe.set_progress_bar_config(disable=None)
339
-
340
- prompt = ["hey"]
341
-
342
- output = audioldm_pipe(prompt, num_inference_steps=1)
343
- audio_shape = output.audios.shape
344
- assert audio_shape == (1, 256)
345
-
346
- config = audioldm_pipe.vocoder.config
347
- config.model_in_dim *= 2
348
- audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
349
- output = audioldm_pipe(prompt, num_inference_steps=1)
350
- audio_shape = output.audios.shape
351
- # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
352
- assert audio_shape == (1, 256)
353
-
354
- def test_attention_slicing_forward_pass(self):
355
- self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
356
-
357
- def test_inference_batch_single_identical(self):
358
- self._test_inference_batch_single_identical(test_mean_pixel_difference=False)
359
-
360
-
361
- @slow
362
- # @require_torch_gpu
363
- class AudioLDMPipelineSlowTests(unittest.TestCase):
364
- def tearDown(self):
365
- super().tearDown()
366
- gc.collect()
367
- torch.cuda.empty_cache()
368
-
369
- def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
370
- generator = torch.Generator(device=generator_device).manual_seed(seed)
371
- latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
372
- latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
373
- inputs = {
374
- "prompt": "A hammer hitting a wooden surface",
375
- "latents": latents,
376
- "generator": generator,
377
- "num_inference_steps": 3,
378
- "guidance_scale": 2.5,
379
- }
380
- return inputs
381
-
382
- def test_audioldm(self):
383
- audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
384
- audioldm_pipe = audioldm_pipe.to(torch_device)
385
- audioldm_pipe.set_progress_bar_config(disable=None)
386
-
387
- inputs = self.get_inputs(torch_device)
388
- inputs["num_inference_steps"] = 25
389
- audio = audioldm_pipe(**inputs).audios[0]
390
-
391
- assert audio.ndim == 1
392
- assert len(audio) == 81920
393
-
394
- audio_slice = audio[77230:77240]
395
- expected_slice = np.array(
396
- [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]
397
- )
398
- max_diff = np.abs(expected_slice - audio_slice).max()
399
- assert max_diff < 1e-2
400
-
401
- def test_audioldm_lms(self):
402
- audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
403
- audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
404
- audioldm_pipe = audioldm_pipe.to(torch_device)
405
- audioldm_pipe.set_progress_bar_config(disable=None)
406
-
407
- inputs = self.get_inputs(torch_device)
408
- audio = audioldm_pipe(**inputs).audios[0]
409
-
410
- assert audio.ndim == 1
411
- assert len(audio) == 81920
412
-
413
- audio_slice = audio[27780:27790]
414
- expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212])
415
- max_diff = np.abs(expected_slice - audio_slice).max()
416
- assert max_diff < 1e-2
diffusers/tests/pipelines/dance_diffusion/__init__.py DELETED
File without changes
diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py DELETED
@@ -1,160 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
-
22
- from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
23
- from diffusers.utils import slow, torch_device
24
- from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
25
-
26
- from ...pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
27
- from ...test_pipelines_common import PipelineTesterMixin
28
-
29
-
30
- torch.backends.cuda.matmul.allow_tf32 = False
31
-
32
-
33
- class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
34
- pipeline_class = DanceDiffusionPipeline
35
- params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS
36
- required_optional_params = PipelineTesterMixin.required_optional_params - {
37
- "callback",
38
- "latents",
39
- "callback_steps",
40
- "output_type",
41
- "num_images_per_prompt",
42
- }
43
- batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS
44
- test_attention_slicing = False
45
- test_cpu_offload = False
46
-
47
- def get_dummy_components(self):
48
- torch.manual_seed(0)
49
- unet = UNet1DModel(
50
- block_out_channels=(32, 32, 64),
51
- extra_in_channels=16,
52
- sample_size=512,
53
- sample_rate=16_000,
54
- in_channels=2,
55
- out_channels=2,
56
- flip_sin_to_cos=True,
57
- use_timestep_embedding=False,
58
- time_embedding_type="fourier",
59
- mid_block_type="UNetMidBlock1D",
60
- down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
61
- up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
62
- )
63
- scheduler = IPNDMScheduler()
64
-
65
- components = {
66
- "unet": unet,
67
- "scheduler": scheduler,
68
- }
69
- return components
70
-
71
- def get_dummy_inputs(self, device, seed=0):
72
- if str(device).startswith("mps"):
73
- generator = torch.manual_seed(seed)
74
- else:
75
- generator = torch.Generator(device=device).manual_seed(seed)
76
- inputs = {
77
- "batch_size": 1,
78
- "generator": generator,
79
- "num_inference_steps": 4,
80
- }
81
- return inputs
82
-
83
- def test_dance_diffusion(self):
84
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
85
- components = self.get_dummy_components()
86
- pipe = DanceDiffusionPipeline(**components)
87
- pipe = pipe.to(device)
88
- pipe.set_progress_bar_config(disable=None)
89
-
90
- inputs = self.get_dummy_inputs(device)
91
- output = pipe(**inputs)
92
- audio = output.audios
93
-
94
- audio_slice = audio[0, -3:, -3:]
95
-
96
- assert audio.shape == (1, 2, components["unet"].sample_size)
97
- expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
98
- assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
99
-
100
- @skip_mps
101
- def test_save_load_local(self):
102
- return super().test_save_load_local()
103
-
104
- @skip_mps
105
- def test_dict_tuple_outputs_equivalent(self):
106
- return super().test_dict_tuple_outputs_equivalent()
107
-
108
- @skip_mps
109
- def test_save_load_optional_components(self):
110
- return super().test_save_load_optional_components()
111
-
112
- @skip_mps
113
- def test_attention_slicing_forward_pass(self):
114
- return super().test_attention_slicing_forward_pass()
115
-
116
-
117
- @slow
118
- @require_torch_gpu
119
- class PipelineIntegrationTests(unittest.TestCase):
120
- def tearDown(self):
121
- # clean up the VRAM after each test
122
- super().tearDown()
123
- gc.collect()
124
- torch.cuda.empty_cache()
125
-
126
- def test_dance_diffusion(self):
127
- device = torch_device
128
-
129
- pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
130
- pipe = pipe.to(device)
131
- pipe.set_progress_bar_config(disable=None)
132
-
133
- generator = torch.manual_seed(0)
134
- output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
135
- audio = output.audios
136
-
137
- audio_slice = audio[0, -3:, -3:]
138
-
139
- assert audio.shape == (1, 2, pipe.unet.sample_size)
140
- expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020])
141
-
142
- assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
143
-
144
- def test_dance_diffusion_fp16(self):
145
- device = torch_device
146
-
147
- pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
148
- pipe = pipe.to(device)
149
- pipe.set_progress_bar_config(disable=None)
150
-
151
- generator = torch.manual_seed(0)
152
- output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
153
- audio = output.audios
154
-
155
- audio_slice = audio[0, -3:, -3:]
156
-
157
- assert audio.shape == (1, 2, pipe.unet.sample_size)
158
- expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341])
159
-
160
- assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
diffusers/tests/pipelines/ddim/__init__.py DELETED
File without changes
diffusers/tests/pipelines/ddim/test_ddim.py DELETED
@@ -1,132 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
-
21
- from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
22
- from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
23
-
24
- from ...pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS
25
- from ...test_pipelines_common import PipelineTesterMixin
26
-
27
-
28
- torch.backends.cuda.matmul.allow_tf32 = False
29
-
30
-
31
- class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
32
- pipeline_class = DDIMPipeline
33
- params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS
34
- required_optional_params = PipelineTesterMixin.required_optional_params - {
35
- "num_images_per_prompt",
36
- "latents",
37
- "callback",
38
- "callback_steps",
39
- }
40
- batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS
41
- test_cpu_offload = False
42
-
43
- def get_dummy_components(self):
44
- torch.manual_seed(0)
45
- unet = UNet2DModel(
46
- block_out_channels=(32, 64),
47
- layers_per_block=2,
48
- sample_size=32,
49
- in_channels=3,
50
- out_channels=3,
51
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
52
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
53
- )
54
- scheduler = DDIMScheduler()
55
- components = {"unet": unet, "scheduler": scheduler}
56
- return components
57
-
58
- def get_dummy_inputs(self, device, seed=0):
59
- if str(device).startswith("mps"):
60
- generator = torch.manual_seed(seed)
61
- else:
62
- generator = torch.Generator(device=device).manual_seed(seed)
63
- inputs = {
64
- "batch_size": 1,
65
- "generator": generator,
66
- "num_inference_steps": 2,
67
- "output_type": "numpy",
68
- }
69
- return inputs
70
-
71
- def test_inference(self):
72
- device = "cpu"
73
-
74
- components = self.get_dummy_components()
75
- pipe = self.pipeline_class(**components)
76
- pipe.to(device)
77
- pipe.set_progress_bar_config(disable=None)
78
-
79
- inputs = self.get_dummy_inputs(device)
80
- image = pipe(**inputs).images
81
- image_slice = image[0, -3:, -3:, -1]
82
-
83
- self.assertEqual(image.shape, (1, 32, 32, 3))
84
- expected_slice = np.array(
85
- [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04]
86
- )
87
- max_diff = np.abs(image_slice.flatten() - expected_slice).max()
88
- self.assertLessEqual(max_diff, 1e-3)
89
-
90
-
91
- @slow
92
- @require_torch_gpu
93
- class DDIMPipelineIntegrationTests(unittest.TestCase):
94
- def test_inference_cifar10(self):
95
- model_id = "google/ddpm-cifar10-32"
96
-
97
- unet = UNet2DModel.from_pretrained(model_id)
98
- scheduler = DDIMScheduler()
99
-
100
- ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
101
- ddim.to(torch_device)
102
- ddim.set_progress_bar_config(disable=None)
103
-
104
- generator = torch.manual_seed(0)
105
- image = ddim(generator=generator, eta=0.0, output_type="numpy").images
106
-
107
- image_slice = image[0, -3:, -3:, -1]
108
-
109
- assert image.shape == (1, 32, 32, 3)
110
- expected_slice = np.array([0.1723, 0.1617, 0.1600, 0.1626, 0.1497, 0.1513, 0.1505, 0.1442, 0.1453])
111
-
112
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
113
-
114
- def test_inference_ema_bedroom(self):
115
- model_id = "google/ddpm-ema-bedroom-256"
116
-
117
- unet = UNet2DModel.from_pretrained(model_id)
118
- scheduler = DDIMScheduler.from_pretrained(model_id)
119
-
120
- ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
121
- ddpm.to(torch_device)
122
- ddpm.set_progress_bar_config(disable=None)
123
-
124
- generator = torch.manual_seed(0)
125
- image = ddpm(generator=generator, output_type="numpy").images
126
-
127
- image_slice = image[0, -3:, -3:, -1]
128
-
129
- assert image.shape == (1, 256, 256, 3)
130
- expected_slice = np.array([0.0060, 0.0201, 0.0344, 0.0024, 0.0018, 0.0002, 0.0022, 0.0000, 0.0069])
131
-
132
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diffusers/tests/pipelines/ddpm/__init__.py DELETED
File without changes
diffusers/tests/pipelines/ddpm/test_ddpm.py DELETED
@@ -1,111 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
-
21
- from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
22
- from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
23
-
24
-
25
- torch.backends.cuda.matmul.allow_tf32 = False
26
-
27
-
28
- class DDPMPipelineFastTests(unittest.TestCase):
29
- @property
30
- def dummy_uncond_unet(self):
31
- torch.manual_seed(0)
32
- model = UNet2DModel(
33
- block_out_channels=(32, 64),
34
- layers_per_block=2,
35
- sample_size=32,
36
- in_channels=3,
37
- out_channels=3,
38
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
39
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
40
- )
41
- return model
42
-
43
- def test_fast_inference(self):
44
- device = "cpu"
45
- unet = self.dummy_uncond_unet
46
- scheduler = DDPMScheduler()
47
-
48
- ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
49
- ddpm.to(device)
50
- ddpm.set_progress_bar_config(disable=None)
51
-
52
- generator = torch.Generator(device=device).manual_seed(0)
53
- image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
54
-
55
- generator = torch.Generator(device=device).manual_seed(0)
56
- image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
57
-
58
- image_slice = image[0, -3:, -3:, -1]
59
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
60
-
61
- assert image.shape == (1, 32, 32, 3)
62
- expected_slice = np.array(
63
- [9.956e-01, 5.785e-01, 4.675e-01, 9.930e-01, 0.0, 1.000, 1.199e-03, 2.648e-04, 5.101e-04]
64
- )
65
-
66
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
67
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
68
-
69
- def test_inference_predict_sample(self):
70
- unet = self.dummy_uncond_unet
71
- scheduler = DDPMScheduler(prediction_type="sample")
72
-
73
- ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
74
- ddpm.to(torch_device)
75
- ddpm.set_progress_bar_config(disable=None)
76
-
77
- generator = torch.manual_seed(0)
78
- image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
79
-
80
- generator = torch.manual_seed(0)
81
- image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0]
82
-
83
- image_slice = image[0, -3:, -3:, -1]
84
- image_eps_slice = image_eps[0, -3:, -3:, -1]
85
-
86
- assert image.shape == (1, 32, 32, 3)
87
- tolerance = 1e-2 if torch_device != "mps" else 3e-2
88
- assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
89
-
90
-
91
- @slow
92
- @require_torch_gpu
93
- class DDPMPipelineIntegrationTests(unittest.TestCase):
94
- def test_inference_cifar10(self):
95
- model_id = "google/ddpm-cifar10-32"
96
-
97
- unet = UNet2DModel.from_pretrained(model_id)
98
- scheduler = DDPMScheduler.from_pretrained(model_id)
99
-
100
- ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
101
- ddpm.to(torch_device)
102
- ddpm.set_progress_bar_config(disable=None)
103
-
104
- generator = torch.manual_seed(0)
105
- image = ddpm(generator=generator, output_type="numpy").images
106
-
107
- image_slice = image[0, -3:, -3:, -1]
108
-
109
- assert image.shape == (1, 32, 32, 3)
110
- expected_slice = np.array([0.4200, 0.3588, 0.1939, 0.3847, 0.3382, 0.2647, 0.4155, 0.3582, 0.3385])
111
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diffusers/tests/pipelines/dit/__init__.py DELETED
File without changes
diffusers/tests/pipelines/dit/test_dit.py DELETED
@@ -1,152 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
-
22
- from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel
23
- from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device
24
- from diffusers.utils.testing_utils import require_torch_gpu
25
-
26
- from ...pipeline_params import (
27
- CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS,
28
- CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS,
29
- )
30
- from ...test_pipelines_common import PipelineTesterMixin
31
-
32
-
33
- torch.backends.cuda.matmul.allow_tf32 = False
34
-
35
-
36
- class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
37
- pipeline_class = DiTPipeline
38
- params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS
39
- required_optional_params = PipelineTesterMixin.required_optional_params - {
40
- "latents",
41
- "num_images_per_prompt",
42
- "callback",
43
- "callback_steps",
44
- }
45
- batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS
46
- test_cpu_offload = False
47
-
48
- def get_dummy_components(self):
49
- torch.manual_seed(0)
50
- transformer = Transformer2DModel(
51
- sample_size=16,
52
- num_layers=2,
53
- patch_size=4,
54
- attention_head_dim=8,
55
- num_attention_heads=2,
56
- in_channels=4,
57
- out_channels=8,
58
- attention_bias=True,
59
- activation_fn="gelu-approximate",
60
- num_embeds_ada_norm=1000,
61
- norm_type="ada_norm_zero",
62
- norm_elementwise_affine=False,
63
- )
64
- vae = AutoencoderKL()
65
- scheduler = DDIMScheduler()
66
- components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler}
67
- return components
68
-
69
- def get_dummy_inputs(self, device, seed=0):
70
- if str(device).startswith("mps"):
71
- generator = torch.manual_seed(seed)
72
- else:
73
- generator = torch.Generator(device=device).manual_seed(seed)
74
- inputs = {
75
- "class_labels": [1],
76
- "generator": generator,
77
- "num_inference_steps": 2,
78
- "output_type": "numpy",
79
- }
80
- return inputs
81
-
82
- def test_inference(self):
83
- device = "cpu"
84
-
85
- components = self.get_dummy_components()
86
- pipe = self.pipeline_class(**components)
87
- pipe.to(device)
88
- pipe.set_progress_bar_config(disable=None)
89
-
90
- inputs = self.get_dummy_inputs(device)
91
- image = pipe(**inputs).images
92
- image_slice = image[0, -3:, -3:, -1]
93
-
94
- self.assertEqual(image.shape, (1, 16, 16, 3))
95
- expected_slice = np.array([0.4380, 0.4141, 0.5159, 0.0000, 0.4282, 0.6680, 0.5485, 0.2545, 0.6719])
96
- max_diff = np.abs(image_slice.flatten() - expected_slice).max()
97
- self.assertLessEqual(max_diff, 1e-3)
98
-
99
- def test_inference_batch_single_identical(self):
100
- self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3)
101
-
102
- @unittest.skipIf(
103
- torch_device != "cuda" or not is_xformers_available(),
104
- reason="XFormers attention is only available with CUDA and `xformers` installed",
105
- )
106
- def test_xformers_attention_forwardGenerator_pass(self):
107
- self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
108
-
109
-
110
- @require_torch_gpu
111
- @slow
112
- class DiTPipelineIntegrationTests(unittest.TestCase):
113
- def tearDown(self):
114
- super().tearDown()
115
- gc.collect()
116
- torch.cuda.empty_cache()
117
-
118
- def test_dit_256(self):
119
- generator = torch.manual_seed(0)
120
-
121
- pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256")
122
- pipe.to("cuda")
123
-
124
- words = ["vase", "umbrella", "white shark", "white wolf"]
125
- ids = pipe.get_label_ids(words)
126
-
127
- images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images
128
-
129
- for word, image in zip(words, images):
130
- expected_image = load_numpy(
131
- f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy"
132
- )
133
- assert np.abs((expected_image - image).max()) < 1e-2
134
-
135
- def test_dit_512(self):
136
- pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512")
137
- pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
138
- pipe.to("cuda")
139
-
140
- words = ["vase", "umbrella"]
141
- ids = pipe.get_label_ids(words)
142
-
143
- generator = torch.manual_seed(0)
144
- images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images
145
-
146
- for word, image in zip(words, images):
147
- expected_image = load_numpy(
148
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
149
- f"/dit/{word}_512.npy"
150
- )
151
-
152
- assert np.abs((expected_image - image).max()) < 1e-1
 
diffusers/tests/pipelines/karras_ve/__init__.py DELETED
File without changes
diffusers/tests/pipelines/karras_ve/test_karras_ve.py DELETED
@@ -1,86 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
-
21
- from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel
22
- from diffusers.utils.testing_utils import require_torch, slow, torch_device
23
-
24
-
25
- torch.backends.cuda.matmul.allow_tf32 = False
26
-
27
-
28
- class KarrasVePipelineFastTests(unittest.TestCase):
29
- @property
30
- def dummy_uncond_unet(self):
31
- torch.manual_seed(0)
32
- model = UNet2DModel(
33
- block_out_channels=(32, 64),
34
- layers_per_block=2,
35
- sample_size=32,
36
- in_channels=3,
37
- out_channels=3,
38
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
39
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
40
- )
41
- return model
42
-
43
- def test_inference(self):
44
- unet = self.dummy_uncond_unet
45
- scheduler = KarrasVeScheduler()
46
-
47
- pipe = KarrasVePipeline(unet=unet, scheduler=scheduler)
48
- pipe.to(torch_device)
49
- pipe.set_progress_bar_config(disable=None)
50
-
51
- generator = torch.manual_seed(0)
52
- image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images
53
-
54
- generator = torch.manual_seed(0)
55
- image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0]
56
-
57
- image_slice = image[0, -3:, -3:, -1]
58
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
59
-
60
- assert image.shape == (1, 32, 32, 3)
61
- expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
62
-
63
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
64
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
65
-
66
-
67
- @slow
68
- @require_torch
69
- class KarrasVePipelineIntegrationTests(unittest.TestCase):
70
- def test_inference(self):
71
- model_id = "google/ncsnpp-celebahq-256"
72
- model = UNet2DModel.from_pretrained(model_id)
73
- scheduler = KarrasVeScheduler()
74
-
75
- pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
76
- pipe.to(torch_device)
77
- pipe.set_progress_bar_config(disable=None)
78
-
79
- generator = torch.manual_seed(0)
80
- image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images
81
-
82
- image_slice = image[0, -3:, -3:, -1]
83
- assert image.shape == (1, 256, 256, 3)
84
- expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
85
-
86
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
diffusers/tests/pipelines/latent_diffusion/__init__.py DELETED
File without changes
diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py DELETED
@@ -1,202 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
- from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
22
-
23
- from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
24
- from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device
25
-
26
- from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
27
- from ...test_pipelines_common import PipelineTesterMixin
28
-
29
-
30
- torch.backends.cuda.matmul.allow_tf32 = False
31
-
32
-
33
- class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
34
- pipeline_class = LDMTextToImagePipeline
35
- params = TEXT_TO_IMAGE_PARAMS - {
36
- "negative_prompt",
37
- "negative_prompt_embeds",
38
- "cross_attention_kwargs",
39
- "prompt_embeds",
40
- }
41
- required_optional_params = PipelineTesterMixin.required_optional_params - {
42
- "num_images_per_prompt",
43
- "callback",
44
- "callback_steps",
45
- }
46
- batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
47
- test_cpu_offload = False
48
-
49
- def get_dummy_components(self):
50
- torch.manual_seed(0)
51
- unet = UNet2DConditionModel(
52
- block_out_channels=(32, 64),
53
- layers_per_block=2,
54
- sample_size=32,
55
- in_channels=4,
56
- out_channels=4,
57
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
58
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
59
- cross_attention_dim=32,
60
- )
61
- scheduler = DDIMScheduler(
62
- beta_start=0.00085,
63
- beta_end=0.012,
64
- beta_schedule="scaled_linear",
65
- clip_sample=False,
66
- set_alpha_to_one=False,
67
- )
68
- torch.manual_seed(0)
69
- vae = AutoencoderKL(
70
- block_out_channels=(32, 64),
71
- in_channels=3,
72
- out_channels=3,
73
- down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
74
- up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
75
- latent_channels=4,
76
- )
77
- torch.manual_seed(0)
78
- text_encoder_config = CLIPTextConfig(
79
- bos_token_id=0,
80
- eos_token_id=2,
81
- hidden_size=32,
82
- intermediate_size=37,
83
- layer_norm_eps=1e-05,
84
- num_attention_heads=4,
85
- num_hidden_layers=5,
86
- pad_token_id=1,
87
- vocab_size=1000,
88
- )
89
- text_encoder = CLIPTextModel(text_encoder_config)
90
- tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
91
-
92
- components = {
93
- "unet": unet,
94
- "scheduler": scheduler,
95
- "vqvae": vae,
96
- "bert": text_encoder,
97
- "tokenizer": tokenizer,
98
- }
99
- return components
100
-
101
- def get_dummy_inputs(self, device, seed=0):
102
- if str(device).startswith("mps"):
103
- generator = torch.manual_seed(seed)
104
- else:
105
- generator = torch.Generator(device=device).manual_seed(seed)
106
- inputs = {
107
- "prompt": "A painting of a squirrel eating a burger",
108
- "generator": generator,
109
- "num_inference_steps": 2,
110
- "guidance_scale": 6.0,
111
- "output_type": "numpy",
112
- }
113
- return inputs
114
-
115
- def test_inference_text2img(self):
116
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
117
-
118
- components = self.get_dummy_components()
119
- pipe = LDMTextToImagePipeline(**components)
120
- pipe.to(device)
121
- pipe.set_progress_bar_config(disable=None)
122
-
123
- inputs = self.get_dummy_inputs(device)
124
- image = pipe(**inputs).images
125
- image_slice = image[0, -3:, -3:, -1]
126
-
127
- assert image.shape == (1, 16, 16, 3)
128
- expected_slice = np.array([0.59450, 0.64078, 0.55509, 0.51229, 0.69640, 0.36960, 0.59296, 0.60801, 0.49332])
129
-
130
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
131
-
132
-
133
- @slow
134
- @require_torch_gpu
135
- class LDMTextToImagePipelineSlowTests(unittest.TestCase):
136
- def tearDown(self):
137
- super().tearDown()
138
- gc.collect()
139
- torch.cuda.empty_cache()
140
-
141
- def get_inputs(self, device, dtype=torch.float32, seed=0):
142
- generator = torch.manual_seed(seed)
143
- latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
144
- latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
145
- inputs = {
146
- "prompt": "A painting of a squirrel eating a burger",
147
- "latents": latents,
148
- "generator": generator,
149
- "num_inference_steps": 3,
150
- "guidance_scale": 6.0,
151
- "output_type": "numpy",
152
- }
153
- return inputs
154
-
155
- def test_ldm_default_ddim(self):
156
- pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
157
- pipe.set_progress_bar_config(disable=None)
158
-
159
- inputs = self.get_inputs(torch_device)
160
- image = pipe(**inputs).images
161
- image_slice = image[0, -3:, -3:, -1].flatten()
162
-
163
- assert image.shape == (1, 256, 256, 3)
164
- expected_slice = np.array([0.51825, 0.52850, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878])
165
- max_diff = np.abs(expected_slice - image_slice).max()
166
- assert max_diff < 1e-3
167
-
168
-
169
- @nightly
170
- @require_torch_gpu
171
- class LDMTextToImagePipelineNightlyTests(unittest.TestCase):
172
- def tearDown(self):
173
- super().tearDown()
174
- gc.collect()
175
- torch.cuda.empty_cache()
176
-
177
- def get_inputs(self, device, dtype=torch.float32, seed=0):
178
- generator = torch.manual_seed(seed)
179
- latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
180
- latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
181
- inputs = {
182
- "prompt": "A painting of a squirrel eating a burger",
183
- "latents": latents,
184
- "generator": generator,
185
- "num_inference_steps": 50,
186
- "guidance_scale": 6.0,
187
- "output_type": "numpy",
188
- }
189
- return inputs
190
-
191
- def test_ldm_default_ddim(self):
192
- pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
193
- pipe.set_progress_bar_config(disable=None)
194
-
195
- inputs = self.get_inputs(torch_device)
196
- image = pipe(**inputs).images[0]
197
-
198
- expected_image = load_numpy(
199
- "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy"
200
- )
201
- max_diff = np.abs(expected_image - image).max()
202
- assert max_diff < 1e-3
 
diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py DELETED
@@ -1,131 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import random
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
-
22
- from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
23
- from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device
24
- from diffusers.utils.testing_utils import require_torch
25
-
26
-
27
- torch.backends.cuda.matmul.allow_tf32 = False
28
-
29
-
30
- class LDMSuperResolutionPipelineFastTests(unittest.TestCase):
31
- @property
32
- def dummy_image(self):
33
- batch_size = 1
34
- num_channels = 3
35
- sizes = (32, 32)
36
-
37
- image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
38
- return image
39
-
40
- @property
41
- def dummy_uncond_unet(self):
42
- torch.manual_seed(0)
43
- model = UNet2DModel(
44
- block_out_channels=(32, 64),
45
- layers_per_block=2,
46
- sample_size=32,
47
- in_channels=6,
48
- out_channels=3,
49
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
50
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
51
- )
52
- return model
53
-
54
- @property
55
- def dummy_vq_model(self):
56
- torch.manual_seed(0)
57
- model = VQModel(
58
- block_out_channels=[32, 64],
59
- in_channels=3,
60
- out_channels=3,
61
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
62
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
63
- latent_channels=3,
64
- )
65
- return model
66
-
67
- def test_inference_superresolution(self):
68
- device = "cpu"
69
- unet = self.dummy_uncond_unet
70
- scheduler = DDIMScheduler()
71
- vqvae = self.dummy_vq_model
72
-
73
- ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
74
- ldm.to(device)
75
- ldm.set_progress_bar_config(disable=None)
76
-
77
- init_image = self.dummy_image.to(device)
78
-
79
- generator = torch.Generator(device=device).manual_seed(0)
80
- image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
81
-
82
- image_slice = image[0, -3:, -3:, -1]
83
-
84
- assert image.shape == (1, 64, 64, 3)
85
- expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150])
86
-
87
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
88
-
89
- @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
90
- def test_inference_superresolution_fp16(self):
91
- unet = self.dummy_uncond_unet
92
- scheduler = DDIMScheduler()
93
- vqvae = self.dummy_vq_model
94
-
95
- # put models in fp16
96
- unet = unet.half()
97
- vqvae = vqvae.half()
98
-
99
- ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
100
- ldm.to(torch_device)
101
- ldm.set_progress_bar_config(disable=None)
102
-
103
- init_image = self.dummy_image.to(torch_device)
104
-
105
- image = ldm(init_image, num_inference_steps=2, output_type="numpy").images
106
-
107
- assert image.shape == (1, 64, 64, 3)
108
-
109
-
110
- @slow
111
- @require_torch
112
- class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase):
113
- def test_inference_superresolution(self):
114
- init_image = load_image(
115
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
116
- "/vq_diffusion/teddy_bear_pool.png"
117
- )
118
- init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])
119
-
120
- ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto")
121
- ldm.set_progress_bar_config(disable=None)
122
-
123
- generator = torch.manual_seed(0)
124
- image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
125
-
126
- image_slice = image[0, -3:, -3:, -1]
127
-
128
- assert image.shape == (1, 256, 256, 3)
129
- expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.7560, 0.7425, 0.7257, 0.6907])
130
-
131
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py DELETED
@@ -1,116 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
- from transformers import CLIPTextConfig, CLIPTextModel
21
-
22
- from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel
23
- from diffusers.utils.testing_utils import require_torch, slow, torch_device
24
-
25
-
26
- torch.backends.cuda.matmul.allow_tf32 = False
27
-
28
-
29
- class LDMPipelineFastTests(unittest.TestCase):
30
- @property
31
- def dummy_uncond_unet(self):
32
- torch.manual_seed(0)
33
- model = UNet2DModel(
34
- block_out_channels=(32, 64),
35
- layers_per_block=2,
36
- sample_size=32,
37
- in_channels=3,
38
- out_channels=3,
39
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
40
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
41
- )
42
- return model
43
-
44
- @property
45
- def dummy_vq_model(self):
46
- torch.manual_seed(0)
47
- model = VQModel(
48
- block_out_channels=[32, 64],
49
- in_channels=3,
50
- out_channels=3,
51
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
52
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
53
- latent_channels=3,
54
- )
55
- return model
56
-
57
- @property
58
- def dummy_text_encoder(self):
59
- torch.manual_seed(0)
60
- config = CLIPTextConfig(
61
- bos_token_id=0,
62
- eos_token_id=2,
63
- hidden_size=32,
64
- intermediate_size=37,
65
- layer_norm_eps=1e-05,
66
- num_attention_heads=4,
67
- num_hidden_layers=5,
68
- pad_token_id=1,
69
- vocab_size=1000,
70
- )
71
- return CLIPTextModel(config)
72
-
73
- def test_inference_uncond(self):
74
- unet = self.dummy_uncond_unet
75
- scheduler = DDIMScheduler()
76
- vae = self.dummy_vq_model
77
-
78
- ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler)
79
- ldm.to(torch_device)
80
- ldm.set_progress_bar_config(disable=None)
81
-
82
- generator = torch.manual_seed(0)
83
- image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images
84
-
85
- generator = torch.manual_seed(0)
86
- image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
87
-
88
- image_slice = image[0, -3:, -3:, -1]
89
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
90
-
91
- assert image.shape == (1, 64, 64, 3)
92
- expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172])
93
- tolerance = 1e-2 if torch_device != "mps" else 3e-2
94
-
95
- assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
96
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
97
-
98
-
99
- @slow
100
- @require_torch
101
- class LDMPipelineIntegrationTests(unittest.TestCase):
102
- def test_inference_uncond(self):
103
- ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
104
- ldm.to(torch_device)
105
- ldm.set_progress_bar_config(disable=None)
106
-
107
- generator = torch.manual_seed(0)
108
- image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
109
-
110
- image_slice = image[0, -3:, -3:, -1]
111
-
112
- assert image.shape == (1, 256, 256, 3)
113
- expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
114
- tolerance = 1e-2 if torch_device != "mps" else 3e-2
115
-
116
- assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
 
diffusers/tests/pipelines/paint_by_example/__init__.py DELETED
File without changes
diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py DELETED
@@ -1,210 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import random
18
- import unittest
19
-
20
- import numpy as np
21
- import torch
22
- from PIL import Image
23
- from transformers import CLIPImageProcessor, CLIPVisionConfig
24
-
25
- from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
26
- from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
27
- from diffusers.utils import floats_tensor, load_image, slow, torch_device
28
- from diffusers.utils.testing_utils import require_torch_gpu
29
-
30
- from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
31
- from ...test_pipelines_common import PipelineTesterMixin
32
-
33
-
34
- torch.backends.cuda.matmul.allow_tf32 = False
35
-
36
-
37
- class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
38
- pipeline_class = PaintByExamplePipeline
39
- params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
40
- batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
41
-
42
- def get_dummy_components(self):
43
- torch.manual_seed(0)
44
- unet = UNet2DConditionModel(
45
- block_out_channels=(32, 64),
46
- layers_per_block=2,
47
- sample_size=32,
48
- in_channels=9,
49
- out_channels=4,
50
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
51
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
52
- cross_attention_dim=32,
53
- )
54
- scheduler = PNDMScheduler(skip_prk_steps=True)
55
- torch.manual_seed(0)
56
- vae = AutoencoderKL(
57
- block_out_channels=[32, 64],
58
- in_channels=3,
59
- out_channels=3,
60
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
61
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
62
- latent_channels=4,
63
- )
64
- torch.manual_seed(0)
65
- config = CLIPVisionConfig(
66
- hidden_size=32,
67
- projection_dim=32,
68
- intermediate_size=37,
69
- layer_norm_eps=1e-05,
70
- num_attention_heads=4,
71
- num_hidden_layers=5,
72
- image_size=32,
73
- patch_size=4,
74
- )
75
- image_encoder = PaintByExampleImageEncoder(config, proj_size=32)
76
- feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
77
-
78
- components = {
79
- "unet": unet,
80
- "scheduler": scheduler,
81
- "vae": vae,
82
- "image_encoder": image_encoder,
83
- "safety_checker": None,
84
- "feature_extractor": feature_extractor,
85
- }
86
- return components
87
-
88
- def convert_to_pt(self, image):
89
- image = np.array(image.convert("RGB"))
90
- image = image[None].transpose(0, 3, 1, 2)
91
- image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
92
- return image
93
-
94
- def get_dummy_inputs(self, device="cpu", seed=0):
95
- # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
96
- image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
97
- image = image.cpu().permute(0, 2, 3, 1)[0]
98
- init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
99
- mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
100
- example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
101
-
102
- if str(device).startswith("mps"):
103
- generator = torch.manual_seed(seed)
104
- else:
105
- generator = torch.Generator(device=device).manual_seed(seed)
106
- inputs = {
107
- "example_image": example_image,
108
- "image": init_image,
109
- "mask_image": mask_image,
110
- "generator": generator,
111
- "num_inference_steps": 2,
112
- "guidance_scale": 6.0,
113
- "output_type": "numpy",
114
- }
115
- return inputs
116
-
117
- def test_paint_by_example_inpaint(self):
118
- components = self.get_dummy_components()
119
-
120
- # make sure here that pndm scheduler skips prk
121
- pipe = PaintByExamplePipeline(**components)
122
- pipe = pipe.to("cpu")
123
- pipe.set_progress_bar_config(disable=None)
124
-
125
- inputs = self.get_dummy_inputs()
126
- output = pipe(**inputs)
127
- image = output.images
128
-
129
- image_slice = image[0, -3:, -3:, -1]
130
-
131
- assert image.shape == (1, 64, 64, 3)
132
- expected_slice = np.array([0.4701, 0.5555, 0.3994, 0.5107, 0.5691, 0.4517, 0.5125, 0.4769, 0.4539])
133
-
134
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
135
-
136
- def test_paint_by_example_image_tensor(self):
137
- device = "cpu"
138
- inputs = self.get_dummy_inputs()
139
- inputs.pop("mask_image")
140
- image = self.convert_to_pt(inputs.pop("image"))
141
- mask_image = image.clamp(0, 1) / 2
142
-
143
- # make sure here that pndm scheduler skips prk
144
- pipe = PaintByExamplePipeline(**self.get_dummy_components())
145
- pipe = pipe.to(device)
146
- pipe.set_progress_bar_config(disable=None)
147
-
148
- output = pipe(image=image, mask_image=mask_image[:, 0], **inputs)
149
- out_1 = output.images
150
-
151
- image = image.cpu().permute(0, 2, 3, 1)[0]
152
- mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0]
153
-
154
- image = Image.fromarray(np.uint8(image)).convert("RGB")
155
- mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB")
156
-
157
- output = pipe(**self.get_dummy_inputs())
158
- out_2 = output.images
159
-
160
- assert out_1.shape == (1, 64, 64, 3)
161
- assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2
162
-
163
-
164
- @slow
165
- @require_torch_gpu
166
- class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
167
- def tearDown(self):
168
- # clean up the VRAM after each test
169
- super().tearDown()
170
- gc.collect()
171
- torch.cuda.empty_cache()
172
-
173
- def test_paint_by_example(self):
174
- # make sure here that pndm scheduler skips prk
175
- init_image = load_image(
176
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
177
- "/paint_by_example/dog_in_bucket.png"
178
- )
179
- mask_image = load_image(
180
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
181
- "/paint_by_example/mask.png"
182
- )
183
- example_image = load_image(
184
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
185
- "/paint_by_example/panda.jpg"
186
- )
187
-
188
- pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
189
- pipe = pipe.to(torch_device)
190
- pipe.set_progress_bar_config(disable=None)
191
-
192
- generator = torch.manual_seed(321)
193
- output = pipe(
194
- image=init_image,
195
- mask_image=mask_image,
196
- example_image=example_image,
197
- generator=generator,
198
- guidance_scale=5.0,
199
- num_inference_steps=50,
200
- output_type="np",
201
- )
202
-
203
- image = output.images
204
-
205
- image_slice = image[0, -3:, -3:, -1]
206
-
207
- assert image.shape == (1, 512, 512, 3)
208
- expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374])
209
-
210
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
diffusers/tests/pipelines/pndm/__init__.py DELETED
File without changes
diffusers/tests/pipelines/pndm/test_pndm.py DELETED
@@ -1,87 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
-
21
- from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel
22
- from diffusers.utils.testing_utils import require_torch, slow, torch_device
23
-
24
-
25
- torch.backends.cuda.matmul.allow_tf32 = False
26
-
27
-
28
- class PNDMPipelineFastTests(unittest.TestCase):
29
- @property
30
- def dummy_uncond_unet(self):
31
- torch.manual_seed(0)
32
- model = UNet2DModel(
33
- block_out_channels=(32, 64),
34
- layers_per_block=2,
35
- sample_size=32,
36
- in_channels=3,
37
- out_channels=3,
38
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
39
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
40
- )
41
- return model
42
-
43
- def test_inference(self):
44
- unet = self.dummy_uncond_unet
45
- scheduler = PNDMScheduler()
46
-
47
- pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
48
- pndm.to(torch_device)
49
- pndm.set_progress_bar_config(disable=None)
50
-
51
- generator = torch.manual_seed(0)
52
- image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images
53
-
54
- generator = torch.manual_seed(0)
55
- image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="numpy", return_dict=False)[0]
56
-
57
- image_slice = image[0, -3:, -3:, -1]
58
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
59
-
60
- assert image.shape == (1, 32, 32, 3)
61
- expected_slice = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])
62
-
63
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
64
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
65
-
66
-
67
- @slow
68
- @require_torch
69
- class PNDMPipelineIntegrationTests(unittest.TestCase):
70
- def test_inference_cifar10(self):
71
- model_id = "google/ddpm-cifar10-32"
72
-
73
- unet = UNet2DModel.from_pretrained(model_id)
74
- scheduler = PNDMScheduler()
75
-
76
- pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
77
- pndm.to(torch_device)
78
- pndm.set_progress_bar_config(disable=None)
79
- generator = torch.manual_seed(0)
80
- image = pndm(generator=generator, output_type="numpy").images
81
-
82
- image_slice = image[0, -3:, -3:, -1]
83
-
84
- assert image.shape == (1, 32, 32, 3)
85
- expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125])
86
-
87
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
diffusers/tests/pipelines/repaint/__init__.py DELETED
File without changes
diffusers/tests/pipelines/repaint/test_repaint.py DELETED
@@ -1,162 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import unittest
18
-
19
- import numpy as np
20
- import torch
21
-
22
- from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel
23
- from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device
24
-
25
- from ...pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS
26
- from ...test_pipelines_common import PipelineTesterMixin
27
-
28
-
29
- torch.backends.cuda.matmul.allow_tf32 = False
30
-
31
-
32
- class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
33
- pipeline_class = RePaintPipeline
34
- params = IMAGE_INPAINTING_PARAMS - {"width", "height", "guidance_scale"}
35
- required_optional_params = PipelineTesterMixin.required_optional_params - {
36
- "latents",
37
- "num_images_per_prompt",
38
- "callback",
39
- "callback_steps",
40
- }
41
- batch_params = IMAGE_INPAINTING_BATCH_PARAMS
42
- test_cpu_offload = False
43
-
44
- def get_dummy_components(self):
45
- torch.manual_seed(0)
46
- torch.manual_seed(0)
47
- unet = UNet2DModel(
48
- block_out_channels=(32, 64),
49
- layers_per_block=2,
50
- sample_size=32,
51
- in_channels=3,
52
- out_channels=3,
53
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
54
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
55
- )
56
- scheduler = RePaintScheduler()
57
- components = {"unet": unet, "scheduler": scheduler}
58
- return components
59
-
60
- def get_dummy_inputs(self, device, seed=0):
61
- if str(device).startswith("mps"):
62
- generator = torch.manual_seed(seed)
63
- else:
64
- generator = torch.Generator(device=device).manual_seed(seed)
65
- image = np.random.RandomState(seed).standard_normal((1, 3, 32, 32))
66
- image = torch.from_numpy(image).to(device=device, dtype=torch.float32)
67
- mask = (image > 0).to(device=device, dtype=torch.float32)
68
- inputs = {
69
- "image": image,
70
- "mask_image": mask,
71
- "generator": generator,
72
- "num_inference_steps": 5,
73
- "eta": 0.0,
74
- "jump_length": 2,
75
- "jump_n_sample": 2,
76
- "output_type": "numpy",
77
- }
78
- return inputs
79
-
80
- def test_repaint(self):
81
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
82
- components = self.get_dummy_components()
83
- sd_pipe = RePaintPipeline(**components)
84
- sd_pipe = sd_pipe.to(device)
85
- sd_pipe.set_progress_bar_config(disable=None)
86
-
87
- inputs = self.get_dummy_inputs(device)
88
- image = sd_pipe(**inputs).images
89
- image_slice = image[0, -3:, -3:, -1]
90
-
91
- assert image.shape == (1, 32, 32, 3)
92
- expected_slice = np.array([1.0000, 0.5426, 0.5497, 0.2200, 1.0000, 1.0000, 0.5623, 1.0000, 0.6274])
93
-
94
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
95
-
96
- @skip_mps
97
- def test_save_load_local(self):
98
- return super().test_save_load_local()
99
-
100
- # RePaint can hardly be made deterministic since the scheduler is currently always
101
- # nondeterministic
102
- @unittest.skip("non-deterministic pipeline")
103
- def test_inference_batch_single_identical(self):
104
- return super().test_inference_batch_single_identical()
105
-
106
- @skip_mps
107
- def test_dict_tuple_outputs_equivalent(self):
108
- return super().test_dict_tuple_outputs_equivalent()
109
-
110
- @skip_mps
111
- def test_save_load_optional_components(self):
112
- return super().test_save_load_optional_components()
113
-
114
- @skip_mps
115
- def test_attention_slicing_forward_pass(self):
116
- return super().test_attention_slicing_forward_pass()
117
-
118
-
119
- @nightly
120
- @require_torch_gpu
121
- class RepaintPipelineNightlyTests(unittest.TestCase):
122
- def tearDown(self):
123
- super().tearDown()
124
- gc.collect()
125
- torch.cuda.empty_cache()
126
-
127
- def test_celebahq(self):
128
- original_image = load_image(
129
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
130
- "repaint/celeba_hq_256.png"
131
- )
132
- mask_image = load_image(
133
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
134
- )
135
- expected_image = load_numpy(
136
- "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
137
- "repaint/celeba_hq_256_result.npy"
138
- )
139
-
140
- model_id = "google/ddpm-ema-celebahq-256"
141
- unet = UNet2DModel.from_pretrained(model_id)
142
- scheduler = RePaintScheduler.from_pretrained(model_id)
143
-
144
- repaint = RePaintPipeline(unet=unet, scheduler=scheduler).to(torch_device)
145
- repaint.set_progress_bar_config(disable=None)
146
- repaint.enable_attention_slicing()
147
-
148
- generator = torch.manual_seed(0)
149
- output = repaint(
150
- original_image,
151
- mask_image,
152
- num_inference_steps=250,
153
- eta=0.0,
154
- jump_length=10,
155
- jump_n_sample=10,
156
- generator=generator,
157
- output_type="np",
158
- )
159
- image = output.images[0]
160
-
161
- assert image.shape == (256, 256, 3)
162
- assert np.abs(expected_image - image).mean() < 1e-2
 
diffusers/tests/pipelines/score_sde_ve/__init__.py DELETED
File without changes
diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py DELETED
@@ -1,91 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import unittest
17
-
18
- import numpy as np
19
- import torch
20
-
21
- from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel
22
- from diffusers.utils.testing_utils import require_torch, slow, torch_device
23
-
24
-
25
- torch.backends.cuda.matmul.allow_tf32 = False
26
-
27
-
28
- class ScoreSdeVeipelineFastTests(unittest.TestCase):
29
- @property
30
- def dummy_uncond_unet(self):
31
- torch.manual_seed(0)
32
- model = UNet2DModel(
33
- block_out_channels=(32, 64),
34
- layers_per_block=2,
35
- sample_size=32,
36
- in_channels=3,
37
- out_channels=3,
38
- down_block_types=("DownBlock2D", "AttnDownBlock2D"),
39
- up_block_types=("AttnUpBlock2D", "UpBlock2D"),
40
- )
41
- return model
42
-
43
- def test_inference(self):
44
- unet = self.dummy_uncond_unet
45
- scheduler = ScoreSdeVeScheduler()
46
-
47
- sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler)
48
- sde_ve.to(torch_device)
49
- sde_ve.set_progress_bar_config(disable=None)
50
-
51
- generator = torch.manual_seed(0)
52
- image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images
53
-
54
- generator = torch.manual_seed(0)
55
- image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[
56
- 0
57
- ]
58
-
59
- image_slice = image[0, -3:, -3:, -1]
60
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
61
-
62
- assert image.shape == (1, 32, 32, 3)
63
- expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
64
-
65
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
66
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
67
-
68
-
69
- @slow
70
- @require_torch
71
- class ScoreSdeVePipelineIntegrationTests(unittest.TestCase):
72
- def test_inference(self):
73
- model_id = "google/ncsnpp-church-256"
74
- model = UNet2DModel.from_pretrained(model_id)
75
-
76
- scheduler = ScoreSdeVeScheduler.from_pretrained(model_id)
77
-
78
- sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
79
- sde_ve.to(torch_device)
80
- sde_ve.set_progress_bar_config(disable=None)
81
-
82
- generator = torch.manual_seed(0)
83
- image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images
84
-
85
- image_slice = image[0, -3:, -3:, -1]
86
-
87
- assert image.shape == (1, 256, 256, 3)
88
-
89
- expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0])
90
-
91
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py DELETED
File without changes
diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py DELETED
@@ -1,601 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2023 HuggingFace Inc.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import gc
17
- import random
18
- import tempfile
19
- import unittest
20
-
21
- import numpy as np
22
- import torch
23
- from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
24
-
25
- from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
26
- from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
27
- from diffusers.utils import floats_tensor, nightly, torch_device
28
- from diffusers.utils.testing_utils import require_torch_gpu
29
-
30
-
31
- torch.backends.cuda.matmul.allow_tf32 = False
32
-
33
-
34
- class SafeDiffusionPipelineFastTests(unittest.TestCase):
35
- def tearDown(self):
36
- # clean up the VRAM after each test
37
- super().tearDown()
38
- gc.collect()
39
- torch.cuda.empty_cache()
40
-
41
- @property
42
- def dummy_image(self):
43
- batch_size = 1
44
- num_channels = 3
45
- sizes = (32, 32)
46
-
47
- image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
48
- return image
49
-
50
- @property
51
- def dummy_cond_unet(self):
52
- torch.manual_seed(0)
53
- model = UNet2DConditionModel(
54
- block_out_channels=(32, 64),
55
- layers_per_block=2,
56
- sample_size=32,
57
- in_channels=4,
58
- out_channels=4,
59
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
60
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
61
- cross_attention_dim=32,
62
- )
63
- return model
64
-
65
- @property
66
- def dummy_vae(self):
67
- torch.manual_seed(0)
68
- model = AutoencoderKL(
69
- block_out_channels=[32, 64],
70
- in_channels=3,
71
- out_channels=3,
72
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
73
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
74
- latent_channels=4,
75
- )
76
- return model
77
-
78
- @property
79
- def dummy_text_encoder(self):
80
- torch.manual_seed(0)
81
- config = CLIPTextConfig(
82
- bos_token_id=0,
83
- eos_token_id=2,
84
- hidden_size=32,
85
- intermediate_size=37,
86
- layer_norm_eps=1e-05,
87
- num_attention_heads=4,
88
- num_hidden_layers=5,
89
- pad_token_id=1,
90
- vocab_size=1000,
91
- )
92
- return CLIPTextModel(config)
93
-
94
- @property
95
- def dummy_extractor(self):
96
- def extract(*args, **kwargs):
97
- class Out:
98
- def __init__(self):
99
- self.pixel_values = torch.ones([0])
100
-
101
- def to(self, device):
102
- self.pixel_values.to(device)
103
- return self
104
-
105
- return Out()
106
-
107
- return extract
108
-
109
- def test_semantic_diffusion_ddim(self):
110
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
111
- unet = self.dummy_cond_unet
112
- scheduler = DDIMScheduler(
113
- beta_start=0.00085,
114
- beta_end=0.012,
115
- beta_schedule="scaled_linear",
116
- clip_sample=False,
117
- set_alpha_to_one=False,
118
- )
119
-
120
- vae = self.dummy_vae
121
- bert = self.dummy_text_encoder
122
- tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
123
-
124
- # make sure here that pndm scheduler skips prk
125
- sd_pipe = StableDiffusionPipeline(
126
- unet=unet,
127
- scheduler=scheduler,
128
- vae=vae,
129
- text_encoder=bert,
130
- tokenizer=tokenizer,
131
- safety_checker=None,
132
- feature_extractor=self.dummy_extractor,
133
- )
134
- sd_pipe = sd_pipe.to(device)
135
- sd_pipe.set_progress_bar_config(disable=None)
136
-
137
- prompt = "A painting of a squirrel eating a burger"
138
-
139
- generator = torch.Generator(device=device).manual_seed(0)
140
- output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
141
- image = output.images
142
-
143
- generator = torch.Generator(device=device).manual_seed(0)
144
- image_from_tuple = sd_pipe(
145
- [prompt],
146
- generator=generator,
147
- guidance_scale=6.0,
148
- num_inference_steps=2,
149
- output_type="np",
150
- return_dict=False,
151
- )[0]
152
-
153
- image_slice = image[0, -3:, -3:, -1]
154
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
155
-
156
- assert image.shape == (1, 64, 64, 3)
157
- expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792])
158
-
159
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
160
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
161
-
162
- def test_semantic_diffusion_pndm(self):
163
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
164
- unet = self.dummy_cond_unet
165
- scheduler = PNDMScheduler(skip_prk_steps=True)
166
- vae = self.dummy_vae
167
- bert = self.dummy_text_encoder
168
- tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
169
-
170
- # make sure here that pndm scheduler skips prk
171
- sd_pipe = StableDiffusionPipeline(
172
- unet=unet,
173
- scheduler=scheduler,
174
- vae=vae,
175
- text_encoder=bert,
176
- tokenizer=tokenizer,
177
- safety_checker=None,
178
- feature_extractor=self.dummy_extractor,
179
- )
180
- sd_pipe = sd_pipe.to(device)
181
- sd_pipe.set_progress_bar_config(disable=None)
182
-
183
- prompt = "A painting of a squirrel eating a burger"
184
- generator = torch.Generator(device=device).manual_seed(0)
185
- output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
186
-
187
- image = output.images
188
-
189
- generator = torch.Generator(device=device).manual_seed(0)
190
- image_from_tuple = sd_pipe(
191
- [prompt],
192
- generator=generator,
193
- guidance_scale=6.0,
194
- num_inference_steps=2,
195
- output_type="np",
196
- return_dict=False,
197
- )[0]
198
-
199
- image_slice = image[0, -3:, -3:, -1]
200
- image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
201
-
202
- assert image.shape == (1, 64, 64, 3)
203
- expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945])
204
-
205
- assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
206
- assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
207
-
208
- def test_semantic_diffusion_no_safety_checker(self):
209
- pipe = StableDiffusionPipeline.from_pretrained(
210
- "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
211
- )
212
- assert isinstance(pipe, StableDiffusionPipeline)
213
- assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
214
- assert pipe.safety_checker is None
215
-
216
- image = pipe("example prompt", num_inference_steps=2).images[0]
217
- assert image is not None
218
-
219
- # check that there's no error when saving a pipeline with one of the models being None
220
- with tempfile.TemporaryDirectory() as tmpdirname:
221
- pipe.save_pretrained(tmpdirname)
222
- pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
223
-
224
- # sanity check that the pipeline still works
225
- assert pipe.safety_checker is None
226
- image = pipe("example prompt", num_inference_steps=2).images[0]
227
- assert image is not None
228
-
229
- @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
230
- def test_semantic_diffusion_fp16(self):
231
- """Test that stable diffusion works with fp16"""
232
- unet = self.dummy_cond_unet
233
- scheduler = PNDMScheduler(skip_prk_steps=True)
234
- vae = self.dummy_vae
235
- bert = self.dummy_text_encoder
236
- tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
237
-
238
- # put models in fp16
239
- unet = unet.half()
240
- vae = vae.half()
241
- bert = bert.half()
242
-
243
- # make sure here that pndm scheduler skips prk
244
- sd_pipe = StableDiffusionPipeline(
245
- unet=unet,
246
- scheduler=scheduler,
247
- vae=vae,
248
- text_encoder=bert,
249
- tokenizer=tokenizer,
250
- safety_checker=None,
251
- feature_extractor=self.dummy_extractor,
252
- )
253
- sd_pipe = sd_pipe.to(torch_device)
254
- sd_pipe.set_progress_bar_config(disable=None)
255
-
256
- prompt = "A painting of a squirrel eating a burger"
257
- image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
258
-
259
- assert image.shape == (1, 64, 64, 3)
260
-
261
-
262
- @nightly
263
- @require_torch_gpu
264
- class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase):
265
- def tearDown(self):
266
- # clean up the VRAM after each test
267
- super().tearDown()
268
- gc.collect()
269
- torch.cuda.empty_cache()
270
-
271
- def test_positive_guidance(self):
272
- torch_device = "cuda"
273
- pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
274
- pipe = pipe.to(torch_device)
275
-         pipe.set_progress_bar_config(disable=None)
-
-         prompt = "a photo of a cat"
-         edit = {
-             "editing_prompt": ["sunglasses"],
-             "reverse_editing_direction": [False],
-             "edit_warmup_steps": 10,
-             "edit_guidance_scale": 6,
-             "edit_threshold": 0.95,
-             "edit_momentum_scale": 0.5,
-             "edit_mom_beta": 0.6,
-         }
-
-         seed = 3
-         guidance_scale = 7
-
-         # no sega enabled
-         generator = torch.Generator(torch_device)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.34673113,
-             0.38492733,
-             0.37597352,
-             0.34086335,
-             0.35650748,
-             0.35579205,
-             0.3384763,
-             0.34340236,
-             0.3573271,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-         # with sega enabled
-         # generator = torch.manual_seed(seed)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-             **edit,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.41887826,
-             0.37728766,
-             0.30138272,
-             0.41416335,
-             0.41664985,
-             0.36283392,
-             0.36191246,
-             0.43364465,
-             0.43001732,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-     def test_negative_guidance(self):
-         torch_device = "cuda"
-         pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-         pipe = pipe.to(torch_device)
-         pipe.set_progress_bar_config(disable=None)
-
-         prompt = "an image of a crowded boulevard, realistic, 4k"
-         edit = {
-             "editing_prompt": "crowd, crowded, people",
-             "reverse_editing_direction": True,
-             "edit_warmup_steps": 10,
-             "edit_guidance_scale": 8.3,
-             "edit_threshold": 0.9,
-             "edit_momentum_scale": 0.5,
-             "edit_mom_beta": 0.6,
-         }
-
-         seed = 9
-         guidance_scale = 7
-
-         # no sega enabled
-         generator = torch.Generator(torch_device)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.43497998,
-             0.91814065,
-             0.7540739,
-             0.55580205,
-             0.8467265,
-             0.5389691,
-             0.62574506,
-             0.58897763,
-             0.50926757,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-         # with sega enabled
-         # generator = torch.manual_seed(seed)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-             **edit,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.3089719,
-             0.30500144,
-             0.29016042,
-             0.30630964,
-             0.325687,
-             0.29419225,
-             0.2908091,
-             0.28723598,
-             0.27696294,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-     def test_multi_cond_guidance(self):
-         torch_device = "cuda"
-         pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-         pipe = pipe.to(torch_device)
-         pipe.set_progress_bar_config(disable=None)
-
-         prompt = "a castle next to a river"
-         edit = {
-             "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
-             "reverse_editing_direction": False,
-             "edit_warmup_steps": [15, 18],
-             "edit_guidance_scale": 6,
-             "edit_threshold": [0.9, 0.8],
-             "edit_momentum_scale": 0.5,
-             "edit_mom_beta": 0.6,
-         }
-
-         seed = 48
-         guidance_scale = 7
-
-         # no sega enabled
-         generator = torch.Generator(torch_device)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.75163555,
-             0.76037145,
-             0.61785,
-             0.9189673,
-             0.8627701,
-             0.85189694,
-             0.8512813,
-             0.87012076,
-             0.8312857,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-         # with sega enabled
-         # generator = torch.manual_seed(seed)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-             **edit,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.73553365,
-             0.7537271,
-             0.74341905,
-             0.66480356,
-             0.6472925,
-             0.63039416,
-             0.64812905,
-             0.6749717,
-             0.6517102,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-     def test_guidance_fp16(self):
-         torch_device = "cuda"
-         pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
-         pipe = pipe.to(torch_device)
-         pipe.set_progress_bar_config(disable=None)
-
-         prompt = "a photo of a cat"
-         edit = {
-             "editing_prompt": ["sunglasses"],
-             "reverse_editing_direction": [False],
-             "edit_warmup_steps": 10,
-             "edit_guidance_scale": 6,
-             "edit_threshold": 0.95,
-             "edit_momentum_scale": 0.5,
-             "edit_mom_beta": 0.6,
-         }
-
-         seed = 3
-         guidance_scale = 7
-
-         # no sega enabled
-         generator = torch.Generator(torch_device)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.34887695,
-             0.3876953,
-             0.375,
-             0.34423828,
-             0.3581543,
-             0.35717773,
-             0.3383789,
-             0.34570312,
-             0.359375,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-         # with sega enabled
-         # generator = torch.manual_seed(seed)
-         generator.manual_seed(seed)
-         output = pipe(
-             [prompt],
-             generator=generator,
-             guidance_scale=guidance_scale,
-             num_inference_steps=50,
-             output_type="np",
-             width=512,
-             height=512,
-             **edit,
-         )
-
-         image = output.images
-         image_slice = image[0, -3:, -3:, -1]
-         expected_slice = [
-             0.42285156,
-             0.36914062,
-             0.29077148,
-             0.42041016,
-             0.41918945,
-             0.35498047,
-             0.3618164,
-             0.4423828,
-             0.43115234,
-         ]
-
-         assert image.shape == (1, 512, 512, 3)
-
-         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
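Every integration test deleted in this file follows the same pattern: generate once with the plain pipeline, then regenerate with the SEGA `edit` kwargs unpacked into the call and compare fixed image slices. Below is a minimal sketch of that call pattern, assuming `SemanticStableDiffusionPipeline` is available in the installed diffusers version; the model id, prompt, and edit values are illustrative and simply mirror the deleted tests.

```python
import torch

from diffusers import SemanticStableDiffusionPipeline  # assumed available in this diffusers version

# Load the pipeline (illustrative checkpoint, same one the deleted tests use).
pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")

generator = torch.Generator("cuda").manual_seed(3)
output = pipe(
    ["a photo of a cat"],
    generator=generator,
    guidance_scale=7,
    num_inference_steps=50,
    output_type="np",
    # SEGA-specific arguments, matching the `edit` dict used in the tests above
    editing_prompt=["sunglasses"],
    reverse_editing_direction=[False],
    edit_warmup_steps=10,
    edit_guidance_scale=6,
    edit_threshold=0.95,
    edit_momentum_scale=0.5,
    edit_mom_beta=0.6,
)
image = output.images[0]  # numpy array, 512x512x3 at the default resolution
```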
diffusers/tests/pipelines/spectrogram_diffusion/__init__.py DELETED
File without changes
diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py DELETED
@@ -1,235 +0,0 @@
- # coding=utf-8
- # Copyright 2022 HuggingFace Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import gc
- import unittest
-
- import numpy as np
- import torch
-
- from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline
- from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder
- from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device
- from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime
-
- from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS
- from ...test_pipelines_common import PipelineTesterMixin
-
-
- torch.backends.cuda.matmul.allow_tf32 = False
-
-
- MIDI_FILE = "./tests/fixtures/elise_format0.mid"
-
-
- class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-     pipeline_class = SpectrogramDiffusionPipeline
-     required_optional_params = PipelineTesterMixin.required_optional_params - {
-         "callback",
-         "latents",
-         "callback_steps",
-         "output_type",
-         "num_images_per_prompt",
-     }
-     test_attention_slicing = False
-     test_cpu_offload = False
-     batch_params = TOKENS_TO_AUDIO_GENERATION_PARAMS
-     params = TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS
-
-     def get_dummy_components(self):
-         torch.manual_seed(0)
-         notes_encoder = SpectrogramNotesEncoder(
-             max_length=2048,
-             vocab_size=1536,
-             d_model=768,
-             dropout_rate=0.1,
-             num_layers=1,
-             num_heads=1,
-             d_kv=4,
-             d_ff=2048,
-             feed_forward_proj="gated-gelu",
-         )
-
-         continuous_encoder = SpectrogramContEncoder(
-             input_dims=128,
-             targets_context_length=256,
-             d_model=768,
-             dropout_rate=0.1,
-             num_layers=1,
-             num_heads=1,
-             d_kv=4,
-             d_ff=2048,
-             feed_forward_proj="gated-gelu",
-         )
-
-         decoder = T5FilmDecoder(
-             input_dims=128,
-             targets_length=256,
-             max_decoder_noise_time=20000.0,
-             d_model=768,
-             num_layers=1,
-             num_heads=1,
-             d_kv=4,
-             d_ff=2048,
-             dropout_rate=0.1,
-         )
-
-         scheduler = DDPMScheduler()
-
-         components = {
-             "notes_encoder": notes_encoder.eval(),
-             "continuous_encoder": continuous_encoder.eval(),
-             "decoder": decoder.eval(),
-             "scheduler": scheduler,
-             "melgan": None,
-         }
-         return components
-
-     def get_dummy_inputs(self, device, seed=0):
-         if str(device).startswith("mps"):
-             generator = torch.manual_seed(seed)
-         else:
-             generator = torch.Generator(device=device).manual_seed(seed)
-         inputs = {
-             "input_tokens": [
-                 [1134, 90, 1135, 1133, 1080, 112, 1132, 1080, 1133, 1079, 133, 1132, 1079, 1133, 1] + [0] * 2033
-             ],
-             "generator": generator,
-             "num_inference_steps": 4,
-             "output_type": "mel",
-         }
-         return inputs
-
-     def test_spectrogram_diffusion(self):
-         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-         components = self.get_dummy_components()
-         pipe = SpectrogramDiffusionPipeline(**components)
-         pipe = pipe.to(device)
-         pipe.set_progress_bar_config(disable=None)
-
-         inputs = self.get_dummy_inputs(device)
-         output = pipe(**inputs)
-         mel = output.audios
-
-         mel_slice = mel[0, -3:, -3:]
-
-         assert mel_slice.shape == (3, 3)
-         expected_slice = np.array(
-             [-11.512925, -4.788215, -0.46172905, -2.051715, -10.539147, -10.970963, -9.091634, 4.0, 4.0]
-         )
-         assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2
-
-     @skip_mps
-     def test_save_load_local(self):
-         return super().test_save_load_local()
-
-     @skip_mps
-     def test_dict_tuple_outputs_equivalent(self):
-         return super().test_dict_tuple_outputs_equivalent()
-
-     @skip_mps
-     def test_save_load_optional_components(self):
-         return super().test_save_load_optional_components()
-
-     @skip_mps
-     def test_attention_slicing_forward_pass(self):
-         return super().test_attention_slicing_forward_pass()
-
-     def test_inference_batch_single_identical(self):
-         pass
-
-     def test_inference_batch_consistent(self):
-         pass
-
-     @skip_mps
-     def test_progress_bar(self):
-         return super().test_progress_bar()
-
-
- @slow
- @require_torch_gpu
- @require_onnxruntime
- @require_note_seq
- class PipelineIntegrationTests(unittest.TestCase):
-     def tearDown(self):
-         # clean up the VRAM after each test
-         super().tearDown()
-         gc.collect()
-         torch.cuda.empty_cache()
-
-     def test_callback(self):
-         # TODO - test that pipeline can decode tokens in a callback
-         # so that music can be played live
-         device = torch_device
-
-         pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
-         melgan = pipe.melgan
-         pipe.melgan = None
-
-         pipe = pipe.to(device)
-         pipe.set_progress_bar_config(disable=None)
-
-         def callback(step, mel_output):
-             # decode mel to audio
-             audio = melgan(input_features=mel_output.astype(np.float32))[0]
-             assert len(audio[0]) == 81920 * (step + 1)
-             # simulate that audio is played
-             return audio
-
-         processor = MidiProcessor()
-         input_tokens = processor(MIDI_FILE)
-
-         input_tokens = input_tokens[:3]
-         generator = torch.manual_seed(0)
-         pipe(input_tokens, num_inference_steps=5, generator=generator, callback=callback, output_type="mel")
-
-     def test_spectrogram_fast(self):
-         device = torch_device
-
-         pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
-         pipe = pipe.to(device)
-         pipe.set_progress_bar_config(disable=None)
-         processor = MidiProcessor()
-
-         input_tokens = processor(MIDI_FILE)
-         # just run two denoising loops
-         input_tokens = input_tokens[:2]
-
-         generator = torch.manual_seed(0)
-         output = pipe(input_tokens, num_inference_steps=2, generator=generator)
-
-         audio = output.audios[0]
-
-         assert abs(np.abs(audio).sum() - 3612.841) < 1e-1
-
-     def test_spectrogram(self):
-         device = torch_device
-
-         pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion")
-         pipe = pipe.to(device)
-         pipe.set_progress_bar_config(disable=None)
-
-         processor = MidiProcessor()
-
-         input_tokens = processor(MIDI_FILE)
-
-         # just run 4 denoising loops
-         input_tokens = input_tokens[:4]
-
-         generator = torch.manual_seed(0)
-         output = pipe(input_tokens, num_inference_steps=100, generator=generator)
-
-         audio = output.audios[0]
-         assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2
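The removed spectrogram tests all share the same flow: `MidiProcessor` turns a MIDI file into batches of note tokens, and `SpectrogramDiffusionPipeline` denoises each batch into a mel spectrogram that the bundled MelGAN vocoder decodes to audio. Below is a minimal sketch of that flow, assuming the `google/music-spectrogram-diffusion` checkpoint and the note_seq/onnxruntime dependencies (which the integration tests require) are installed; the MIDI path is the test fixture used above.

```python
import torch

from diffusers import MidiProcessor, SpectrogramDiffusionPipeline

pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion").to("cuda")

processor = MidiProcessor()
# Each entry in input_tokens corresponds to one segment of the piece.
input_tokens = processor("./tests/fixtures/elise_format0.mid")

generator = torch.manual_seed(0)
# Limit to two segments and two steps for a quick smoke run, as in test_spectrogram_fast above.
output = pipe(input_tokens[:2], num_inference_steps=2, generator=generator)
audio = output.audios[0]  # waveform decoded by the pipeline's MelGAN vocoder
```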
diffusers/tests/pipelines/stable_diffusion/__init__.py DELETED
File without changes