@@ -73,14 +73,15 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def get_dummy_components(self):
         torch.manual_seed(0)
         unet = AudioLDM2UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
+            block_out_channels=(8, 16),
+            layers_per_block=1,
+            norm_num_groups=8,
             sample_size=32,
             in_channels=4,
             out_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
             up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=([None, 16, 32], [None, 16, 32]),
+            cross_attention_dim=(8, 16),
         )
         scheduler = DDIMScheduler(
             beta_start=0.00085,
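Note on the added norm_num_groups=8: once block_out_channels shrinks to (8, 16), the UNet's group-norm layers can no longer use the diffusers default of 32 groups, because GroupNorm requires the channel count to be divisible by the group count. A minimal plain-PyTorch sketch of that constraint (illustrative only, not part of this commit):

import torch
from torch import nn

# 16 channels split into 8 groups is valid: 16 % 8 == 0
ok = nn.GroupNorm(num_groups=8, num_channels=16)
print(ok(torch.randn(1, 16, 4, 4)).shape)

# the default of 32 groups does not divide 16, so construction fails
try:
    nn.GroupNorm(num_groups=32, num_channels=16)
except ValueError as err:
    print(err)  # num_channels must be divisible by num_groups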
@@ -91,9 +92,10 @@ def get_dummy_components(self):
         )
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 64],
+            block_out_channels=[8, 16],
             in_channels=1,
             out_channels=1,
+            norm_num_groups=8,
             down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
             up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
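The same divisibility reasoning applies to the VAE, hence the explicit norm_num_groups=8 there too. For a rough sense of how much the shrink buys, here is a hedged sketch that builds the tiny AutoencoderKL with the same kwargs as above and counts parameters (assuming a diffusers version that accepts these arguments; exact counts vary by version):

from diffusers import AutoencoderKL

def vae_param_count(block_out_channels):
    # same configuration as the test's dummy VAE, varying only the width
    vae = AutoencoderKL(
        block_out_channels=block_out_channels,
        in_channels=1,
        out_channels=1,
        norm_num_groups=8,
        down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
        up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
        latent_channels=4,
    )
    return sum(p.numel() for p in vae.parameters())

print(vae_param_count([8, 16]), "vs", vae_param_count([32, 64]))  # new vs old width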
@@ -102,32 +104,34 @@ def get_dummy_components(self):
         text_branch_config = ClapTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=16,
+            hidden_size=8,
             intermediate_size=37,
             layer_norm_eps=1e-05,
-            num_attention_heads=2,
-            num_hidden_layers=2,
+            num_attention_heads=1,
+            num_hidden_layers=1,
             pad_token_id=1,
             vocab_size=1000,
-            projection_dim=16,
+            projection_dim=8,
         )
         audio_branch_config = ClapAudioConfig(
-            spec_size=64,
+            spec_size=8,
             window_size=4,
-            num_mel_bins=64,
+            num_mel_bins=8,
             intermediate_size=37,
             layer_norm_eps=1e-05,
-            depths=[2, 2],
-            num_attention_heads=[2, 2],
-            num_hidden_layers=2,
+            depths=[1, 1],
+            num_attention_heads=[1, 1],
+            num_hidden_layers=1,
             hidden_size=192,
-            projection_dim=16,
+            projection_dim=8,
             patch_size=2,
             patch_stride=2,
             patch_embed_input_channels=4,
         )
         text_encoder_config = ClapConfig.from_text_audio_configs(
-            text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=16
+            text_config=text_branch_config,
+            audio_config=audio_branch_config,
+            projection_dim=16,
         )
         text_encoder = ClapModel(text_encoder_config)
         tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
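ClapConfig.from_text_audio_configs stitches the two branch configs into a single CLAP config; any field not set explicitly falls back to the transformers defaults. A self-contained sketch of the same assembly with only the size-relevant fields (illustrative; the test above passes the full kwarg set):

from transformers import ClapAudioConfig, ClapConfig, ClapTextConfig

text_cfg = ClapTextConfig(hidden_size=8, num_attention_heads=1, num_hidden_layers=1)
audio_cfg = ClapAudioConfig(spec_size=8, num_mel_bins=8, depths=[1, 1], num_attention_heads=[1, 1])
cfg = ClapConfig.from_text_audio_configs(
    text_config=text_cfg,
    audio_config=audio_cfg,
    projection_dim=16,
)
print(cfg.projection_dim, cfg.text_config.hidden_size, cfg.audio_config.depths)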
@@ -141,17 +145,17 @@ def get_dummy_components(self):
             d_model=32,
             d_ff=37,
             d_kv=8,
-            num_heads=2,
-            num_layers=2,
+            num_heads=1,
+            num_layers=1,
         )
         text_encoder_2 = T5EncoderModel(text_encoder_2_config)
         tokenizer_2 = T5Tokenizer.from_pretrained("hf-internal-testing/tiny-random-T5Model", model_max_length=77)

         torch.manual_seed(0)
         language_model_config = GPT2Config(
             n_embd=16,
-            n_head=2,
-            n_layer=2,
+            n_head=1,
+            n_layer=1,
             vocab_size=1000,
             n_ctx=99,
             n_positions=99,
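Shrinking the GPT-2 language model to one head and one layer is safe because the only hard constraint GPT2Config places on these fields is that n_embd is divisible by n_head. A hedged sketch standing up the same tiny generator (illustrative; the hunk only shows the config, so the model class here is assumed):

from transformers import GPT2Config, GPT2Model

config = GPT2Config(n_embd=16, n_head=1, n_layer=1, vocab_size=1000, n_ctx=99, n_positions=99)
language_model = GPT2Model(config)
print(sum(p.numel() for p in language_model.parameters()))  # roughly 2e4 parameters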
@@ -160,7 +164,11 @@ def get_dummy_components(self):
         language_model.config.max_new_tokens = 8

         torch.manual_seed(0)
-        projection_model = AudioLDM2ProjectionModel(text_encoder_dim=16, text_encoder_1_dim=32, langauge_model_dim=16)
+        projection_model = AudioLDM2ProjectionModel(
+            text_encoder_dim=16,
+            text_encoder_1_dim=32,
+            langauge_model_dim=16,
+        )

         vocoder_config = SpeechT5HifiGanConfig(
             model_in_dim=8,
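One detail worth flagging in the reflowed call: langauge_model_dim (sic) appears on both sides of the diff, so it is the keyword the AudioLDM2ProjectionModel constructor actually defines at the time of this commit, and the misspelling is kept deliberately. A quick, non-invasive way to confirm against an installed version (import path assumed from the diffusers package layout):

import inspect

from diffusers.pipelines.audioldm2.modeling_audioldm2 import AudioLDM2ProjectionModel

# prints the constructor keywords, including the misspelled langauge_model_dim
print(inspect.signature(AudioLDM2ProjectionModel.__init__))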
@@ -220,7 +228,18 @@ def test_audioldm2_ddim(self):

         audio_slice = audio[:10]
         expected_slice = np.array(
-            [0.0025, 0.0018, 0.0018, -0.0023, -0.0026, -0.0020, -0.0026, -0.0021, -0.0027, -0.0020]
+            [
+                2.602e-03,
+                1.729e-03,
+                1.863e-03,
+                -2.219e-03,
+                -2.656e-03,
+                -2.017e-03,
+                -2.648e-03,
+                -2.115e-03,
+                -2.502e-03,
+                -2.081e-03,
+            ]
         )

         assert np.abs(audio_slice - expected_slice).max() < 1e-4
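Why the expected slice gains a significant digit and switches to scientific notation: the assertion bound is 1e-4, and the retuned dummy components land close to, but not within, that distance of the old 4-decimal constants. A quick check on the first three entries (illustrative):

import numpy as np

old = np.array([0.0025, 0.0018, 0.0018])           # previous 4-decimal constants
new = np.array([2.602e-03, 1.729e-03, 1.863e-03])  # retuned higher-precision values
print(np.abs(old - new).max())         # ~1.02e-04
print(np.abs(old - new).max() < 1e-4)  # False: the old constants would fail the assert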
@@ -361,7 +380,7 @@ def test_audioldm2_negative_prompt(self):

         audio_slice = audio[:10]
         expected_slice = np.array(
-            [0.0025, 0.0018, 0.0018, -0.0023, -0.0026, -0.0020, -0.0026, -0.0021, -0.0027, -0.0020]
+            [0.0026, 0.0017, 0.0018, -0.0022, -0.0026, -0.002, -0.0026, -0.0021, -0.0025, -0.0021]
         )

         assert np.abs(audio_slice - expected_slice).max() < 1e-4
@@ -388,7 +407,7 @@ def test_audioldm2_num_waveforms_per_prompt(self):
         assert audios.shape == (batch_size, 256)

         # test num_waveforms_per_prompt for single prompt
-        num_waveforms_per_prompt = 2
+        num_waveforms_per_prompt = 1
         audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios

         assert audios.shape == (num_waveforms_per_prompt, 256)