ginipick committed on
Commit
de17a4e
·
verified ·
1 Parent(s): d6a6a48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -155
app.py CHANGED
@@ -98,47 +98,9 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
98
  return video_save_path
99
 
100
 
101
- @spaces.GPU(duration=120)
102
- @torch.inference_mode()
103
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
104
- duration: float):
105
-
106
- rng = torch.Generator(device=device)
107
- if seed >= 0:
108
- rng.manual_seed(seed)
109
- else:
110
- rng.seed()
111
- fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
112
-
113
- clip_frames = sync_frames = None
114
- seq_cfg.duration = duration
115
- net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
116
-
117
- audios = generate(clip_frames,
118
- sync_frames, [prompt],
119
- negative_text=[negative_prompt],
120
- feature_utils=feature_utils,
121
- net=net,
122
- fm=fm,
123
- rng=rng,
124
- cfg_strength=cfg_strength)
125
- audio = audios.float().cpu()[0]
126
-
127
- audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
128
- torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
129
- log.info(f'Saved audio to {audio_save_path}')
130
- return audio_save_path
131
-
132
-
133
  video_to_audio_tab = gr.Interface(
134
  fn=video_to_audio,
135
- description="""
136
- Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
137
- Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
138
 
139
- NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
140
- Doing so does not improve results.
141
- """,
142
  inputs=[
143
  gr.Video(),
144
  gr.Text(label='Prompt'),
@@ -149,124 +111,7 @@ video_to_audio_tab = gr.Interface(
149
  gr.Number(label='Duration (sec)', value=8, minimum=1),
150
  ],
151
  outputs='playable_video',
152
- cache_examples=False,
153
- title='MMAudio — Video-to-Audio Synthesis',
154
- examples=[
155
- [
156
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
157
- 'waves, seagulls',
158
- '',
159
- 0,
160
- 25,
161
- 4.5,
162
- 10,
163
- ],
164
- [
165
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
166
- '',
167
- 'music',
168
- 0,
169
- 25,
170
- 4.5,
171
- 10,
172
- ],
173
- [
174
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
175
- 'bubbles',
176
- '',
177
- 0,
178
- 25,
179
- 4.5,
180
- 10,
181
- ],
182
- [
183
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
184
- 'Indian holy music',
185
- '',
186
- 0,
187
- 25,
188
- 4.5,
189
- 10,
190
- ],
191
- [
192
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
193
- 'galloping',
194
- '',
195
- 0,
196
- 25,
197
- 4.5,
198
- 10,
199
- ],
200
- [
201
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
202
- 'waves, storm',
203
- '',
204
- 0,
205
- 25,
206
- 4.5,
207
- 10,
208
- ],
209
- [
210
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
211
- '',
212
- '',
213
- 0,
214
- 25,
215
- 4.5,
216
- 10,
217
- ],
218
- [
219
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
220
- 'storm',
221
- '',
222
- 0,
223
- 25,
224
- 4.5,
225
- 10,
226
- ],
227
- [
228
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
229
- '',
230
- '',
231
- 0,
232
- 25,
233
- 4.5,
234
- 10,
235
- ],
236
- [
237
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
238
- 'typing',
239
- '',
240
- 0,
241
- 25,
242
- 4.5,
243
- 10,
244
- ],
245
- [
246
- 'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
247
- '',
248
- '',
249
- 0,
250
- 25,
251
- 4.5,
252
- 10,
253
- ],
254
- ])
255
 
256
- text_to_audio_tab = gr.Interface(
257
- fn=text_to_audio,
258
- inputs=[
259
- gr.Text(label='Prompt'),
260
- gr.Text(label='Negative prompt'),
261
- gr.Number(label='Seed', value=0, precision=0, minimum=0),
262
- gr.Number(label='Num steps', value=25, precision=0, minimum=1),
263
- gr.Number(label='Guidance Strength', value=4.5, minimum=1),
264
- gr.Number(label='Duration (sec)', value=8, minimum=1),
265
- ],
266
- outputs='audio',
267
- cache_examples=False,
268
- title='MMAudio — Text-to-Audio Synthesis',
269
- )
270
 
271
  if __name__ == "__main__":
272
  gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
 
98
  return video_save_path
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  video_to_audio_tab = gr.Interface(
102
  fn=video_to_audio,
 
 
 
103
 
 
 
 
104
  inputs=[
105
  gr.Video(),
106
  gr.Text(label='Prompt'),
 
111
  gr.Number(label='Duration (sec)', value=8, minimum=1),
112
  ],
113
  outputs='playable_video',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  if __name__ == "__main__":
117
  gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],