Update app.py
app.py
CHANGED
@@ -62,18 +62,31 @@ def analyze_lyrics(lyrics):
 def calculate_generation_params(lyrics):
     sections, total_sections, total_lines, section_lines = analyze_lyrics(lyrics)
 
-    # Calculate the base token counts
-
-
-    chorus_tokens = section_lines['chorus'] * (base_tokens_per_line * 1.5)  # choruses get 50% more tokens
-    bridge_tokens = section_lines['bridge'] * base_tokens_per_line
+    # Base token budget (assuming 1 token ≈ 0.02 s)
+    seconds_per_line = 3  # 3 seconds per lyric line on average
+    target_duration = 0   # target duration in seconds
 
-    #
-
+    # Accumulate duration per section type
+    for section_type in ['verse', 'chorus', 'bridge']:
+        lines = section_lines[section_type]
+        if section_type == 'chorus':
+            # Allocate more time to choruses
+            target_duration += lines * seconds_per_line * 1.5
+        else:
+            target_duration += lines * seconds_per_line
 
-    #
-
+    # Token count (about 50 tokens per second)
+    tokens_per_second = 50
+    total_tokens = int(target_duration * tokens_per_second)
 
+    # Segment count based on the estimated duration
+    if target_duration > 180:  # over 3 minutes
+        num_segments = 4
+    elif target_duration > 120:  # over 2 minutes
+        num_segments = 3
+    else:
+        num_segments = 2
+
     # Clamp the token count
     max_tokens = min(32000, max(3000, total_tokens))
 
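The arithmetic in this hunk is easy to trace by hand. Below is a minimal standalone sketch; analyze_lyrics itself is not shown in this diff, so count_section_lines is a hypothetical stand-in that assumes lyrics use bracketed section tags such as [verse] and [chorus]:

import re

def count_section_lines(lyrics):
    # Hypothetical stand-in for the app's analyze_lyrics helper (not shown in
    # this diff); assumes bracketed section tags such as [verse] / [chorus].
    counts = {'verse': 0, 'chorus': 0, 'bridge': 0}
    current = None
    for line in lyrics.strip().splitlines():
        tag = re.match(r'\[(verse|chorus|bridge)\]', line.strip(), re.IGNORECASE)
        if tag:
            current = tag.group(1).lower()
        elif line.strip() and current:
            counts[current] += 1
    return counts

# Worked example: 8 verse lines and 6 chorus lines.
section_lines = count_section_lines("[verse]\n" + "la\n" * 8 + "[chorus]\n" + "la\n" * 6)
assert section_lines == {'verse': 8, 'chorus': 6, 'bridge': 0}

target_duration = (section_lines['verse'] * 3            # 24 s
                   + section_lines['chorus'] * 3 * 1.5   # 27 s (chorus weighted 1.5x)
                   + section_lines['bridge'] * 3)        # 51 s in total
total_tokens = int(target_duration * 50)                 # 51 * 50 = 2550 tokens
max_tokens = min(32000, max(3000, total_tokens))         # the floor of 3000 applies
num_segments = 2                                         # 51 s is below the 120 s threshold

So short lyrics are padded up to the 3000-token floor, while only songs estimated at over two minutes get more than two segments.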
@@ -81,9 +94,19 @@ def calculate_generation_params(lyrics):
         'max_tokens': max_tokens,
         'num_segments': num_segments,
         'sections': sections,
-        'section_lines': section_lines
+        'section_lines': section_lines,
+        'estimated_duration': target_duration
     }
 
+def get_audio_duration(file_path):
+    try:
+        import librosa
+        duration = librosa.get_duration(path=file_path)
+        return duration
+    except Exception as e:
+        logging.error(f"Failed to get audio duration: {e}")
+        return None
+
 # Language detection and model selection
 def detect_and_select_model(text):
     if re.search(r'[\u3131-\u318E\uAC00-\uD7A3]', text):  # Hangul
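Two notes on this hunk. librosa.get_duration(path=...) uses the keyword spelling introduced in librosa 0.10 (older releases used filename=), and the broad try/except also absorbs a version mismatch. The Hangul test in detect_and_select_model combines two Unicode ranges; a quick illustration:

import re

# The character classes used by detect_and_select_model:
#   \u3131-\u318E  Hangul compatibility jamo (e.g. ㄱ, ㅏ)
#   \uAC00-\uD7A3  precomposed Hangul syllables (가 through 힣)
HANGUL = re.compile(r'[\u3131-\u318E\uAC00-\uD7A3]')

print(bool(HANGUL.search("이 순간을 기억해 forever")))  # True  -> Korean model
print(bool(HANGUL.search("Stay with me forever")))      # False -> default model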
@@ -239,6 +262,13 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
     logging.info(f"Selected model: {model_path}")
     logging.info(f"Lyrics analysis: {params}")
 
+    # Check whether the lyrics contain chorus sections
+    has_chorus = params['sections']['chorus'] > 0
+    estimated_duration = params.get('estimated_duration', 60)  # default: 60 seconds
+
+    logging.info(f"Estimated duration: {estimated_duration} seconds")
+    logging.info(f"Has chorus sections: {has_chorus}")
+
     # Parameters actually used for generation
     actual_num_segments = config['num_segments']
     actual_max_tokens = config['max_tokens']
@@ -253,7 +283,7 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
     os.makedirs(output_dir, exist_ok=True)
     empty_output_folder(output_dir)
 
-    # Build the base command
+    # Build the base command
     command = [
         "python", "infer.py",
         "--stage1_model", model_path,
@@ -264,9 +294,16 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
         "--stage2_batch_size", str(config['batch_size']),
         "--output_dir", output_dir,
         "--cuda_idx", "0",
-        "--max_new_tokens", str(actual_max_tokens)
+        "--max_new_tokens", str(actual_max_tokens),
+        "--keep_intermediate"
     ]
 
+    if has_chorus:
+        command.extend([
+            "--segment_duration", str(int(estimated_duration / actual_num_segments)),
+            "--enhance_chorus"
+        ])
+
     # Extra options only when a GPU is available
     if torch.cuda.is_available():
         command.append("--disable_offload_model")
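The flags --keep_intermediate, --segment_duration, and --enhance_chorus belong to the repository's bundled infer.py; that script is not shown here, so treat them as app-specific rather than standard YuE options. The assembled list is presumably executed without a shell, along these lines (a sketch, not the app's actual call site):

import logging
import subprocess

def run_inference(command):
    # Sketch only: passing the argument list directly (no shell=True)
    # avoids quoting problems in model paths and output directories.
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        logging.error(f"infer.py failed: {result.stderr}")
        return False
    return True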
@@ -311,7 +348,11 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
     # Handle the results
     last_mp3 = get_last_mp3_file(output_dir)
     if last_mp3:
+        duration = get_audio_duration(last_mp3)
         logging.info(f"Generated audio file: {last_mp3}")
+        if duration:
+            logging.info(f"Audio duration: {duration:.2f} seconds")
+            logging.info(f"Expected duration: {estimated_duration} seconds")
         return last_mp3
     else:
         logging.warning("No output audio file generated")
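Since the measured and expected durations are now logged side by side, a purely hypothetical follow-up (not in this commit) would be to warn on large mismatches:

# Hypothetical extension: flag generations that deviate from the
# lyric-based estimate by more than 20%.
if duration and estimated_duration:
    if abs(duration - estimated_duration) / estimated_duration > 0.2:
        logging.warning(
            f"Duration mismatch: got {duration:.1f}s, expected {estimated_duration}s"
        )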
@@ -334,7 +375,16 @@ def main():
     with gr.Blocks() as demo:
         with gr.Column():
             gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation (Multi-Language Support)")
-
+            gr.HTML("""
+                <div style="display:flex;column-gap:4px;">
+                    <a href="https://github.com/multimodal-art-projection/YuE">
+                        <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+                    </a>
+                    <a href="https://map-yue.github.io">
+                        <img src='https://img.shields.io/badge/Project-Page-green'>
+                    </a>
+                </div>
+            """)
 
         with gr.Row():
             with gr.Column():
@@ -365,6 +415,9 @@
                 value=4000,
                 interactive=False
             )
+            with gr.Row():
+                duration_info = gr.Label(label="Estimated Duration")
+                sections_info = gr.Label(label="Section Information")
             submit_btn = gr.Button("Generate Music", variant="primary")
             music_out = gr.Audio(label="Generated Audio")
 
@@ -385,7 +438,6 @@ Don't let this moment fade, hold me close tonight
 With you here beside me, everything's alright
 Can't imagine life alone, don't want to let you go
 Stay with me forever, let our love just flow
-
 """
 ],
 # Korean example
@@ -402,7 +454,6 @@ Stay with me forever, let our love just flow
 두려움은 없어 너와 함께라면
 영원히 계속될 우리의 노래
 이 순간을 기억해 forever
-
 """
 ]
 ],
@@ -412,7 +463,24 @@
     # System initialization
     initialize_system()
 
+    def update_info(lyrics):
+        if not lyrics:
+            return "No lyrics entered", "No sections detected"
+        params = calculate_generation_params(lyrics)
+        duration = params.get('estimated_duration', 0)
+        sections = params['sections']
+        return (
+            f"{duration:.1f} seconds",
+            f"Verses: {sections['verse']}, Chorus: {sections['chorus']}, Bridge: {sections['bridge']}"
+        )
+
     # Event handlers
+    lyrics_txt.change(
+        fn=update_info,
+        inputs=[lyrics_txt],
+        outputs=[duration_info, sections_info]
+    )
+
     submit_btn.click(
         fn=infer,
         inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
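Because update_info is a plain function, it can be exercised outside Gradio (assuming the module's functions are importable and that analyze_lyrics counts the lines under bracketed section tags):

sample_lyrics = """[verse]
Standing in the light
Waiting for the night
[chorus]
Hold me close tonight
"""
# With the 3 s/line heuristic above: 2 verse lines (6 s) + 1 chorus line
# (4.5 s) -> roughly ("10.5 seconds", "Verses: 1, Chorus: 1, Bridge: 0").
print(update_info(sample_lyrics))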