Update app.py
Browse files
app.py
CHANGED
@@ -62,40 +62,43 @@ def analyze_lyrics(lyrics):
|
|
62 |
def calculate_generation_params(lyrics):
|
63 |
sections, total_sections, total_lines, section_lines = analyze_lyrics(lyrics)
|
64 |
|
65 |
-
# 기본
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
68 |
|
69 |
-
# 각 섹션별 시간 계산
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
target_duration += lines * seconds_per_line
|
77 |
|
78 |
-
# 토큰
|
79 |
tokens_per_second = 50
|
80 |
-
total_tokens = int(
|
81 |
|
82 |
# 섹션 기반 세그먼트 수 계산
|
83 |
-
if
|
84 |
num_segments = 4
|
85 |
-
elif
|
86 |
num_segments = 3
|
87 |
-
else:
|
88 |
num_segments = 2
|
89 |
-
|
90 |
-
# 토큰 수 제한
|
91 |
-
max_tokens = min(32000, max(
|
92 |
|
93 |
return {
|
94 |
'max_tokens': max_tokens,
|
95 |
'num_segments': num_segments,
|
96 |
'sections': sections,
|
97 |
'section_lines': section_lines,
|
98 |
-
'estimated_duration':
|
|
|
99 |
}
|
100 |
|
101 |
def get_audio_duration(file_path):
|
@@ -122,30 +125,41 @@ def optimize_model_selection(lyrics, genre):
|
|
122 |
model_path = detect_and_select_model(lyrics)
|
123 |
params = calculate_generation_params(lyrics)
|
124 |
|
|
|
|
|
|
|
125 |
model_config = {
|
126 |
"m-a-p/YuE-s1-7B-anneal-en-cot": {
|
127 |
"max_tokens": params['max_tokens'],
|
128 |
"temperature": 0.8,
|
129 |
"batch_size": 8,
|
130 |
"num_segments": params['num_segments'],
|
131 |
-
"
|
|
|
132 |
},
|
133 |
"m-a-p/YuE-s1-7B-anneal-jp-kr-cot": {
|
134 |
"max_tokens": params['max_tokens'],
|
135 |
"temperature": 0.7,
|
136 |
"batch_size": 8,
|
137 |
"num_segments": params['num_segments'],
|
138 |
-
"
|
|
|
139 |
},
|
140 |
"m-a-p/YuE-s1-7B-anneal-zh-cot": {
|
141 |
"max_tokens": params['max_tokens'],
|
142 |
"temperature": 0.7,
|
143 |
"batch_size": 8,
|
144 |
"num_segments": params['num_segments'],
|
145 |
-
"
|
|
|
146 |
}
|
147 |
}
|
148 |
|
|
|
|
|
|
|
|
|
|
|
149 |
return model_path, model_config[model_path], params
|
150 |
|
151 |
# GPU 설정 최적화
|
@@ -266,13 +280,16 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
266 |
has_chorus = params['sections']['chorus'] > 0
|
267 |
estimated_duration = params.get('estimated_duration', 60)
|
268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
logging.info(f"Estimated duration: {estimated_duration} seconds")
|
270 |
logging.info(f"Has chorus sections: {has_chorus}")
|
271 |
-
|
272 |
-
# 실제 사용할 파라미터
|
273 |
-
actual_num_segments = config['num_segments']
|
274 |
-
actual_max_tokens = config['max_tokens']
|
275 |
-
|
276 |
logging.info(f"Using segments: {actual_num_segments}, tokens: {actual_max_tokens}")
|
277 |
|
278 |
# 임시 파일 생성
|
@@ -283,7 +300,7 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
283 |
os.makedirs(output_dir, exist_ok=True)
|
284 |
empty_output_folder(output_dir)
|
285 |
|
286 |
-
# 기본 명령어 구성
|
287 |
command = [
|
288 |
"python", "infer.py",
|
289 |
"--stage1_model", model_path,
|
@@ -347,6 +364,10 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
|
|
347 |
if duration:
|
348 |
logging.info(f"Audio duration: {duration:.2f} seconds")
|
349 |
logging.info(f"Expected duration: {estimated_duration} seconds")
|
|
|
|
|
|
|
|
|
350 |
except Exception as e:
|
351 |
logging.warning(f"Failed to get audio duration: {e}")
|
352 |
return last_mp3
|
@@ -370,17 +391,8 @@ def main():
|
|
370 |
# Gradio 인터페이스
|
371 |
with gr.Blocks() as demo:
|
372 |
with gr.Column():
|
373 |
-
gr.Markdown("#
|
374 |
-
|
375 |
-
<div style="display:flex;column-gap:4px;">
|
376 |
-
<a href="https://github.com/multimodal-art-projection/YuE">
|
377 |
-
<img src='https://img.shields.io/badge/GitHub-Repo-blue'>
|
378 |
-
</a>
|
379 |
-
<a href="https://map-yue.github.io">
|
380 |
-
<img src='https://img.shields.io/badge/Project-Page-green'>
|
381 |
-
</a>
|
382 |
-
</div>
|
383 |
-
""")
|
384 |
|
385 |
with gr.Row():
|
386 |
with gr.Column():
|
@@ -458,18 +470,20 @@ Stay with me forever, let our love just flow
|
|
458 |
|
459 |
# 시스템 초기화
|
460 |
initialize_system()
|
461 |
-
|
462 |
def update_info(lyrics):
|
463 |
if not lyrics:
|
464 |
return "No lyrics entered", "No sections detected"
|
465 |
params = calculate_generation_params(lyrics)
|
466 |
-
duration = params
|
467 |
sections = params['sections']
|
468 |
return (
|
469 |
-
f"{duration:.1f} seconds",
|
470 |
-
f"Verses: {sections['verse']}, Chorus: {sections['chorus']}
|
471 |
)
|
|
|
472 |
|
|
|
473 |
# 이벤트 핸들러
|
474 |
lyrics_txt.change(
|
475 |
fn=update_info,
|
|
|
62 |
def calculate_generation_params(lyrics):
    """Derive generation parameters from the structure of the lyrics.

    The lyrics are analysed with ``analyze_lyrics``; the per-section line
    counts are turned into an estimated song duration, which in turn drives
    the token budget and the number of segments.

    Args:
        lyrics: Lyrics text, passed unchanged to ``analyze_lyrics``.

    Returns:
        dict with the keys:
            'max_tokens': token budget, clamped to the range 6000..32000.
            'num_segments': 2, 3 or 4 depending on the estimated duration.
            'sections': first value returned by ``analyze_lyrics``.
            'section_lines': mapping of section type to line count, as
                returned by ``analyze_lyrics``.
            'estimated_duration': estimated duration in seconds (>= 60).
            'tokens_per_segment': ``max_tokens // num_segments``.
    """
    # Only the section summary and the per-section line counts are used here.
    sections, _total_sections, _total_lines, section_lines = analyze_lyrics(lyrics)

    # Seconds budgeted per lyric line; chorus lines get the longest allotment.
    time_per_line = {
        'verse': 4,
        'chorus': 6,
        'bridge': 5,
    }
    # Section types missing from the table are budgeted like a verse instead
    # of raising KeyError.
    # NOTE(review): confirm which section types analyze_lyrics can report.
    default_time_per_line = time_per_line['verse']

    # Estimated duration: sum of (line count * seconds per line) per section.
    total_duration = sum(
        lines * time_per_line.get(section_type, default_time_per_line)
        for section_type, lines in section_lines.items()
    )

    # Guarantee a minimum duration of 60 seconds.
    total_duration = max(60, total_duration)

    # Token budget, assuming roughly 50 tokens per second of audio.
    tokens_per_second = 50
    total_tokens = int(total_duration * tokens_per_second)

    # Segment count grows with the estimated duration.
    if total_duration > 180:  # longer than 3 minutes
        num_segments = 4
    elif total_duration > 120:  # longer than 2 minutes
        num_segments = 3
    else:  # 2 minutes or less
        num_segments = 2

    # Clamp the budget: at least 6000 tokens, at most 32000.
    max_tokens = min(32000, max(6000, total_tokens))

    return {
        'max_tokens': max_tokens,
        'num_segments': num_segments,
        'sections': sections,
        'section_lines': section_lines,
        'estimated_duration': total_duration,
        'tokens_per_segment': max_tokens // num_segments,
    }
|
103 |
|
104 |
def get_audio_duration(file_path):
|
|
|
125 |
model_path = detect_and_select_model(lyrics)
|
126 |
params = calculate_generation_params(lyrics)
|
127 |
|
128 |
+
# 코러스 존재 여부에 따른 설정 조정
|
129 |
+
has_chorus = params['sections']['chorus'] > 0
|
130 |
+
|
131 |
model_config = {
|
132 |
"m-a-p/YuE-s1-7B-anneal-en-cot": {
|
133 |
"max_tokens": params['max_tokens'],
|
134 |
"temperature": 0.8,
|
135 |
"batch_size": 8,
|
136 |
"num_segments": params['num_segments'],
|
137 |
+
"tokens_per_segment": params['tokens_per_segment'],
|
138 |
+
"estimated_duration": params['estimated_duration']
|
139 |
},
|
140 |
"m-a-p/YuE-s1-7B-anneal-jp-kr-cot": {
|
141 |
"max_tokens": params['max_tokens'],
|
142 |
"temperature": 0.7,
|
143 |
"batch_size": 8,
|
144 |
"num_segments": params['num_segments'],
|
145 |
+
"tokens_per_segment": params['tokens_per_segment'],
|
146 |
+
"estimated_duration": params['estimated_duration']
|
147 |
},
|
148 |
"m-a-p/YuE-s1-7B-anneal-zh-cot": {
|
149 |
"max_tokens": params['max_tokens'],
|
150 |
"temperature": 0.7,
|
151 |
"batch_size": 8,
|
152 |
"num_segments": params['num_segments'],
|
153 |
+
"tokens_per_segment": params['tokens_per_segment'],
|
154 |
+
"estimated_duration": params['estimated_duration']
|
155 |
}
|
156 |
}
|
157 |
|
158 |
+
# 코러스가 있는 경우 토큰 수 증가
|
159 |
+
if has_chorus:
|
160 |
+
for config in model_config.values():
|
161 |
+
config['max_tokens'] = int(config['max_tokens'] * 1.5) # 50% 더 많은 토큰 할당
|
162 |
+
|
163 |
return model_path, model_config[model_path], params
|
164 |
|
165 |
# GPU 설정 최적화
|
|
|
280 |
has_chorus = params['sections']['chorus'] > 0
|
281 |
estimated_duration = params.get('estimated_duration', 60)
|
282 |
|
283 |
+
# 토큰 수 조정 (코러스가 있는 경우 더 많은 토큰 할당)
|
284 |
+
if has_chorus:
|
285 |
+
actual_max_tokens = int(config['max_tokens'] * 1.5) # 50% 더 많은 토큰
|
286 |
+
actual_num_segments = max(3, config['num_segments']) # 최소 3개 세그먼트 보장
|
287 |
+
else:
|
288 |
+
actual_max_tokens = config['max_tokens']
|
289 |
+
actual_num_segments = config['num_segments']
|
290 |
+
|
291 |
logging.info(f"Estimated duration: {estimated_duration} seconds")
|
292 |
logging.info(f"Has chorus sections: {has_chorus}")
|
|
|
|
|
|
|
|
|
|
|
293 |
logging.info(f"Using segments: {actual_num_segments}, tokens: {actual_max_tokens}")
|
294 |
|
295 |
# 임시 파일 생성
|
|
|
300 |
os.makedirs(output_dir, exist_ok=True)
|
301 |
empty_output_folder(output_dir)
|
302 |
|
303 |
+
# 기본 명령어 구성
|
304 |
command = [
|
305 |
"python", "infer.py",
|
306 |
"--stage1_model", model_path,
|
|
|
364 |
if duration:
|
365 |
logging.info(f"Audio duration: {duration:.2f} seconds")
|
366 |
logging.info(f"Expected duration: {estimated_duration} seconds")
|
367 |
+
|
368 |
+
# 생성된 음악이 너무 짧은 경우 경고
|
369 |
+
if duration < estimated_duration * 0.8: # 예상 길이의 80% 미만인 경우
|
370 |
+
logging.warning(f"Generated audio is shorter than expected: {duration:.2f}s < {estimated_duration:.2f}s")
|
371 |
except Exception as e:
|
372 |
logging.warning(f"Failed to get audio duration: {e}")
|
373 |
return last_mp3
|
|
|
391 |
# Gradio 인터페이스
|
392 |
with gr.Blocks() as demo:
|
393 |
with gr.Column():
|
394 |
+
gr.Markdown("# Open SUNI: Full-Song Generation (Multi-Language Support)")
|
395 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
396 |
|
397 |
with gr.Row():
|
398 |
with gr.Column():
|
|
|
470 |
|
471 |
# 시스템 초기화
|
472 |
initialize_system()
|
473 |
+
|
474 |
def update_info(lyrics):
    """Build the two status strings displayed for the current lyrics.

    Returns a 2-tuple of strings: the estimated-duration text and the
    section-count text. Falsy lyrics yield fixed placeholder messages
    without running any analysis.
    """
    if lyrics:
        generation = calculate_generation_params(lyrics)
        seconds = generation['estimated_duration']
        counts = generation['sections']
        duration_text = f"Estimated duration: {seconds:.1f} seconds"
        sections_text = f"Verses: {counts['verse']}, Chorus: {counts['chorus']} (Expected full length including chorus)"
        return (duration_text, sections_text)

    # Nothing typed yet: show placeholders.
    return "No lyrics entered", "No sections detected"
|
484 |
+
|
485 |
|
486 |
+
|
487 |
# 이벤트 핸들러
|
488 |
lyrics_txt.change(
|
489 |
fn=update_info,
|