ginipick committed on
Commit
dbad390
·
verified ·
1 Parent(s): bddc80f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -44
app.py CHANGED
@@ -62,40 +62,43 @@ def analyze_lyrics(lyrics):
62
  def calculate_generation_params(lyrics):
63
  sections, total_sections, total_lines, section_lines = analyze_lyrics(lyrics)
64
 
65
- # ๊ธฐ๋ณธ ํ† ํฐ ์ˆ˜ ๊ณ„์‚ฐ (1ํ† ํฐ โ‰ˆ 0.02์ดˆ ๊ธฐ์ค€)
66
- seconds_per_line = 3 # ํ•œ ์ค„๋‹น ํ‰๊ท  3์ดˆ
67
- target_duration = 0 # ๋ชฉํ‘œ ๊ธธ์ด (์ดˆ)
 
 
 
68
 
69
- # ๊ฐ ์„น์…˜๋ณ„ ์‹œ๊ฐ„ ๊ณ„์‚ฐ
70
- for section_type in ['verse', 'chorus', 'bridge']:
71
- lines = section_lines[section_type]
72
- if section_type == 'chorus':
73
- # ์ฝ”๋Ÿฌ์Šค๋Š” ๋” ๊ธด ์‹œ๊ฐ„ ํ• ๋‹น
74
- target_duration += lines * seconds_per_line * 1.5
75
- else:
76
- target_duration += lines * seconds_per_line
77
 
78
- # ํ† ํฐ ์ˆ˜ ๊ณ„์‚ฐ (1์ดˆ๋‹น ์•ฝ 50ํ† ํฐ)
79
  tokens_per_second = 50
80
- total_tokens = int(target_duration * tokens_per_second)
81
 
82
  # ์„น์…˜ ๊ธฐ๋ฐ˜ ์„ธ๊ทธ๋จผํŠธ ์ˆ˜ ๊ณ„์‚ฐ
83
- if target_duration > 180: # 3๋ถ„ ์ด์ƒ
84
  num_segments = 4
85
- elif target_duration > 120: # 2๋ถ„ ์ด์ƒ
86
  num_segments = 3
87
- else:
88
  num_segments = 2
89
-
90
- # ํ† ํฐ ์ˆ˜ ์ œํ•œ
91
- max_tokens = min(32000, max(3000, total_tokens))
92
 
93
  return {
94
  'max_tokens': max_tokens,
95
  'num_segments': num_segments,
96
  'sections': sections,
97
  'section_lines': section_lines,
98
- 'estimated_duration': target_duration
 
99
  }
100
 
101
  def get_audio_duration(file_path):
@@ -122,30 +125,41 @@ def optimize_model_selection(lyrics, genre):
122
  model_path = detect_and_select_model(lyrics)
123
  params = calculate_generation_params(lyrics)
124
 
 
 
 
125
  model_config = {
126
  "m-a-p/YuE-s1-7B-anneal-en-cot": {
127
  "max_tokens": params['max_tokens'],
128
  "temperature": 0.8,
129
  "batch_size": 8,
130
  "num_segments": params['num_segments'],
131
- "chorus_strength": 1.2 if params['sections']['chorus'] > 0 else 1.0
 
132
  },
133
  "m-a-p/YuE-s1-7B-anneal-jp-kr-cot": {
134
  "max_tokens": params['max_tokens'],
135
  "temperature": 0.7,
136
  "batch_size": 8,
137
  "num_segments": params['num_segments'],
138
- "chorus_strength": 1.2 if params['sections']['chorus'] > 0 else 1.0
 
139
  },
140
  "m-a-p/YuE-s1-7B-anneal-zh-cot": {
141
  "max_tokens": params['max_tokens'],
142
  "temperature": 0.7,
143
  "batch_size": 8,
144
  "num_segments": params['num_segments'],
145
- "chorus_strength": 1.2 if params['sections']['chorus'] > 0 else 1.0
 
146
  }
147
  }
148
 
 
 
 
 
 
149
  return model_path, model_config[model_path], params
150
 
151
  # GPU 설정 최적화
@@ -266,13 +280,16 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
266
  has_chorus = params['sections']['chorus'] > 0
267
  estimated_duration = params.get('estimated_duration', 60)
268
 
 
 
 
 
 
 
 
 
269
  logging.info(f"Estimated duration: {estimated_duration} seconds")
270
  logging.info(f"Has chorus sections: {has_chorus}")
271
-
272
- # ์‹ค์ œ ์‚ฌ์šฉํ•  ํŒŒ๋ผ๋ฏธํ„ฐ
273
- actual_num_segments = config['num_segments']
274
- actual_max_tokens = config['max_tokens']
275
-
276
  logging.info(f"Using segments: {actual_num_segments}, tokens: {actual_max_tokens}")
277
 
278
  # ์ž„์‹œ ํŒŒ์ผ ์ƒ์„ฑ
@@ -283,7 +300,7 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
283
  os.makedirs(output_dir, exist_ok=True)
284
  empty_output_folder(output_dir)
285
 
286
- # ๊ธฐ๋ณธ ๋ช…๋ น์–ด ๊ตฌ์„ฑ (์ง€์›๋˜๋Š” ๋งค๊ฐœ๋ณ€์ˆ˜๋งŒ ์‚ฌ์šฉ)
287
  command = [
288
  "python", "infer.py",
289
  "--stage1_model", model_path,
@@ -347,6 +364,10 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
347
  if duration:
348
  logging.info(f"Audio duration: {duration:.2f} seconds")
349
  logging.info(f"Expected duration: {estimated_duration} seconds")
 
 
 
 
350
  except Exception as e:
351
  logging.warning(f"Failed to get audio duration: {e}")
352
  return last_mp3
@@ -370,17 +391,8 @@ def main():
370
  # Gradio ์ธํ„ฐํŽ˜์ด์Šค
371
  with gr.Blocks() as demo:
372
  with gr.Column():
373
- gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation (Multi-Language Support)")
374
- gr.HTML("""
375
- <div style="display:flex;column-gap:4px;">
376
- <a href="https://github.com/multimodal-art-projection/YuE">
377
- <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
378
- </a>
379
- <a href="https://map-yue.github.io">
380
- <img src='https://img.shields.io/badge/Project-Page-green'>
381
- </a>
382
- </div>
383
- """)
384
 
385
  with gr.Row():
386
  with gr.Column():
@@ -458,18 +470,20 @@ Stay with me forever, let our love just flow
458
 
459
  # ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™”
460
  initialize_system()
461
-
462
  def update_info(lyrics):
463
  if not lyrics:
464
  return "No lyrics entered", "No sections detected"
465
  params = calculate_generation_params(lyrics)
466
- duration = params.get('estimated_duration', 0)
467
  sections = params['sections']
468
  return (
469
- f"{duration:.1f} seconds",
470
- f"Verses: {sections['verse']}, Chorus: {sections['chorus']}, Bridge: {sections['bridge']}"
471
  )
 
472
 
 
473
  # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
474
  lyrics_txt.change(
475
  fn=update_info,
 
62
  def calculate_generation_params(lyrics):
63
  sections, total_sections, total_lines, section_lines = analyze_lyrics(lyrics)
64
 
65
+ # ๊ธฐ๋ณธ ์‹œ๊ฐ„ ๊ณ„์‚ฐ (์ดˆ ๋‹จ์œ„)
66
+ time_per_line = {
67
+ 'verse': 4, # verse๋Š” ํ•œ ์ค„๋‹น 4์ดˆ
68
+ 'chorus': 6, # chorus๋Š” ํ•œ ์ค„๋‹น 6์ดˆ (๋” ๊ธด ์‹œ๊ฐ„ ํ• ๋‹น)
69
+ 'bridge': 5 # bridge๋Š” ํ•œ ์ค„๋‹น 5์ดˆ
70
+ }
71
 
72
+ # ๊ฐ ์„น์…˜๋ณ„ ์˜ˆ์ƒ ์‹œ๊ฐ„ ๊ณ„์‚ฐ
73
+ total_duration = 0
74
+ for section_type, lines in section_lines.items():
75
+ total_duration += lines * time_per_line[section_type]
76
+
77
+ # ์ตœ์†Œ ์ง€์† ์‹œ๊ฐ„ ๋ณด์žฅ (60์ดˆ)
78
+ total_duration = max(60, total_duration)
 
79
 
80
+ # ํ† ํฐ ๊ณ„์‚ฐ (1์ดˆ๋‹น ์•ฝ 50ํ† ํฐ์œผ๋กœ ๊ณ„์‚ฐ)
81
  tokens_per_second = 50
82
+ total_tokens = int(total_duration * tokens_per_second)
83
 
84
  # ์„น์…˜ ๊ธฐ๋ฐ˜ ์„ธ๊ทธ๋จผํŠธ ์ˆ˜ ๊ณ„์‚ฐ
85
+ if total_duration > 180: # 3๋ถ„ ์ด์ƒ
86
  num_segments = 4
87
+ elif total_duration > 120: # 2๋ถ„ ์ด์ƒ
88
  num_segments = 3
89
+ else: # 2๋ถ„ ๋ฏธ๋งŒ
90
  num_segments = 2
91
+
92
+ # ํ† ํฐ ์ˆ˜ ์ œํ•œ (์ตœ์†Œ 6000ํ† ํฐ ๋ณด์žฅ)
93
+ max_tokens = min(32000, max(6000, total_tokens))
94
 
95
  return {
96
  'max_tokens': max_tokens,
97
  'num_segments': num_segments,
98
  'sections': sections,
99
  'section_lines': section_lines,
100
+ 'estimated_duration': total_duration,
101
+ 'tokens_per_segment': max_tokens // num_segments
102
  }
103
 
104
  def get_audio_duration(file_path):
 
125
  model_path = detect_and_select_model(lyrics)
126
  params = calculate_generation_params(lyrics)
127
 
128
+ # ์ฝ”๋Ÿฌ์Šค ์กด์žฌ ์—ฌ๋ถ€์— ๋”ฐ๋ฅธ ์„ค์ • ์กฐ์ •
129
+ has_chorus = params['sections']['chorus'] > 0
130
+
131
  model_config = {
132
  "m-a-p/YuE-s1-7B-anneal-en-cot": {
133
  "max_tokens": params['max_tokens'],
134
  "temperature": 0.8,
135
  "batch_size": 8,
136
  "num_segments": params['num_segments'],
137
+ "tokens_per_segment": params['tokens_per_segment'],
138
+ "estimated_duration": params['estimated_duration']
139
  },
140
  "m-a-p/YuE-s1-7B-anneal-jp-kr-cot": {
141
  "max_tokens": params['max_tokens'],
142
  "temperature": 0.7,
143
  "batch_size": 8,
144
  "num_segments": params['num_segments'],
145
+ "tokens_per_segment": params['tokens_per_segment'],
146
+ "estimated_duration": params['estimated_duration']
147
  },
148
  "m-a-p/YuE-s1-7B-anneal-zh-cot": {
149
  "max_tokens": params['max_tokens'],
150
  "temperature": 0.7,
151
  "batch_size": 8,
152
  "num_segments": params['num_segments'],
153
+ "tokens_per_segment": params['tokens_per_segment'],
154
+ "estimated_duration": params['estimated_duration']
155
  }
156
  }
157
 
158
+ # ์ฝ”๋Ÿฌ์Šค๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ํ† ํฐ ์ˆ˜ ์ฆ๊ฐ€
159
+ if has_chorus:
160
+ for config in model_config.values():
161
+ config['max_tokens'] = int(config['max_tokens'] * 1.5) # 50% ๋” ๋งŽ์€ ํ† ํฐ ํ• ๋‹น
162
+
163
  return model_path, model_config[model_path], params
164
 
165
  # GPU 설정 최적화
 
280
  has_chorus = params['sections']['chorus'] > 0
281
  estimated_duration = params.get('estimated_duration', 60)
282
 
283
+ # ํ† ํฐ ์ˆ˜ ์กฐ์ • (์ฝ”๋Ÿฌ์Šค๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ๋” ๋งŽ์€ ํ† ํฐ ํ• ๋‹น)
284
+ if has_chorus:
285
+ actual_max_tokens = int(config['max_tokens'] * 1.5) # 50% ๋” ๋งŽ์€ ํ† ํฐ
286
+ actual_num_segments = max(3, config['num_segments']) # ์ตœ์†Œ 3๊ฐœ ์„ธ๊ทธ๋จผํŠธ ๋ณด์žฅ
287
+ else:
288
+ actual_max_tokens = config['max_tokens']
289
+ actual_num_segments = config['num_segments']
290
+
291
  logging.info(f"Estimated duration: {estimated_duration} seconds")
292
  logging.info(f"Has chorus sections: {has_chorus}")
 
 
 
 
 
293
  logging.info(f"Using segments: {actual_num_segments}, tokens: {actual_max_tokens}")
294
 
295
  # ์ž„์‹œ ํŒŒ์ผ ์ƒ์„ฑ
 
300
  os.makedirs(output_dir, exist_ok=True)
301
  empty_output_folder(output_dir)
302
 
303
+ # ๊ธฐ๋ณธ ๋ช…๋ น์–ด ๊ตฌ์„ฑ
304
  command = [
305
  "python", "infer.py",
306
  "--stage1_model", model_path,
 
364
  if duration:
365
  logging.info(f"Audio duration: {duration:.2f} seconds")
366
  logging.info(f"Expected duration: {estimated_duration} seconds")
367
+
368
+ # ์ƒ์„ฑ๋œ ์Œ์•…์ด ๋„ˆ๋ฌด ์งง์€ ๊ฒฝ์šฐ ๊ฒฝ๊ณ 
369
+ if duration < estimated_duration * 0.8: # ์˜ˆ์ƒ ๊ธธ์ด์˜ 80% ๋ฏธ๋งŒ์ธ ๊ฒฝ์šฐ
370
+ logging.warning(f"Generated audio is shorter than expected: {duration:.2f}s < {estimated_duration:.2f}s")
371
  except Exception as e:
372
  logging.warning(f"Failed to get audio duration: {e}")
373
  return last_mp3
 
391
  # Gradio ์ธํ„ฐํŽ˜์ด์Šค
392
  with gr.Blocks() as demo:
393
  with gr.Column():
394
+ gr.Markdown("# Open SUNI: Full-Song Generation (Multi-Language Support)")
395
+
 
 
 
 
 
 
 
 
 
396
 
397
  with gr.Row():
398
  with gr.Column():
 
470
 
471
  # ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™”
472
  initialize_system()
473
+
474
  def update_info(lyrics):
475
  if not lyrics:
476
  return "No lyrics entered", "No sections detected"
477
  params = calculate_generation_params(lyrics)
478
+ duration = params['estimated_duration']
479
  sections = params['sections']
480
  return (
481
+ f"Estimated duration: {duration:.1f} seconds",
482
+ f"Verses: {sections['verse']}, Chorus: {sections['chorus']} (Expected full length including chorus)"
483
  )
484
+
485
 
486
+
487
  # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
488
  lyrics_txt.change(
489
  fn=update_info,