ginipick committed on
Commit
a88519d
·
verified ·
1 Parent(s): fbb8741

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -377
app.py CHANGED
@@ -7,10 +7,8 @@ import torch
7
  import logging
8
  import numpy as np
9
  import re
10
- import sys # sys 모듈 추가
11
  from concurrent.futures import ThreadPoolExecutor
12
  from functools import lru_cache
13
- from datetime import datetime
14
 
15
  # 로깅 설정
16
  logging.basicConfig(
@@ -22,28 +20,24 @@ logging.basicConfig(
22
  ]
23
  )
24
 
 
 
 
 
25
  def optimize_gpu_settings():
26
  if torch.cuda.is_available():
27
- # GPU 메모리 관리 최적화
28
  torch.backends.cuda.matmul.allow_tf32 = True
29
  torch.backends.cudnn.benchmark = True
30
  torch.backends.cudnn.enabled = True
31
  torch.backends.cudnn.deterministic = False
32
-
33
- # L40S에 최적화된 메모리 설정
34
  torch.cuda.empty_cache()
35
  torch.cuda.set_device(0)
36
-
37
- # CUDA 스트림 최적화
38
  torch.cuda.Stream(0)
39
-
40
- # 메모리 할당 최적화
41
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
42
 
43
  logging.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
44
  logging.info(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
45
 
46
- # L40S 특화 설정
47
  if 'L40S' in torch.cuda.get_device_name(0):
48
  torch.cuda.set_per_process_memory_fraction(0.95)
49
 
@@ -65,7 +59,6 @@ def analyze_lyrics(lyrics, repeat_chorus=2):
65
  }
66
  last_section = None
67
 
68
- # 마지막 섹션 태그 찾기
69
  for i, line in enumerate(lines):
70
  if '[verse]' in line.lower() or '[chorus]' in line.lower() or '[bridge]' in line.lower():
71
  last_section = i
@@ -73,9 +66,8 @@ def analyze_lyrics(lyrics, repeat_chorus=2):
73
  for i, line in enumerate(lines):
74
  lower_line = line.lower()
75
 
76
- # 섹션 태그 처리
77
  if '[verse]' in lower_line:
78
- if current_section: # 이전 섹션의 라인들 저장
79
  section_lines[current_section].extend(lines[last_section_start:i])
80
  current_section = 'verse'
81
  sections['verse'] += 1
@@ -96,58 +88,50 @@ def analyze_lyrics(lyrics, repeat_chorus=2):
96
  last_section_start = i + 1
97
  continue
98
 
99
- # 마지막 섹션의 라인들 추가
100
- if current_section and last_section_start < len(lines):
101
  section_lines[current_section].extend(lines[last_section_start:])
102
 
103
- # 코러스 반복 처리
104
  if sections['chorus'] > 0 and repeat_chorus > 1:
105
  original_chorus = section_lines['chorus'][:]
106
  for _ in range(repeat_chorus - 1):
107
  section_lines['chorus'].extend(original_chorus)
108
 
109
- # 섹션별 라인 수 확인 로깅
110
- logging.info(f"Section line counts - Verse: {len(section_lines['verse'])}, "
111
- f"Chorus: {len(section_lines['chorus'])}, "
112
- f"Bridge: {len(section_lines['bridge'])}")
 
113
 
114
  return sections, (sections['verse'] + sections['chorus'] + sections['bridge']), len(lines), section_lines
115
 
116
  def calculate_generation_params(lyrics):
117
  sections, total_sections, total_lines, section_lines = analyze_lyrics(lyrics)
118
 
119
- # 기본 시간 계산 (초 단위)
120
  time_per_line = {
121
- 'verse': 4, # verse๋Š” ํ•œ ์ค„๋‹น 4์ดˆ
122
- 'chorus': 6, # chorus๋Š” ํ•œ ์ค„๋‹น 6์ดˆ
123
- 'bridge': 5 # bridge๋Š” ํ•œ ์ค„๋‹น 5์ดˆ
124
  }
125
 
126
- # 각 섹션별 예상 시간 계산 (마지막 섹션 포함)
127
  section_durations = {}
128
  for section_type in ['verse', 'chorus', 'bridge']:
129
  lines_count = len(section_lines[section_type])
130
  section_durations[section_type] = lines_count * time_per_line[section_type]
131
 
132
- # 전체 시간 계산 (여유 시간 추가)
133
  total_duration = sum(duration for duration in section_durations.values())
134
- total_duration = max(60, int(total_duration * 1.2)) # 20% 여유 시간 추가
135
 
136
- # 토큰 계산 (마지막 섹션을 위한 추가 토큰)
137
  base_tokens = 3000
138
  tokens_per_line = 200
139
- extra_tokens = 1000 # 마지막 섹션을 위한 추가 토큰
140
-
141
  total_tokens = base_tokens + (total_lines * tokens_per_line) + extra_tokens
142
 
143
- # 세그먼트 수 계산 (마지막 섹션을 위한 추가 세그먼트)
144
  if sections['chorus'] > 0:
145
- num_segments = 4 # 코러스가 있는 경우 4개 세그먼트
146
  else:
147
- num_segments = 3 # 코러스가 없는 경우 3개 세그먼트
148
-
149
- # 토큰 수 제한 (더 큰 제한)
150
- max_tokens = min(12000, total_tokens) # 최대 토큰 수 증가
151
 
152
  return {
153
  'max_tokens': max_tokens,
@@ -201,101 +185,35 @@ def install_flash_attn():
201
  logging.warning(f"Failed to install flash-attn: {e}")
202
  return False
203
 
204
-
205
- # 전역 변수로 경로 설정
206
- APP_DIR = os.path.abspath(os.path.dirname(__file__))
207
- INFERENCE_DIR = os.path.join(APP_DIR, "inference")
208
- INFER_SCRIPT = os.path.join(INFERENCE_DIR, "infer.py")
209
-
210
  def initialize_system():
211
  optimize_gpu_settings()
212
 
213
- try:
214
- # 절대 경로 설정
215
- app_dir = os.path.abspath(os.path.dirname(__file__))
216
- inference_dir = os.path.join(app_dir, "inference")
217
-
218
- # 기본 디렉토리 구조 생성
219
- os.makedirs(inference_dir, exist_ok=True)
220
- os.makedirs(os.path.join(inference_dir, "models"), exist_ok=True)
221
- os.makedirs(os.path.join(inference_dir, "xcodec_mini_infer"), exist_ok=True)
222
-
223
- # 작업 디렉토리 변경
224
- os.chdir(inference_dir)
225
- logging.info(f"Working directory changed to: {os.getcwd()}")
226
-
227
- from huggingface_hub import snapshot_download, hf_hub_download
228
 
229
- # models 모듈 파일 다운로드 및 설정
230
- models_dir = os.path.join(inference_dir, "models")
231
- os.makedirs(models_dir, exist_ok=True)
232
 
233
- # __init__.py 생성
234
- with open(os.path.join(models_dir, "__init__.py"), "w") as f:
235
- f.write("")
236
-
237
- # soundstream_hubert_new.py 다운로드
238
- try:
239
- soundstream_file = hf_hub_download(
240
- repo_id="m-a-p/xcodec_mini_infer",
241
- filename="models/soundstream_hubert_new.py",
242
- cache_dir=os.path.join(inference_dir, "cache"),
243
- force_download=True
244
- )
245
- shutil.copy2(soundstream_file, os.path.join(models_dir, "soundstream_hubert_new.py"))
246
- except Exception as e:
247
- logging.error(f"Failed to download soundstream_hubert_new.py: {e}")
248
- raise
249
-
250
- # xcodec_mini_infer 모델 다운로드
251
- snapshot_download(
252
  repo_id="m-a-p/xcodec_mini_infer",
253
- local_dir=os.path.join(inference_dir, "xcodec_mini_infer"),
254
- force_download=True
255
- )
256
-
257
- # YuE 모델들 다운로드
258
- models = [
259
- "m-a-p/YuE-s1-7B-anneal-jp-kr-cot",
260
- "m-a-p/YuE-s1-7B-anneal-en-cot",
261
- "m-a-p/YuE-s1-7B-anneal-zh-cot",
262
- "m-a-p/YuE-s2-1B-general"
263
- ]
264
-
265
- for model in models:
266
- model_name = model.split('/')[-1]
267
- snapshot_download(
268
- repo_id=model,
269
- local_dir=os.path.join(inference_dir, "models", model_name),
270
- force_download=True
271
- )
272
-
273
- # PYTHONPATH 설정
274
- if inference_dir not in sys.path:
275
- sys.path.insert(0, inference_dir)
276
 
277
- # 파일 존재 확인
278
- required_files = [
279
- os.path.join(models_dir, "__init__.py"),
280
- os.path.join(models_dir, "soundstream_hubert_new.py"),
281
- os.path.join(inference_dir, "xcodec_mini_infer", "config.json"),
282
- os.path.join(inference_dir, "xcodec_mini_infer", "vocal_decoder.pth"),
283
- os.path.join(inference_dir, "xcodec_mini_infer", "inst_decoder.pth")
284
- ]
285
-
286
- for file_path in required_files:
287
- if not os.path.exists(file_path):
288
- raise FileNotFoundError(f"Required file not found: {file_path}")
289
- else:
290
- file_size = os.path.getsize(file_path)
291
- logging.info(f"Verified {os.path.basename(file_path)}: {file_size} bytes")
292
-
293
- logging.info("System initialization completed successfully")
294
-
295
- except Exception as e:
296
- logging.error(f"Initialization error: {e}")
297
- raise
298
 
 
 
 
 
 
 
299
 
300
  @lru_cache(maxsize=100)
301
  def get_cached_file_path(content_hash, prefix):
@@ -338,33 +256,62 @@ def get_audio_duration(file_path):
338
  logging.error(f"Failed to get audio duration: {e}")
339
  return None
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
342
  genre_txt_path = None
343
  lyrics_txt_path = None
344
 
345
  try:
346
- if not os.path.exists(INFER_SCRIPT):
347
- raise FileNotFoundError(f"infer.py not found at: {INFER_SCRIPT}")
348
-
349
  model_path, config, params = optimize_model_selection(lyrics_txt_content, genre_txt_content)
350
  logging.info(f"Selected model: {model_path}")
351
  logging.info(f"Lyrics analysis: {params}")
352
-
353
 
354
  has_chorus = params['sections']['chorus'] > 0
355
  estimated_duration = params.get('estimated_duration', 90)
356
 
357
-
358
  # 세그먼트 및 토큰 수 설정
359
  if has_chorus:
360
  actual_max_tokens = min(12000, int(config['max_tokens'] * 1.3)) # 30% 더 많은 토큰
361
- actual_num_segments = min(5, params['num_segments'] + 2) # 추가 세그먼트
362
  else:
363
  actual_max_tokens = min(10000, int(config['max_tokens'] * 1.2))
364
  actual_num_segments = min(4, params['num_segments'] + 1)
365
 
366
-
367
-
368
  logging.info(f"Estimated duration: {estimated_duration} seconds")
369
  logging.info(f"Has chorus sections: {has_chorus}")
370
  logging.info(f"Using segments: {actual_num_segments}, tokens: {actual_max_tokens}")
@@ -376,29 +323,21 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
376
  os.makedirs(output_dir, exist_ok=True)
377
  empty_output_folder(output_dir)
378
 
379
-
380
- python_executable = sys.executable or "python" # fallback to "python" if sys.executable is not available
381
-
382
-
383
-
384
-
385
  command = [
386
- python_executable,
387
- INFER_SCRIPT,
388
  "--stage1_model", model_path,
389
  "--stage2_model", "m-a-p/YuE-s2-1B-general",
390
- "--genre_txt", os.path.abspath(genre_txt_path),
391
- "--lyrics_txt", os.path.abspath(lyrics_txt_path),
392
  "--run_n_segments", str(actual_num_segments),
393
  "--stage2_batch_size", "16",
394
- "--output_dir", os.path.abspath("./output"),
395
  "--cuda_idx", "0",
396
  "--max_new_tokens", str(actual_max_tokens),
397
  "--disable_offload_model"
398
  ]
 
399
  env = os.environ.copy()
400
- current_dir = os.getcwd()
401
-
402
  if torch.cuda.is_available():
403
  env.update({
404
  "CUDA_VISIBLE_DEVICES": "0",
@@ -406,11 +345,10 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
406
  "PATH": f"/usr/local/cuda/bin:{env.get('PATH', '')}",
407
  "LD_LIBRARY_PATH": f"/usr/local/cuda/lib64:{env.get('LD_LIBRARY_PATH', '')}",
408
  "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
409
- "CUDA_LAUNCH_BLOCKING": "0",
410
- "PYTHONPATH": current_dir
411
  })
412
 
413
- # transformers 캐시 마이그레이션 처리
414
  try:
415
  from transformers.utils import move_cache
416
  move_cache()
@@ -444,7 +382,9 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
444
  logging.info(f"Expected duration: {estimated_duration} seconds")
445
 
446
  if duration < estimated_duration * 0.8:
447
- logging.warning(f"Generated audio is shorter than expected: {duration:.2f}s < {estimated_duration:.2f}s")
 
 
448
  except Exception as e:
449
  logging.warning(f"Failed to get audio duration: {e}")
450
  return last_mp3
@@ -464,190 +404,117 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
464
  except Exception as e:
465
  logging.warning(f"Failed to remove temporary file {path}: {e}")
466
 
467
- def optimize_model_selection(lyrics, genre):
468
- model_path = detect_and_select_model(lyrics)
 
 
 
 
 
 
469
  params = calculate_generation_params(lyrics)
470
-
471
- has_chorus = params['sections']['chorus'] > 0
472
- tokens_per_segment = params['max_tokens'] // params['num_segments']
473
-
474
- model_config = {
475
- "m-a-p/YuE-s1-7B-anneal-en-cot": {
476
- "max_tokens": params['max_tokens'],
477
- "temperature": 0.8,
478
- "batch_size": 16,
479
- "num_segments": params['num_segments'],
480
- "estimated_duration": params['estimated_duration']
481
- },
482
- "m-a-p/YuE-s1-7B-anneal-jp-kr-cot": {
483
- "max_tokens": params['max_tokens'],
484
- "temperature": 0.7,
485
- "batch_size": 16,
486
- "num_segments": params['num_segments'],
487
- "estimated_duration": params['estimated_duration']
488
- },
489
- "m-a-p/YuE-s1-7B-anneal-zh-cot": {
490
- "max_tokens": params['max_tokens'],
491
- "temperature": 0.7,
492
- "batch_size": 16,
493
- "num_segments": params['num_segments'],
494
- "estimated_duration": params['estimated_duration']
495
- }
496
- }
497
-
498
- if has_chorus:
499
- for config in model_config.values():
500
- config['max_tokens'] = int(config['max_tokens'] * 1.5)
501
-
502
- return model_path, model_config[model_path], params
503
 
504
  def main():
505
- # 테마 설정
506
- theme = gr.themes.Soft(
507
- primary_hue="indigo",
508
- secondary_hue="purple",
509
- neutral_hue="slate",
510
- font=["Arial", "sans-serif"]
511
- )
512
-
513
- # CSS 스타일 정의
514
- custom_css = """
515
- #main-container {
516
- max-width: 1200px;
517
- margin: auto;
518
- padding: 20px;
519
- }
520
- #header {
521
- text-align: center;
522
- margin-bottom: 30px;
523
- background: linear-gradient(135deg, #6366f1, #a855f7);
524
- padding: 20px;
525
- border-radius: 15px;
526
- color: white;
527
- }
528
- .input-section {
529
- background: #f8fafc;
530
- padding: 20px;
531
- border-radius: 15px;
532
- margin-bottom: 20px;
533
- box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);
534
- }
535
- .output-section {
536
- background: #f0f9ff;
537
- padding: 20px;
538
- border-radius: 15px;
539
- margin-bottom: 20px;
540
- box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);
541
- }
542
- #generate-btn {
543
- background: linear-gradient(135deg, #6366f1, #a855f7);
544
- border: none;
545
- padding: 15px 30px;
546
- border-radius: 10px;
547
- color: white;
548
- font-weight: bold;
549
- cursor: pointer;
550
- transition: all 0.3s ease;
551
- }
552
- #generate-btn:hover {
553
- transform: translateY(-2px);
554
- box-shadow: 0 4px 12px rgba(0,0,0,0.15);
555
- }
556
- .info-box {
557
- background: #fff;
558
- padding: 15px;
559
- border-radius: 10px;
560
- border: 1px solid #e2e8f0;
561
- margin: 10px 0;
562
- }
563
- .status-section {
564
- background: #fff;
565
- padding: 15px;
566
- border-radius: 10px;
567
- margin-top: 15px;
568
- border: 1px solid #e2e8f0;
569
- }
570
- """
571
-
572
- with gr.Blocks(theme=theme, css=custom_css) as demo:
573
- with gr.Column(elem_id="main-container"):
574
- # 헤더 섹션
575
- with gr.Row(elem_id="header"):
576
- gr.Markdown(
577
- """
578
- # 🎵 AI Song Creator 'Open SUNO'
579
- ### Transform Your Lyrics into Complete Songs with Music
580
- Create professional songs from your lyrics in multiple languages
581
- """
582
  )
583
 
584
- # 메인 컨텐츠
585
- with gr.Row():
586
- # 입력 섹션
587
- with gr.Column(scale=1, elem_classes="input-section"):
588
- gr.Markdown("### ๐Ÿ“ Input Your Song Details")
589
- genre_txt = gr.Textbox(
590
- label="๐ŸŽธ Music Genre & Style",
591
- placeholder="e.g., K-pop bright energetic synth dance electronic...",
592
- elem_id="genre-input"
 
 
593
  )
594
- lyrics_txt = gr.Textbox(
595
- label="๐Ÿ“ Lyrics",
596
- placeholder="Enter lyrics with section tags: [verse], [chorus], [bridge]...",
597
- lines=10,
598
- elem_id="lyrics-input"
 
 
599
  )
600
-
601
- # 정보 표시 섹션
602
- with gr.Row():
603
- with gr.Column(scale=1):
604
- duration_info = gr.Label(
605
- label="โฑ๏ธ Estimated Duration",
606
- elem_classes="info-box"
607
- )
608
- with gr.Column(scale=1):
609
- sections_info = gr.Label(
610
- label="๐Ÿ“Š Section Analysis",
611
- elem_classes="info-box"
612
- )
613
-
614
- submit_btn = gr.Button(
615
- "๐ŸŽผ Generate Music",
616
- variant="primary",
617
- elem_id="generate-btn"
618
- )
619
-
620
- # 출력 섹션
621
- with gr.Column(scale=1, elem_classes="output-section"):
622
- gr.Markdown("### ๐ŸŽต Generated Music")
623
- music_out = gr.Audio(
624
- label="Generated Song",
625
- elem_id="music-output"
626
- )
627
-
628
- # 진행 상태
629
- with gr.Group(elem_classes="status-section"):
630
- gr.Markdown("### ๐Ÿ”„ Generation Status")
631
- num_segments = gr.Number(
632
- label="Song Segments",
633
- value=2,
634
- interactive=False,
635
- visible=False
636
- )
637
- max_new_tokens = gr.Number(
638
- label="Tokens",
639
- value=4000,
640
- interactive=False,
641
- visible=False
642
- )
643
-
644
- # 예제 섹션
645
- with gr.Accordion("๐Ÿ“– Examples", open=False):
646
- gr.Examples(
647
- examples=[
648
- [
649
- "female blues airy vocal bright vocal piano sad romantic guitar jazz",
650
- """[verse]
651
  In the quiet of the evening, shadows start to fall
652
  Whispers of the night wind echo through the hall
653
  Lost within the silence, I hear your gentle voice
@@ -657,67 +524,31 @@ Guiding me back homeward, making my heart rejoice
657
  Don't let this moment fade, hold me close tonight
658
  With you here beside me, everything's alright
659
  Can't imagine life alone, don't want to let you go
660
- Stay with me forever, let our love just flow"""
661
- ],
662
- [
663
- "K-pop bright energetic synth dance electronic",
664
- """[verse]
 
665
  언젠가 마주한 눈빛 속에서
666
 
667
  [chorus]
668
  다시 한 번 내게 말해줘
 
 
 
 
 
 
669
 
670
- [verse]
671
- 어두운 밤을 지날 때마다
672
-
673
- [chorus]
674
- 다시 한 번 내게 말해줘"""
675
- ]
676
- ],
677
- inputs=[genre_txt, lyrics_txt]
678
- )
679
-
680
- # ๋„์›€๋ง ์„น์…˜
681
- with gr.Accordion("โ„น๏ธ Help & Information", open=False):
682
- gr.Markdown(
683
- """
684
- ### ๐ŸŽต How to Use
685
- 1. **Enter Genre & Style**: Describe the musical style you want
686
- 2. **Input Lyrics**: Write your lyrics using section tags
687
- 3. **Generate**: Click the Generate button and wait for your music!
688
-
689
- ### ๐ŸŒ Supported Languages
690
- - English
691
- - Korean (한국어)
692
- - Japanese (日本語)
693
- - Chinese (中文)
694
-
695
- ### โšก Tips for Best Results
696
- - Be specific with genre descriptions
697
- - Include emotion and instrument preferences
698
- - Properly tag your lyrics sections
699
- - Include both verse and chorus sections
700
- """
701
- )
702
-
703
- def update_info(lyrics):
704
- if not lyrics:
705
- return "No lyrics entered", "No sections detected"
706
- params = calculate_generation_params(lyrics)
707
- duration = params['estimated_duration']
708
- sections = params['sections']
709
- return (
710
- f"โฑ๏ธ Duration: {duration:.1f} seconds",
711
- f"๐Ÿ“Š Verses: {sections['verse']}, Chorus: {sections['chorus']}"
712
- )
713
-
714
- # 이벤트 핸들러 설정
715
  lyrics_txt.change(
716
  fn=update_info,
717
  inputs=[lyrics_txt],
718
  outputs=[duration_info, sections_info]
719
  )
720
-
 
721
  submit_btn.click(
722
  fn=infer,
723
  inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
@@ -726,6 +557,7 @@ Stay with me forever, let our love just flow"""
726
 
727
  return demo
728
 
 
729
  if __name__ == "__main__":
730
  demo = main()
731
  demo.queue(max_size=20).launch(
@@ -735,4 +567,4 @@ if __name__ == "__main__":
735
  show_api=True,
736
  show_error=True,
737
  max_threads=8
738
- )
 
7
  import logging
8
  import numpy as np
9
  import re
 
10
  from concurrent.futures import ThreadPoolExecutor
11
  from functools import lru_cache
 
12
 
13
  # 로깅 설정
14
  logging.basicConfig(
 
20
  ]
21
  )
22
 
23
+ ################################
24
+ # 기존에 정의된 함수 및 로직들 #
25
+ ################################
26
+
27
  def optimize_gpu_settings():
28
  if torch.cuda.is_available():
 
29
  torch.backends.cuda.matmul.allow_tf32 = True
30
  torch.backends.cudnn.benchmark = True
31
  torch.backends.cudnn.enabled = True
32
  torch.backends.cudnn.deterministic = False
 
 
33
  torch.cuda.empty_cache()
34
  torch.cuda.set_device(0)
 
 
35
  torch.cuda.Stream(0)
 
 
36
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
37
 
38
  logging.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
39
  logging.info(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
40
 
 
41
  if 'L40S' in torch.cuda.get_device_name(0):
42
  torch.cuda.set_per_process_memory_fraction(0.95)
43
 
 
59
  }
60
  last_section = None
61
 
 
62
  for i, line in enumerate(lines):
63
  if '[verse]' in line.lower() or '[chorus]' in line.lower() or '[bridge]' in line.lower():
64
  last_section = i
 
66
  for i, line in enumerate(lines):
67
  lower_line = line.lower()
68
 
 
69
  if '[verse]' in lower_line:
70
+ if current_section:
71
  section_lines[current_section].extend(lines[last_section_start:i])
72
  current_section = 'verse'
73
  sections['verse'] += 1
 
88
  last_section_start = i + 1
89
  continue
90
 
91
+ if current_section and 'last_section_start' in locals() and last_section_start < len(lines):
 
92
  section_lines[current_section].extend(lines[last_section_start:])
93
 
 
94
  if sections['chorus'] > 0 and repeat_chorus > 1:
95
  original_chorus = section_lines['chorus'][:]
96
  for _ in range(repeat_chorus - 1):
97
  section_lines['chorus'].extend(original_chorus)
98
 
99
+ logging.info(
100
+ f"Section line counts - Verse: {len(section_lines['verse'])}, "
101
+ f"Chorus: {len(section_lines['chorus'])}, "
102
+ f"Bridge: {len(section_lines['bridge'])}"
103
+ )
104
 
105
  return sections, (sections['verse'] + sections['chorus'] + sections['bridge']), len(lines), section_lines
106
 
107
  def calculate_generation_params(lyrics):
108
  sections, total_sections, total_lines, section_lines = analyze_lyrics(lyrics)
109
 
 
110
  time_per_line = {
111
+ 'verse': 4,
112
+ 'chorus': 6,
113
+ 'bridge': 5
114
  }
115
 
 
116
  section_durations = {}
117
  for section_type in ['verse', 'chorus', 'bridge']:
118
  lines_count = len(section_lines[section_type])
119
  section_durations[section_type] = lines_count * time_per_line[section_type]
120
 
 
121
  total_duration = sum(duration for duration in section_durations.values())
122
+ total_duration = max(60, int(total_duration * 1.2))
123
 
 
124
  base_tokens = 3000
125
  tokens_per_line = 200
126
+ extra_tokens = 1000
 
127
  total_tokens = base_tokens + (total_lines * tokens_per_line) + extra_tokens
128
 
 
129
  if sections['chorus'] > 0:
130
+ num_segments = 4
131
  else:
132
+ num_segments = 3
133
+
134
+ max_tokens = min(12000, total_tokens)
 
135
 
136
  return {
137
  'max_tokens': max_tokens,
 
185
  logging.warning(f"Failed to install flash-attn: {e}")
186
  return False
187
 
 
 
 
 
 
 
188
  def initialize_system():
189
  optimize_gpu_settings()
190
 
191
+ with ThreadPoolExecutor(max_workers=4) as executor:
192
+ futures = []
193
+ futures.append(executor.submit(install_flash_attn))
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ from huggingface_hub import snapshot_download
 
 
196
 
197
+ folder_path = './inference/xcodec_mini_infer'
198
+ os.makedirs(folder_path, exist_ok=True)
199
+ logging.info(f"Created folder at: {folder_path}")
200
+
201
+ futures.append(executor.submit(
202
+ snapshot_download,
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  repo_id="m-a-p/xcodec_mini_infer",
204
+ local_dir="./inference/xcodec_mini_infer",
205
+ resume_download=True
206
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
+ for future in futures:
209
+ future.result()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
+ try:
212
+ os.chdir("./inference")
213
+ logging.info(f"Working directory changed to: {os.getcwd()}")
214
+ except FileNotFoundError as e:
215
+ logging.error(f"Directory error: {e}")
216
+ raise
217
 
218
  @lru_cache(maxsize=100)
219
  def get_cached_file_path(content_hash, prefix):
 
256
  logging.error(f"Failed to get audio duration: {e}")
257
  return None
258
 
259
+ def optimize_model_selection(lyrics, genre):
260
+ model_path = detect_and_select_model(lyrics)
261
+ params = calculate_generation_params(lyrics)
262
+
263
+ has_chorus = params['sections']['chorus'] > 0
264
+
265
+ model_config = {
266
+ "m-a-p/YuE-s1-7B-anneal-en-cot": {
267
+ "max_tokens": params['max_tokens'],
268
+ "temperature": 0.8,
269
+ "batch_size": 16,
270
+ "num_segments": params['num_segments'],
271
+ "estimated_duration": params['estimated_duration']
272
+ },
273
+ "m-a-p/YuE-s1-7B-anneal-jp-kr-cot": {
274
+ "max_tokens": params['max_tokens'],
275
+ "temperature": 0.7,
276
+ "batch_size": 16,
277
+ "num_segments": params['num_segments'],
278
+ "estimated_duration": params['estimated_duration']
279
+ },
280
+ "m-a-p/YuE-s1-7B-anneal-zh-cot": {
281
+ "max_tokens": params['max_tokens'],
282
+ "temperature": 0.7,
283
+ "batch_size": 16,
284
+ "num_segments": params['num_segments'],
285
+ "estimated_duration": params['estimated_duration']
286
+ }
287
+ }
288
+
289
+ if has_chorus:
290
+ for config in model_config.values():
291
+ config['max_tokens'] = int(config['max_tokens'] * 1.5)
292
+
293
+ return model_path, model_config[model_path], params
294
+
295
  def infer(genre_txt_content, lyrics_txt_content, num_segments, max_new_tokens):
296
  genre_txt_path = None
297
  lyrics_txt_path = None
298
 
299
  try:
 
 
 
300
  model_path, config, params = optimize_model_selection(lyrics_txt_content, genre_txt_content)
301
  logging.info(f"Selected model: {model_path}")
302
  logging.info(f"Lyrics analysis: {params}")
 
303
 
304
  has_chorus = params['sections']['chorus'] > 0
305
  estimated_duration = params.get('estimated_duration', 90)
306
 
 
307
  # 세그먼트 및 토큰 수 설정
308
  if has_chorus:
309
  actual_max_tokens = min(12000, int(config['max_tokens'] * 1.3)) # 30% 더 많은 토큰
310
+ actual_num_segments = min(5, params['num_segments'] + 2) # 추가 세그먼트
311
  else:
312
  actual_max_tokens = min(10000, int(config['max_tokens'] * 1.2))
313
  actual_num_segments = min(4, params['num_segments'] + 1)
314
 
 
 
315
  logging.info(f"Estimated duration: {estimated_duration} seconds")
316
  logging.info(f"Has chorus sections: {has_chorus}")
317
  logging.info(f"Using segments: {actual_num_segments}, tokens: {actual_max_tokens}")
 
323
  os.makedirs(output_dir, exist_ok=True)
324
  empty_output_folder(output_dir)
325
 
 
 
 
 
 
 
326
  command = [
327
+ "python", "infer.py",
 
328
  "--stage1_model", model_path,
329
  "--stage2_model", "m-a-p/YuE-s2-1B-general",
330
+ "--genre_txt", genre_txt_path,
331
+ "--lyrics_txt", lyrics_txt_path,
332
  "--run_n_segments", str(actual_num_segments),
333
  "--stage2_batch_size", "16",
334
+ "--output_dir", output_dir,
335
  "--cuda_idx", "0",
336
  "--max_new_tokens", str(actual_max_tokens),
337
  "--disable_offload_model"
338
  ]
339
+
340
  env = os.environ.copy()
 
 
341
  if torch.cuda.is_available():
342
  env.update({
343
  "CUDA_VISIBLE_DEVICES": "0",
 
345
  "PATH": f"/usr/local/cuda/bin:{env.get('PATH', '')}",
346
  "LD_LIBRARY_PATH": f"/usr/local/cuda/lib64:{env.get('LD_LIBRARY_PATH', '')}",
347
  "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
348
+ "CUDA_LAUNCH_BLOCKING": "0"
 
349
  })
350
 
351
+ # transformers 캐시 마이그레이션 처리 (버전에 따라 동작하지 않을 수 있음)
352
  try:
353
  from transformers.utils import move_cache
354
  move_cache()
 
382
  logging.info(f"Expected duration: {estimated_duration} seconds")
383
 
384
  if duration < estimated_duration * 0.8:
385
+ logging.warning(
386
+ f"Generated audio is shorter than expected: {duration:.2f}s < {estimated_duration:.2f}s"
387
+ )
388
  except Exception as e:
389
  logging.warning(f"Failed to get audio duration: {e}")
390
  return last_mp3
 
404
  except Exception as e:
405
  logging.warning(f"Failed to remove temporary file {path}: {e}")
406
 
407
+ #####################################
408
+ # 아래부터 Gradio UI 및 main() 부분 #
409
+ #####################################
410
+
411
+ def update_info(lyrics):
412
+ """๊ฐ€์‚ฌ ๋ณ€๊ฒฝ ์‹œ ์ถ”์ • ์ •๋ณด๋ฅผ ์—…๋ฐ์ดํŠธํ•˜๋Š” ํ•จ์ˆ˜."""
413
+ if not lyrics:
414
+ return "No lyrics entered", "No sections detected"
415
  params = calculate_generation_params(lyrics)
416
+ duration = params['estimated_duration']
417
+ sections = params['sections']
418
+ return (
419
+ f"Estimated duration: {duration:.1f} seconds",
420
+ f"Verses: {sections['verse']}, Chorus: {sections['chorus']} (Expected full length including chorus)"
421
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
 
423
  def main():
424
+ # 먼저 시스템 초기화 (GPU 최적화, 필요한 모델 다운로드 등)
425
+ initialize_system()
426
+
427
+ with gr.Blocks(css="""
428
+ /* 전체 배경 및 컨테이너 스타일 */
429
+ body {
430
+ background-color: #f5f5f5;
431
+ }
432
+ .gradio-container {
433
+ max-width: 1000px;
434
+ margin: auto !important;
435
+ background-color: #ffffff;
436
+ border-radius: 8px;
437
+ padding: 20px;
438
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
439
+ }
440
+ /* 텍스트 크기, 마진 조정 */
441
+ h1, h2, h3 {
442
+ margin: 0;
443
+ padding: 0;
444
+ }
445
+ p {
446
+ margin: 5px 0;
447
+ }
448
+ /* 예제 블록 스타일 */
449
+ .gr-examples {
450
+ background-color: #fafafa;
451
+ border-radius: 8px;
452
+ padding: 10px;
453
+ }
454
+ """) as demo:
455
+
456
+ # 상단 헤더
457
+ gr.HTML("""
458
+ <div style="text-align: center; margin-bottom: 1.5rem;">
459
+ <h1>Open SUNO: Full-Song Generation (Multi-Language Support)</h1>
460
+ <p style="font-size: 1.1rem; color: #555;">
461
+ Enter your song details below and let the AI handle the music production!
462
+ </p>
463
+ </div>
464
+ """)
465
+
466
+ with gr.Row():
467
+ # 왼쪽 입력 컬럼
468
+ with gr.Column():
469
+ genre_txt = gr.Textbox(
470
+ label="Genre",
471
+ placeholder="Enter music genre and style descriptions...",
472
+ lines=2
473
+ )
474
+ lyrics_txt = gr.Textbox(
475
+ label="Lyrics (Supports English, Korean, Japanese, Chinese)",
476
+ placeholder="Enter song lyrics with [verse], [chorus], [bridge] tags...",
477
+ lines=10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  )
479
 
480
+ # 오른쪽 설정/정보 컬럼
481
+ with gr.Column():
482
+ with gr.Box():
483
+ gr.Markdown("### Generation Settings")
484
+ num_segments = gr.Number(
485
+ label="Number of Song Segments (Auto-adjusted)",
486
+ value=2,
487
+ minimum=1,
488
+ maximum=4,
489
+ step=1,
490
+ interactive=False
491
  )
492
+ max_new_tokens = gr.Slider(
493
+ label="Max New Tokens (Auto-adjusted)",
494
+ minimum=500,
495
+ maximum=32000,
496
+ step=500,
497
+ value=4000,
498
+ interactive=False
499
  )
500
+
501
+ with gr.Box():
502
+ gr.Markdown("### Song Info")
503
+ duration_info = gr.Label(label="Estimated Duration")
504
+ sections_info = gr.Label(label="Section Information")
505
+
506
+ submit_btn = gr.Button("Generate Music", variant="primary")
507
+
508
+ # 생성된 오디오 출력 영역
509
+ with gr.Box():
510
+ music_out = gr.Audio(label="Generated Audio")
511
+
512
+ # 예시
513
+ gr.Examples(
514
+ examples=[
515
+ [
516
+ "female blues airy vocal bright vocal piano sad romantic guitar jazz",
517
+ """[verse]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  In the quiet of the evening, shadows start to fall
519
  Whispers of the night wind echo through the hall
520
  Lost within the silence, I hear your gentle voice
 
524
  Don't let this moment fade, hold me close tonight
525
  With you here beside me, everything's alright
526
  Can't imagine life alone, don't want to let you go
527
+ Stay with me forever, let our love just flow
528
+ """
529
+ ],
530
+ [
531
+ "K-pop bright energetic synth dance electronic",
532
+ """[verse]
533
  언젠가 마주한 눈빛 속에서
534
 
535
  [chorus]
536
  다시 한 번 내게 말해줘
537
+ """
538
+ ]
539
+ ],
540
+ inputs=[genre_txt, lyrics_txt],
541
+ outputs=[]
542
+ )
543
 
544
+ # 가사 변경 시 추정 정보 업데이트
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
  lyrics_txt.change(
546
  fn=update_info,
547
  inputs=[lyrics_txt],
548
  outputs=[duration_info, sections_info]
549
  )
550
+
551
+ # 버튼 클릭 시 infer 실행
552
  submit_btn.click(
553
  fn=infer,
554
  inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
 
557
 
558
  return demo
559
 
560
+
561
  if __name__ == "__main__":
562
  demo = main()
563
  demo.queue(max_size=20).launch(
 
567
  show_api=True,
568
  show_error=True,
569
  max_threads=8
570
+ )