InspireMusic / app.py
chong.zhang
update
c75e017
raw
history blame
8.46 kB
# Copyright (c) 2024 Alibaba Inc (authors: Chong Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.system('nvidia-smi')
os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
# os.system('pip install flash-attn --no-build-isolation')
# os.system('git submodule update --init --recursive')
# os.system('git clone https://github.com/shivammehta25/Matcha-TTS.git third_party/')
# os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
# os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git && for i in InspireMusic-Base; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
import sys
import torch
print(torch.backends.cudnn.version())
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
import spaces
import gradio as gr
from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
import torchaudio
import datetime
import hashlib
import importlib
MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-1.5B-24kHz", "InspireMusic-Base", "InspireMusic-Base-24kHz"]
def generate_filename():
hash_object = hashlib.sha256(str(int(datetime.datetime.now().timestamp())).encode())
hash_string = hash_object.hexdigest()
return hash_string
def get_args(
task, text="", audio=None, model_name="InspireMusic-Base",
chorus="intro",
output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False):
if output_sample_rate == 24000:
fast = True
else:
fast = False
# This function constructs the arguments required for InspireMusic
args = {
"task" : task,
"text" : text,
"audio_prompt" : audio,
"model_name" : model_name,
"chorus" : chorus,
"fast" : fast,
"fade_out" : True,
"trim" : trim,
"output_sample_rate" : output_sample_rate,
"min_generate_audio_seconds": 10.0,
"max_generate_audio_seconds": max_generate_audio_seconds,
"max_audio_prompt_length": 5.0,
"model_dir" : os.path.join("pretrained_models",
model_name),
"result_dir" : "exp/inspiremusic",
"output_fn" : generate_filename(),
"format" : "wav",
"time_start" : time_start,
"time_end": time_end,
"fade_out_duration": 1.0,
}
if args["time_start"] is None:
args["time_start"] = 0.0
args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]
print(args)
return args
def trim_audio(audio_file, cut_seconds=5):
audio, sr = torchaudio.load(audio_file)
num_samples = cut_seconds * sr
cutted_audio = audio[:, :num_samples]
output_path = os.path.join(os.getcwd(), "audio_prompt_" + generate_filename() + ".wav")
torchaudio.save(output_path, cutted_audio, sr)
return output_path
@spaces.GPU(duration=120)
def music_generation(args):
set_env_variables()
model = InspireMusicUnified(
model_name=args["model_name"],
model_dir=args["model_dir"],
min_generate_audio_seconds=args["min_generate_audio_seconds"],
max_generate_audio_seconds=args["max_generate_audio_seconds"],
sample_rate=24000,
output_sample_rate=args["output_sample_rate"],
load_jit=True,
load_onnx=False,
fast=args["fast"],
result_dir=args["result_dir"])
output_path = model.inference(
task=args["task"],
text=args["text"],
audio_prompt=args["audio_prompt"],
chorus=args["chorus"],
time_start=args["time_start"],
time_end=args["time_end"],
output_fn=args["output_fn"],
max_audio_prompt_length=args["max_audio_prompt_length"],
fade_out_duration=args["fade_out_duration"],
output_format=args["format"],
fade_out_mode=args["fade_out"],
trim=args["trim"])
return output_path
# @spaces.GPU(duration=120)
def demo_inspiremusic_t2m(text, model_name, chorus,
output_sample_rate, max_generate_audio_seconds):
args = get_args(
task='text-to-music', text=text, audio=None,
model_name=model_name, chorus=chorus,
output_sample_rate=output_sample_rate,
max_generate_audio_seconds=max_generate_audio_seconds)
return music_generation(args)
# @spaces.GPU(duration=120)
def demo_inspiremusic_con(text, audio, model_name, chorus,
output_sample_rate, max_generate_audio_seconds):
args = get_args(
task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
model_name=model_name, chorus=chorus,
output_sample_rate=output_sample_rate,
max_generate_audio_seconds=max_generate_audio_seconds)
return music_generation(args)
# @spaces.GPU(duration=120)
def main():
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# InspireMusic
- Support text-to-music, music continuation, audio super-resolution, audio reconstruction tasks with high audio quality, with available sampling rates of 24kHz, 48kHz.
- Support long audio generation in multiple output audio formats, i.e., wav, flac, mp3, m4a.
- Open-source [InspireMusic-Base](https://modelscope.cn/models/iic/InspireMusic/summary), [InspireMusic-Base-24kHz](https://modelscope.cn/models/iic/InspireMusic-Base-24kHz/summary), [InspireMusic-1.5B](https://modelscope.cn/models/iic/InspireMusic-1.5B/summary), [InspireMusic-1.5B-24kHz](https://modelscope.cn/models/iic/InspireMusic-1.5B-24kHz/summary), [InspireMusic-1.5B-Long](https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/summary) models for music generation.
- Currently only support English text prompts.
This page is for demo purpose, if you want to generate long form music, please try to deploy locally. Thank you for your support.
""")
with gr.Row(equal_height=True):
model_name = gr.Dropdown(MODELS, label="Select Model Name", value="InspireMusic-1.5B-Long")
chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
label="Chorus Mode", value="intro")
output_sample_rate = gr.Dropdown([48000, 24000],
label="Output Audio Sample Rate (Hz)",
value=48000)
max_generate_audio_seconds = gr.Slider(10, 30,
label="Generate Audio Length (s)",
value=30)
# with gr.Row(equal_height=True):
text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)", value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
music_output = gr.Audio(label="Text to Music Output", type="filepath")
button = gr.Button("Text to Music")
button.click(demo_inspiremusic_t2m,
inputs=[text_input, model_name,
chorus,
output_sample_rate,
max_generate_audio_seconds],
outputs=music_output)
audio_input = gr.Audio(label="Input Audio Prompt (For Music Continuation Task)",
type="filepath")
music_con_output = gr.Audio(label="Music Continuation Output", type="filepath")
generate_button = gr.Button("Music Continuation")
generate_button.click(demo_inspiremusic_con,
inputs=[text_input, audio_input, model_name,
chorus,
output_sample_rate,
max_generate_audio_seconds],
outputs=music_con_output)
demo.launch()
if __name__ == '__main__':
main()