{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bark generation with optional RVC voice conversion\n",
    "\n",
    "Generates speech with Bark (optionally from fine-tuned semantic / coarse / fine checkpoints) and, when `use_rvc` is enabled, passes the result through an RVC voice model. Run the cells from top to bottom on a fresh kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "from IPython.display import Audio\n",
    "from scipy.io.wavfile import write as write_wav\n",
    "\n",
    "from bark.api import generate_audio\n",
    "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "semantic_path = \"semantic_output/pytorch_model.bin\" # set to None if you don't want to use finetuned semantic\n",
    "coarse_path = \"coarse_output/pytorch_model.bin\" # set to None if you don't want to use finetuned coarse\n",
    "fine_path = \"fine_output/pytorch_model.bin\" # set to None if you don't want to use finetuned fine\n",
    "use_rvc = True # Set to False to use bark without RVC\n",
    "rvc_name = 'mi-test'\n",
    "rvc_path = f\"Retrieval-based-Voice-Conversion-WebUI/weights/{rvc_name}.pth\"\n",
    "index_path = f\"Retrieval-based-Voice-Conversion-WebUI/logs/{rvc_name}/added_IVF256_Flat_nprobe_1_{rvc_name}_v2.index\"\n",
    "device=\"cuda:0\"\n",
    "is_half=True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download and load all models\n",
    "preload_models(\n",
    "    text_use_gpu=True,\n",
    "    text_use_small=False,\n",
    "    text_model_path=semantic_path,\n",
    "    coarse_use_gpu=True,\n",
    "    coarse_use_small=False,\n",
    "    coarse_model_path=coarse_path,\n",
    "    fine_use_gpu=True,\n",
    "    fine_use_small=False,\n",
    "    fine_model_path=fine_path,\n",
    "    codec_use_gpu=True,\n",
    "    force_reload=False,\n",
    "    path=\"models\"\n",
    ")\n",
    "\n",
    "# rvc_infer is only needed when RVC is enabled, so the import stays conditional.\n",
    "if use_rvc:\n",
    "    from rvc_infer import get_vc, vc_single\n",
    "    get_vc(rvc_path, device, is_half)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RVC helper\n",
    "\n",
    "Shared by both generation cells below; only called when `use_rvc` is `True`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def apply_rvc(\n",
    "    filepath,\n",
    "    f0up_key=-6,\n",
    "    f0method=\"harvest\",  # harvest or pm\n",
    "    index_rate=0.75,\n",
    "    filter_radius=3,\n",
    "    rms_mix_rate=0.25,\n",
    "    protect=0.33,\n",
    "    resample_sr=SAMPLE_RATE,\n",
    "):\n",
    "    \"\"\"Convert the wav file at `filepath` with RVC, overwrite it, and return the audio.\n",
    "\n",
    "    Requires `use_rvc = True`, so that the model-loading cell has imported\n",
    "    `vc_single` and called `get_vc`. Reads the module-level `index_path`.\n",
    "    If `f0method` fails, the conversion is retried once with 'pm'.\n",
    "\n",
    "    NOTE(review): the file is written at `resample_sr`, assuming `vc_single`\n",
    "    returns audio at that rate - confirm against rvc_infer.\n",
    "    \"\"\"\n",
    "\n",
    "    def convert(method):\n",
    "        # Positional arguments are passed exactly as the original cells passed them.\n",
    "        return vc_single(\n",
    "            0, filepath, f0up_key, None, method, index_path, index_rate,\n",
    "            filter_radius=filter_radius,\n",
    "            resample_sr=resample_sr,\n",
    "            rms_mix_rate=rms_mix_rate,\n",
    "            protect=protect,\n",
    "        )\n",
    "\n",
    "    try:\n",
    "        converted = convert(f0method)\n",
    "    except Exception as err:  # not a bare except: Ctrl-C must still interrupt the cell\n",
    "        if f0method == \"pm\":\n",
    "            raise  # nothing left to fall back to\n",
    "        print(f\"RVC f0 method {f0method!r} failed ({err!r}); retrying with 'pm'\")\n",
    "        converted = convert(\"pm\")\n",
    "\n",
    "    write_wav(filepath, resample_sr, converted)\n",
    "    return converted"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple generation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# simple generation\n",
    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
    "\n",
    "filepath = \"output/audio.wav\"\n",
    "Path(filepath).parent.mkdir(parents=True, exist_ok=True)  # write_wav does not create missing directories\n",
    "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n",
    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
    "\n",
    "if use_rvc:\n",
    "    audio_array = apply_rvc(filepath)\n",
    "\n",
    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generation with more control"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generation with more control\n",
    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
    "voice_name = \"speaker_0\" # use your custom voice name here if you have one\n",
    "\n",
    "filepath = \"output/audio.wav\"\n",
    "Path(filepath).parent.mkdir(parents=True, exist_ok=True)  # write_wav does not create missing directories\n",
    "\n",
    "x_semantic = generate_text_semantic(\n",
    "    text_prompt,\n",
    "    history_prompt=voice_name,\n",
    "    temp=0.7,\n",
    "    top_k=50,\n",
    "    top_p=0.95,\n",
    ")\n",
    "\n",
    "x_coarse_gen = generate_coarse(\n",
    "    x_semantic,\n",
    "    history_prompt=voice_name,\n",
    "    temp=0.7,\n",
    "    top_k=50,\n",
    "    top_p=0.95,\n",
    ")\n",
    "x_fine_gen = generate_fine(\n",
    "    x_coarse_gen,\n",
    "    history_prompt=voice_name,\n",
    "    temp=0.5,\n",
    ")\n",
    "audio_array = codec_decode(x_fine_gen)\n",
    "write_wav(filepath, SAMPLE_RATE, audio_array)\n",
    "\n",
    "if use_rvc:\n",
    "    audio_array = apply_rvc(filepath)\n",
    "\n",
    "Audio(audio_array, rate=SAMPLE_RATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}