# Source: Hugging Face Space fffiloni/instant-TTS-Bark-cloning (status: Paused). File size: 2,969 Bytes

import gradio as gr
import os 
import shutil

from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

# Hugging Face Hub repositories to fetch at startup.
model_ids = ['suno/bark']

# Download a snapshot of each repository into checkpoints/<repo name>, where
# <repo name> is the part of the id after the last '/'.
for model_id in model_ids:
    model_name = model_id.rsplit('/', 1)[-1]
    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')

from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark

# Build a Bark model from its default configuration, then load the weights
# from the local checkpoints/bark directory.
# NOTE(review): eval=True presumably switches the model to inference mode --
# confirm against the TTS library.
config = BarkConfig()
model = Bark.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_dir="checkpoints/bark",
    eval=True,
)

def infer(prompt, input_wav_file):
    """Clone the voice in an uploaded WAV file and speak ``prompt`` with it.

    The upload is moved to ``bark_voices/<name>/<name>.wav``, where ``<name>``
    is the upload's base name without its extension. ``<name>`` is then passed
    to ``model.synthesize`` as the speaker id with ``bark_voices/`` as the
    voice directory, and the returned waveform is written to ``output.wav``.

    Args:
        prompt: Text to synthesize.
        input_wav_file: Filesystem path of the reference recording, as
            delivered by the ``gr.Audio(type="filepath")`` input. Falsy
            (``None``) when nothing was uploaded.

    Returns:
        A ``(wav_path, npz_path)`` tuple: the path of the generated audio and
        the path of the ``.npz`` file present in the speaker directory after
        synthesis, or ``None`` for the latter when no such file exists.

    Raises:
        gr.Error: If no reference audio was provided.
    """
    # Without this guard os.path.basename(None) fails with an opaque TypeError.
    if not input_wav_file:
        raise gr.Error("Please upload a WAV file of the voice to clone.")

    # Each speaker gets its own sub-directory named after the uploaded file.
    file_name = os.path.splitext(os.path.basename(input_wav_file))[0]
    speaker_dir = os.path.join("bark_voices", file_name)
    os.makedirs(speaker_dir, exist_ok=True)

    # Move the upload to bark_voices/<name>/<name>.wav so it can be looked up
    # by speaker id.
    shutil.move(input_wav_file, os.path.join(speaker_dir, f"{file_name}.wav"))

    # Cloning a speaker: the speaker id is the sub-directory name created above.
    output_dict = model.synthesize(
        prompt,
        config,
        speaker_id=f"{file_name}",
        voice_dirs="bark_voices/",
    )

    print(output_dict)

    # NOTE(review): hard-coded rate; confirm it matches the model's output.
    sample_rate = 24000
    wavfile.write("output.wav", sample_rate, output_dict["wav"])

    # os.listdir returns entries in arbitrary order, so sort for stable output
    # and pick the .npz by extension rather than by position.
    contents = sorted(os.listdir(speaker_dir))
    for item in contents:
        print(item)

    # presumably the .npz is written by the TTS library while cloning; verify.
    npz_files = [item for item in contents if item.lower().endswith(".npz")]
    npz_path = os.path.join(speaker_dir, npz_files[0]) if npz_files else None

    return "output.wav", npz_path

# Page-level CSS: caps the main column at 780px wide and centers it.
css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Page title.
        gr.HTML("""
        <h1>Instant Voice Cloning</h1>
        """)

        # Inputs: the text to speak and the reference recording (passed to the
        # callback as a file path).
        prompt = gr.Textbox(label="Text to speech prompt")
        audio_in = gr.Audio(label="WAV voice to clone", type="filepath", source="upload")

        submit_btn = gr.Button("Submit")

        # Outputs: the synthesized audio and the .npz file returned by infer.
        cloned_out = gr.Audio(label="Text to speech output")
        npz_file = gr.File(label=".npz file")

    # Run infer on click; its two return values fill the two outputs in order.
    submit_btn.click(
        fn=infer,
        inputs=[prompt, audio_in],
        outputs=[cloned_out, npz_file],
    )

# Enable request queuing, then start the app.
demo.queue().launch()