File size: 3,180 Bytes
95395b5
 
3460fd7
 
cbcbc7e
 
44e69f7
50e3e8e
cbcbc7e
 
95395b5
cbcbc7e
50e3e8e
 
cbcbc7e
 
 
95395b5
cbcbc7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95395b5
cbcbc7e
95395b5
 
cbcbc7e
95395b5
cbcbc7e
 
 
95395b5
cbcbc7e
95395b5
 
cbcbc7e
95395b5
 
 
cbcbc7e
95395b5
cbcbc7e
 
 
95395b5
cbcbc7e
 
 
95395b5
cbcbc7e
50e3e8e
cbcbc7e
 
 
 
50e3e8e
 
 
 
 
 
 
 
 
cbcbc7e
50e3e8e
 
 
 
cbcbc7e
50e3e8e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import gradio as gr
from huggingface_hub import InferenceClient
import torch
from TTS.api import TTS
import os
import subprocess

# Load the TTS model once at import time and move it to the GPU when CUDA is
# available, otherwise keep it on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)

# Hugging Face inference client used by respond() for streamed chat completions.
# NOTE(review): confirm that "deepseek-ai/deepseek-r1-7b" is a valid, served
# model id on the Hub; it is not verifiable from this file.
client = InferenceClient("deepseek-ai/deepseek-r1-7b")

# RVC model files. These are relative paths, so convert_voice() looks for them
# relative to the process's current working directory.
RVC_MODEL_PATH = "zeldabotw.pth"
RVC_INDEX_PATH = "zeldabotw.index"

# Function to call RVC for voice conversion
def convert_voice(input_wav, output_wav):
    """Convert a TTS wav file to the ZeldaBotW voice by running ``infer_rvc.py``.

    Args:
        input_wav: Path of the wav file handed to ``infer_rvc.py`` as ``--input``.
        output_wav: Path handed to ``infer_rvc.py`` as ``--output``.

    Returns:
        ``output_wav`` when the subprocess exits with status 0; ``None`` when it
        fails (its stderr is printed).

    Raises:
        FileNotFoundError: If ``RVC_MODEL_PATH`` or ``RVC_INDEX_PATH`` does not exist.
    """
    if not (os.path.exists(RVC_MODEL_PATH) and os.path.exists(RVC_INDEX_PATH)):
        raise FileNotFoundError("RVC model files not found: Ensure zeldabotw.pth and zeldabotw.index are in the same directory.")

    # Pass the arguments as a list and do not go through a shell: paths that
    # contain spaces or shell metacharacters are then delivered verbatim to the
    # script instead of being re-parsed (or executed) by the shell.
    command = [
        "python",
        "infer_rvc.py",
        "--input", str(input_wav),
        "--output", str(output_wav),
        "--model", RVC_MODEL_PATH,
        "--index", RVC_INDEX_PATH,
        "--pitch_shift", "0",
    ]

    process = subprocess.run(command, capture_output=True, text=True)
    if process.returncode != 0:
        # Best-effort: report the failure and let the caller continue without
        # a converted file.
        print("RVC conversion failed:", process.stderr)
        return None
    return output_wav

# Chatbot Response + TTS + RVC
def respond(
    message, history, system_message, max_tokens, temperature, top_p
):
    """Stream the LLM reply, then synthesize it and convert the voice.

    This is a generator. While the model is streaming it yields
    ``(partial_text, None, None)``; once the reply is complete it yields a
    final ``(full_text, tts_wav_path, rvc_wav_path_or_None)`` tuple.

    Args:
        message: The new user message.
        history: Previous turns as a sequence of ``(user, assistant)`` pairs;
            falsy entries are skipped. ``None`` is treated as no history.
        system_message: Content of the system prompt.
        max_tokens: Passed to ``client.chat_completion`` as ``max_tokens``.
        temperature: Passed to ``client.chat_completion`` as ``temperature``.
        top_p: Passed to ``client.chat_completion`` as ``top_p``.

    Raises:
        FileNotFoundError: Propagated from ``convert_voice`` when the RVC model
            files are missing.
    """
    messages = [{"role": "system", "content": system_message}]

    for val in history or []:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""
    # Use a distinct loop name so the `message` parameter is not shadowed.
    for chunk in client.chat_completion(
        messages, max_tokens=max_tokens, stream=True, temperature=temperature, top_p=top_p
    ):
        token = chunk.choices[0].delta.content
        # Stream chunks may carry no text (content is optional, e.g. on the
        # closing chunk); concatenating None would raise TypeError.
        if not token:
            continue
        response += token
        yield response, None, None  # Text first

    # Nothing to speak: skip TTS/RVC instead of synthesizing empty text.
    if not response.strip():
        yield response, None, None
        return

    # Generate Speech from Text
    tts_audio_path = "tts_output.wav"
    tts_model.tts_to_file(text=response, file_path=tts_audio_path)

    # Convert TTS output to ZeldaBotW voice (None when the conversion fails)
    rvc_audio_path = "rvc_output.wav"
    rvc_converted_path = convert_voice(tts_audio_path, rvc_audio_path)

    yield response, tts_audio_path, rvc_converted_path  # Send text, TTS, and RVC output

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## DeepSeek R1 7B Chatbot with ZeldaBotW Voice")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="User Input")

    system_msg = gr.Textbox(value="You are a friendly Chatbot.", label="System Message")
    max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max Tokens")
    temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    tts_audio = gr.Audio(type="filepath", label="TTS Output")
    rvc_audio = gr.Audio(type="filepath", label="RVC ZeldaBotW Voice")

    def chat_fn(message, history, system_message, token_limit, sampling_temperature, nucleus_p):
        """Stream one chat turn into the chatbot and the two audio players.

        The settings arrive as event inputs so they reflect what the user has
        currently entered; reading ``component.value`` here would only give the
        defaults the components were constructed with.

        This must itself be a generator function (not a function returning a
        generator) for Gradio to stream the intermediate updates.
        """
        previous_turns = list(history or [])
        for text, tts_path, rvc_path in respond(
            message, previous_turns, system_message, token_limit, sampling_temperature, nucleus_p
        ):
            # The chatbot output expects the whole conversation, so append the
            # in-progress (user, assistant) pair to the earlier turns.
            yield previous_turns + [[message, text]], tts_path, rvc_path

    msg.submit(
        chat_fn,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, tts_audio, rvc_audio],
    )

demo.launch()