File size: 3,267 Bytes
95395b5
 
3460fd7
 
cbcbc7e
 
44e69f7
6e54df2
 
 
 
 
95395b5
cbcbc7e
50e3e8e
 
cbcbc7e
 
 
95395b5
6e54df2
cbcbc7e
6e54df2
cbcbc7e
6150e28
cbcbc7e
6e54df2
cbcbc7e
 
 
 
 
 
 
 
6150e28
95395b5
cbcbc7e
95395b5
cbcbc7e
 
 
95395b5
cbcbc7e
95395b5
4c352f8
 
 
 
95395b5
6e54df2
cbcbc7e
 
95395b5
6e54df2
cbcbc7e
 
95395b5
4c352f8
50e3e8e
cbcbc7e
 
6e54df2
cbcbc7e
4c352f8
50e3e8e
 
 
 
 
 
 
 
cbcbc7e
50e3e8e
 
 
 
cbcbc7e
50e3e8e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from huggingface_hub import InferenceClient
import torch
from TTS.api import TTS
import os
import subprocess

# Force CPU usage — no CUDA assumed on this host.
device = "cpu"

# Load the Coqui TTS model (Tacotron2-DDC trained on LJSpeech); gpu=False pins
# synthesis to the CPU regardless of what torch detects.
tts_model = TTS("tts_models/en/ljspeech/tacotron2-DDC", gpu=False)  # ✅ Ensures CPU-only execution

# Hugging Face Inference API client used for chat completions (DeepSeek R1 7B).
client = InferenceClient("deepseek-ai/deepseek-r1-7b")

# RVC voice-conversion model weights and feature index.
# NOTE(review): relative paths — assumed to sit next to this script; confirm
# the working directory at launch matches the deployment layout.
RVC_MODEL_PATH = "zeldabotw.pth"
RVC_INDEX_PATH = "zeldabotw.index"

# Function to call RVC for voice conversion (CPU Mode)
def convert_voice(input_wav, output_wav):
    """Convert input TTS audio to the ZeldaBotW voice using RVC (CPU mode).

    Args:
        input_wav: Path to the source WAV produced by the TTS model.
        output_wav: Path where the converted WAV should be written.

    Returns:
        ``output_wav`` on success, ``None`` if the RVC subprocess failed.

    Raises:
        FileNotFoundError: If the RVC model weights or index file is missing.
    """
    if not os.path.exists(RVC_MODEL_PATH) or not os.path.exists(RVC_INDEX_PATH):
        raise FileNotFoundError("RVC model files not found! Ensure zeldabotw.pth and zeldabotw.index are in the same directory.")

    # Build the command as an argv list and run with shell=False (the default):
    # the previous f-string + shell=True form broke on paths containing spaces
    # and allowed shell injection through the file-name arguments.
    command = [
        "python", "infer_rvc.py",
        "--input", input_wav,
        "--output", output_wav,
        "--model", RVC_MODEL_PATH,
        "--index", RVC_INDEX_PATH,
        "--pitch_shift", "0",
        "--device", "cpu",
    ]

    process = subprocess.run(command, capture_output=True, text=True)
    if process.returncode != 0:
        print("RVC conversion failed:", process.stderr)
        return None
    return output_wav

# Chatbot Response + TTS + RVC
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Generate an LLM reply for *message*, then synthesize and voice-convert it.

    Args:
        message: The new user message.
        history: Prior turns as (user, assistant) pairs; empty slots are skipped.
        system_message: System prompt placed first in the conversation.
        max_tokens / temperature / top_p: Sampling parameters forwarded to the LLM.

    Returns:
        Tuple of (response text, TTS wav path, RVC wav path or None on failure).
    """
    messages = [{"role": "system", "content": system_message}]

    # Rebuild the chat transcript from paired history entries.
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    # With stream=False, chat_completion returns one ChatCompletionOutput — not
    # an iterable of chunks. The old `for message in client.chat_completion(...)`
    # loop was wrong for non-streaming mode and also shadowed the `message`
    # parameter; read the single completed choice directly instead.
    completion = client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=False,
        temperature=temperature,
        top_p=top_p,
    )
    response = completion.choices[0].message.content or ""

    # Generate speech from the reply text (CPU mode).
    tts_audio_path = "tts_output.wav"
    tts_model.tts_to_file(text=response, file_path=tts_audio_path)

    # Convert the TTS output to the ZeldaBotW voice; may be None if RVC fails.
    rvc_audio_path = "rvc_output.wav"
    rvc_converted_path = convert_voice(tts_audio_path, rvc_audio_path)

    return response, tts_audio_path, rvc_converted_path

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("## DeepSeek R1 7B Chatbot with ZeldaBotW Voice (CPU Mode)")

    chatbot = gr.Chatbot(type="messages")  # ✅ Fix deprecated type warning
    msg = gr.Textbox(label="User Input")

    system_msg = gr.Textbox(value="You are a friendly Chatbot.", label="System Message")
    max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max Tokens")
    temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p (Nucleus Sampling)")

    tts_audio = gr.Audio(type="filepath", label="TTS Output")
    rvc_audio = gr.Audio(type="filepath", label="RVC ZeldaBotW Voice")

    def chat_fn(message, history, system_message, max_tokens, temperature, top_p):
        """Bridge the UI event to respond(): adapt history formats and update the chat."""
        # Chatbot(type="messages") stores history as {"role", "content"} dicts,
        # while respond() indexes it as (user, assistant) pairs — convert first.
        pairs = []
        for entry in history:
            if entry["role"] == "user":
                pairs.append([entry["content"], None])
            elif pairs and pairs[-1][1] is None:
                pairs[-1][1] = entry["content"]

        response, tts_path, rvc_path = respond(
            message, pairs, system_message, max_tokens, temperature, top_p
        )

        # Append the new turn in messages format so the Chatbot displays it.
        updated_history = history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": response},
        ]
        return updated_history, tts_path, rvc_path

    # Wire the live components in as inputs: reading `component.value` inside the
    # callback (as before) only ever sees the *initial* value, so user changes to
    # the sliders and system message were silently ignored.
    msg.submit(
        chat_fn,
        inputs=[msg, chatbot, system_msg, max_tokens, temperature, top_p],
        outputs=[chatbot, tts_audio, rvc_audio],
    )

demo.launch()