import gradio as gr
import torch
from transformers import pipeline


# Load the models (once at startup)
device = "cuda" if torch.cuda.is_available() else "cpu"

speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo", device=device)
#text_to_speech = pipeline("text-to-speech", model="facebook/seamless-streaming", device=device)
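# Hedged sketch: if the text-to-speech step is re-enabled, a checkpoint known to
# work with the generic "text-to-speech" pipeline could be used instead. The model
# name below is an illustrative assumption, not the original author's choice:
#text_to_speech = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)
# Its output is a dict with an "audio" ndarray and a "sampling_rate" int, which
# maps onto Gradio's (sample_rate, ndarray) audio format.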

def audio_to_audio_chatbot(audio):
    if audio is None:
        return "Please upload an audio file."

    # 1. Speech-to-text
    text = speech_to_text(audio)["text"]
    print(f"User: {text}")

    # 2. Text-to-text (a simple echo bot for now; it can be replaced by a more capable model)
    response_text = f"You said: {text}"
    print(f"Bot: {response_text}")

    # 3. Text-to-speech (disabled until a TTS model and a second audio output are wired up)
    #speech = text_to_speech(response_text)
    #return speech["audio"], response_text

    return response_text
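# Hedged sketch: the echo bot in step 2 could be swapped for a text-generation
# pipeline. The model name and generation parameters are illustrative assumptions.
#text_generator = pipeline("text-generation", model="gpt2", device=device)
#
#def generate_response(prompt):
#    # generated_text holds the prompt followed by the model's continuation
#    return text_generator(prompt, max_new_tokens=50)[0]["generated_text"]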

# Alternative: a single gr.Interface instead of the gr.Blocks layout below
#if __name__ == "__main__":
#    iface = gr.Interface(
#        fn=audio_to_audio_chatbot,
#        inputs=gr.Audio(type="filepath"),
#        outputs=gr.Textbox(),
#        #outputs=[gr.Audio(), gr.Textbox()],
#        title="Audio-to-Audio Chatbot (Streaming)",
#        description="Speak into the microphone and the bot will answer with audio output.",
#        live=True  # enables streaming
#    )
with gr.Blocks() as speech:
    with gr.Row():
        sr_outputs = gr.Textbox(label="Response")
    with gr.Row():
        sr_inputs = gr.Microphone(type="filepath")
    sr_inputs.change(audio_to_audio_chatbot, inputs=sr_inputs, outputs=sr_outputs)
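
# Hedged sketch: once text-to-speech is enabled, the Blocks UI could return both
# audio and text. The component names here are illustrative assumptions:
#with gr.Blocks() as speech:
#    with gr.Row():
#        sr_audio_out = gr.Audio(label="Spoken response")
#        sr_outputs = gr.Textbox(label="Response")
#    with gr.Row():
#        sr_inputs = gr.Microphone(type="filepath")
#    sr_inputs.change(audio_to_audio_chatbot, inputs=sr_inputs,
#                     outputs=[sr_audio_out, sr_outputs])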

speech.launch()