import gradio as gr
import torch
from transformers import pipeline

# Load the models once at startup.
device = "cuda" if torch.cuda.is_available() else "cpu"
speech_to_text = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    device=device,
)
# text_to_speech = pipeline("text-to-speech", model="facebook/seamless-streaming", device=device)


def audio_to_audio_chatbot(audio):
    if audio is None:
        return "Please upload an audio file."

    # 1. Speech-to-text: transcribe the recording.
    text = speech_to_text(audio)["text"]
    print(f"User: {text}")

    # 2. Text-to-text: a simple echo bot for now; it can be swapped
    #    for a more capable model (see the sketch at the end of the file).
    response_text = f"You said: {text}"
    print(f"Bot: {response_text}")

    # 3. Text-to-speech (disabled until a TTS model is wired up):
    # speech = text_to_speech(response_text)
    # return speech["audio"], response_text

    return response_text


# Alternative single-component UI, kept for reference:
# iface = gr.Interface(
#     fn=audio_to_audio_chatbot,
#     inputs=gr.Audio(type="filepath"),
#     outputs=gr.Textbox(),
#     # outputs=[gr.Audio(), gr.Textbox()],
#     title="Audio-to-Audio Chatbot (Streaming)",
#     description="Speak into the microphone and the bot replies with audio output.",
#     live=True,  # enables streaming
# )

with gr.Blocks() as speech:
    with gr.Row():
        sr_outputs = gr.Textbox(label="Response")
    with gr.Row():
        sr_inputs = gr.Microphone(type="filepath")
    sr_inputs.change(audio_to_audio_chatbot, inputs=sr_inputs, outputs=sr_outputs)

speech.launch()
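
# --- Sketch: replacing the echo bot in step 2 with a real language model. ---
# This is a minimal, hypothetical example, not part of the original script:
# the model name (Qwen/Qwen2.5-0.5B-Instruct) and the helper name
# generate_response are assumptions; any chat-capable checkpoint that the
# text-generation pipeline supports should work the same way.
from transformers import pipeline

chat = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct", device=device)


def generate_response(text):
    # Recent transformers versions accept chat-style role/content messages.
    messages = [{"role": "user", "content": text}]
    result = chat(messages, max_new_tokens=128)
    # The pipeline returns the full conversation; the last message is the reply.
    return result[0]["generated_text"][-1]["content"]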
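
# --- Sketch: enabling step 3 (text-to-speech). ---
# Also an assumption-laden example: suno/bark-small is a substitute for the
# commented-out seamless model, chosen because it is known to load via the
# text-to-speech pipeline. To use this, the Blocks UI would need an audio
# output as well, e.g. outputs=[gr.Audio(), gr.Textbox()].
from transformers import pipeline

text_to_speech = pipeline("text-to-speech", model="suno/bark-small", device=device)


def respond_with_audio(response_text):
    speech = text_to_speech(response_text)
    # gr.Audio accepts a (sampling_rate, waveform) tuple; squeeze in case the
    # model returns a (1, n_samples) array.
    return (speech["sampling_rate"], speech["audio"].squeeze()), response_text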