# NOTE: removed non-Python scraping artifacts that preceded this line
# (file-size banner, git blame hashes, and a line-number gutter).
import gradio as gr
import torch
from transformers import pipeline
# Load the models once at start-up (first run downloads the weights).
# Prefer GPU when available; the transformers pipeline accepts a device string.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Whisper large-v3-turbo pipeline for automatic speech recognition.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3-turbo", device=device)
# Text-to-speech pipeline is currently disabled (see step 3 in the chatbot function).
#text_to_speech = pipeline("text-to-speech", model="facebook/seamless-streaming", device=device)
def audio_to_audio_chatbot(audio):
    """Transcribe a recorded audio file and return an echo-style text reply.

    Parameters
    ----------
    audio : str | None
        Filesystem path to the recording (Gradio ``Microphone(type="filepath")``),
        or ``None`` when nothing has been recorded yet.

    Returns
    -------
    str
        The bot's text response, shown in the single output textbox.
    """
    if audio is None:
        # The active interface has exactly ONE Textbox output, so return a
        # single string (the original returned a 2-tuple here, which the
        # single-output UI cannot display correctly).
        return "Bitte eine Audio-Datei hochladen."

    # 1. Speech-to-Text
    text = speech_to_text(audio)["text"]
    print(f"User: {text}")

    # 2. Text-to-Text (simple echo bot; can be replaced by a real model).
    # The original returned early before this point, leaving the response
    # construction unreachable.
    response_text = f"Du hast gesagt: {text}"
    print(f"Bot: {response_text}")

    # 3. Text-to-Speech (disabled; re-enable once a TTS pipeline is loaded)
    #speech = text_to_speech(response_text)
    #return speech["audio"], response_text
    return response_text
# Disabled alternative UI: a gr.Interface with live streaming. Kept for
# reference; the active UI is the gr.Blocks layout below. The triple-quoted
# string is a no-op statement at module level (it is never executed as code).
#if __name__ == "__main__":
"""
iface = gr.Interface(
fn=audio_to_audio_chatbot,
inputs=gr.Audio(type="filepath"),
outputs= gr.Textbox(),
#outputs=[gr.Audio(), gr.Textbox()],
title="Audio-zu-Audio-Chatbot (Streaming)",
description="Spreche in das Mikrofon und der Bot antwortet mit einer Audio-Ausgabe.",
live=True # Aktiviert Streaming
)
"""
# Build and launch the Gradio UI: the microphone feeds the chatbot, and the
# transcribed echo reply appears in the textbox above it. Triggering on
# `change` means every new recording is processed automatically.
with gr.Blocks() as speech:
    with gr.Row():
        sr_outputs = gr.Textbox(label="Antwort")
    with gr.Row():
        sr_inputs = gr.Microphone(type="filepath")

    sr_inputs.change(
        audio_to_audio_chatbot,
        inputs=sr_inputs,
        outputs=sr_outputs,
    )

speech.launch()