import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np

# Initialize the conversation history globally
conversation_history = []

# Cache the pipeline at module level so the model is loaded once,
# not rebuilt (download + GPU load) on every request.
_pipe = None


def _get_pipe():
    """Lazily construct and cache the Shuka chat pipeline."""
    global _pipe
    if _pipe is None:
        _pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16,
        )
    return _pipe


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_file, chat_history):
    """Run the user's spoken audio through the model and update the chat.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the recording (Gradio ``type="filepath"``).
    chat_history : list[dict]
        Prior turns as ``{'role': ..., 'content': ...}`` dicts
        (the hidden ``gr.State`` value).

    Returns
    -------
    tuple
        ``(display_pairs, updated_turns)`` where ``display_pairs`` is the
        ``[user_message, bot_message]`` pair list that ``gr.Chatbot``
        expects, and ``updated_turns`` is the new role/content history.
    """
    try:
        pipe = _get_pipe()

        # Load and resample the audio to the 16 kHz the model expects.
        audio, sr = librosa.load(audio_file, sr=16000)

        # Debug: Print audio properties for debugging
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        # Take the existing chat history and append the new user turn;
        # '<|audio|>' is the placeholder token the model replaces with
        # the supplied waveform.
        turns = chat_history.copy()
        turns.append({'role': 'user', 'content': '<|audio|>'})

        # Debug: Print the updated turns for debugging purposes
        print(f"Updated turns: {turns}")

        # Call the model with the updated conversation turns and audio
        output = pipe(
            {'audio': audio, 'turns': turns, 'sampling_rate': sr},
            max_new_tokens=512,
        )

        # Append the model's response to the conversation history
        turns.append({'role': 'system', 'content': output})

        # Debug: Print the model's response
        print(f"Model output: {output}")

        # gr.Chatbot renders a list of [user_message, bot_message] pairs,
        # so pair each spoken user turn with the model reply that follows it.
        chat_history_for_display = []
        pending_user = None
        for turn in turns:
            if turn['role'] == 'user':
                pending_user = "🗣️ (Spoken Audio)"
            else:
                chat_history_for_display.append((pending_user, turn['content']))
                pending_user = None
        if pending_user is not None:
            # A user turn without a reply yet (shouldn't normally happen).
            chat_history_for_display.append((pending_user, None))

        # Return the formatted chat history for display and the updated history
        return chat_history_for_display, turns
    except Exception as e:
        # Surface the error inside the chat widget as a valid pair list —
        # a bare string is not a renderable gr.Chatbot value — and make
        # sure the history state is returned even on error.
        return [(None, f"Error: {str(e)}")], chat_history


# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=[
        gr.Audio(sources="microphone", type="filepath", label="Your Audio (Microphone)"),
        gr.State([]),  # Hidden state to maintain conversation history
    ],
    outputs=[
        gr.Chatbot(label="Conversation History"),  # Display the conversation
        gr.State([]),  # Hidden state to keep track of the updated conversation history
    ],
    title="Shuka demo",
    description="shuka live demo",
    live=True,  # Enable live mode for real-time interaction
    allow_flagging="auto",
    # enable_queue=True
)

if __name__ == "__main__":
    iface.launch()