import gradio as gr
import time
import numpy as np
import os
import requests
import io
from pydub import AudioSegment


def translate_audio(audio, SARVAM_API_KEY):
    """Send a pydub AudioSegment to Sarvam AI's speech-to-text-translate API.

    Args:
        audio: AudioSegment holding the audio chunk to transcribe/translate.
        SARVAM_API_KEY: Sarvam AI subscription key.

    Returns:
        The English transcript string ("" if the response carries none).

    Raises:
        ValueError: if the API key is rejected (HTTP 401/403).
        RuntimeError: for any other non-success HTTP status.
        requests.RequestException: on network-level failures (propagated).
    """
    # API endpoint for speech-to-text translation
    api_url = "https://api.sarvam.ai/speech-to-text-translate"

    # Headers containing the API subscription key
    headers = {
        "api-subscription-key": SARVAM_API_KEY
    }

    # Data payload for the translation request
    model_data = {
        "model": "saaras:v2",       # Specify the model to be used
        "with_diarization": False   # Set to True for speaker diarization
    }

    # Serialize the segment to an in-memory WAV file for upload.
    chunk_buffer = io.BytesIO()
    audio.export(chunk_buffer, format="wav")
    chunk_buffer.seek(0)  # Reset the pointer to the start of the stream

    # Prepare the file for the API request
    files = {'file': ('audiofile.wav', chunk_buffer, 'audio/wav')}

    # try/finally (rather than except-and-reraise) lets errors propagate
    # untouched while still guaranteeing the buffer is closed.
    try:
        response = requests.post(api_url, headers=headers, files=files, data=model_data)
        if response.status_code in (200, 201):
            # Return directly on success; avoids a possibly-unbound local.
            return response.json().get("transcript", "")
        if response.status_code in (401, 403):
            raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
        raise RuntimeError(
            f"❌ Request failed with status code: {response.status_code}. Details: {response.text}"
        )
    finally:
        chunk_buffer.close()


def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
    """Gradio streaming callback: transcribe one audio chunk and append it.

    Args:
        history: Accumulated transcript so far (None on first call).
        new_chunk: Tuple ``(sample_rate, samples)`` from ``gr.Audio``.
        SARVAM_API_KEY: Sarvam AI subscription key.

    Returns:
        ``(new_history, display_text, latency_label)`` — on error, history is
        preserved and the error message is shown instead of a transcript.
    """
    start_time = time.time()
    if history is None:
        history = ""

    try:
        sr, y = new_chunk

        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Gradio may deliver float audio in [-1, 1]; a bare int16 cast would
        # silently truncate such samples to 0. Scale floats first; integer
        # input passes through unchanged.
        if np.issubdtype(y.dtype, np.floating):
            y = y * 32767.0

        # Convert to int16 for AudioSegment
        y_int16 = y.astype(np.int16)

        # Create AudioSegment from raw PCM data
        audio_segment = AudioSegment(
            data=y_int16.tobytes(),
            sample_width=2,   # 2 bytes per sample == int16
            frame_rate=sr,
            channels=1
        )

        transcription = translate_audio(audio_segment, SARVAM_API_KEY)

        latency = time.time() - start_time
        history = history + '\n' + transcription
        return history, history, f"{latency:.2f}"

    except ValueError as ve:
        # Bad API key: keep history, surface the message.
        return history, str(ve), "Invalid Key"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e), "Error"


def clear():
    """Reset the transcription display textbox."""
    return ""


def clear_state():
    """Reset the accumulated-transcript session state."""
    return None


def clear_api_key():
    """Reset the API-key textbox."""
    return ""


with gr.Blocks(theme=gr.themes.Glass()) as microphone:
    with gr.Column():
        gr.Markdown(
            """
### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.

### 🔑 Sarvam AI API Key Required
To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).

👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
👉 **Step 2:** Sign up or log in
👉 **Step 3:** Generate your API key and paste it below

Your key stays on your device and is not stored.
"""
        )
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")

        state = gr.State(value="")

        def wrapped_stream_transcribe(history, new_chunk, api_key):
            # Thin adapter so the Gradio event wiring reads cleanly.
            return stream_transcribe(history, new_chunk, api_key)

        input_audio_microphone.stream(
            wrapped_stream_transcribe,
            [state, input_audio_microphone, api_key_box],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

demo = microphone

# Guard the launch so importing this module (e.g. for testing) does not
# start a server as a side effect.
if __name__ == "__main__":
    demo.launch()