# app.py — Gradio demo that streams microphone audio to the Sarvam AI
# speech-to-text-translate API ("saaras:v2" model).
# (Hugging Face file-viewer scrape residue removed from the top of the file.)
import gradio as gr
import time
import numpy as np
import os
import requests
import io
from pydub import AudioSegment
def translate_audio(audio, SARVAM_API_KEY):
    """Send one audio chunk to the Sarvam AI speech-to-text-translate API.

    Parameters:
        audio: pydub.AudioSegment to transcribe/translate (exported as WAV).
        SARVAM_API_KEY: Sarvam AI subscription key used for authentication.

    Returns:
        The transcript string from the API, or "" when the request fails.
    """
    # API endpoint for speech-to-text translation
    api_url = "https://api.sarvam.ai/speech-to-text-translate"
    # Headers containing the API subscription key
    headers = {
        "api-subscription-key": SARVAM_API_KEY
    }
    # Data payload for the translation request
    model_data = {
        "model": "saaras:v2",       # model to be used
        "with_diarization": False   # set to True for speaker diarization
    }
    # BUGFIX: default to "" so the error paths below don't raise NameError
    # at `return transcript` (it was only assigned on the 200/201 branch).
    transcript = ""
    # Serialize the chunk to an in-memory WAV file for upload.
    chunk_buffer = io.BytesIO()
    audio.export(chunk_buffer, format="wav")
    chunk_buffer.seek(0)  # Reset the pointer to the start of the stream
    # Prepare the file for the API request
    files = {'file': ('audiofile.wav', chunk_buffer, 'audio/wav')}
    try:
        # Make the POST request to the API
        response = requests.post(api_url, headers=headers, files=files, data=model_data)
        if response.status_code in (200, 201):
            response_data = response.json()
            transcript = response_data.get("transcript", "")
        else:
            # Handle failed requests: log and fall through with "".
            print(f"failed with status code: {response.status_code}")
            print("Response:", response.text)
    except Exception as e:
        # Network / JSON errors during the request: log and fall through.
        print(f"Error processing chunk {e}")
    finally:
        # Ensure the in-memory buffer is released after processing.
        chunk_buffer.close()
    return transcript
def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
    """Gradio streaming callback: transcribe one mic chunk and append it.

    Parameters:
        history: accumulated transcript so far (None/"" on the first chunk).
        new_chunk: (sample_rate, samples) tuple from a streaming gr.Audio.
        SARVAM_API_KEY: Sarvam AI subscription key, forwarded to the API call.

    Returns:
        (new_history, display_text, latency_string); on error the history is
        left unchanged and the error text is shown instead.
    """
    start_time = time.time()
    if history is None:
        history = ""
    try:
        sr, y = new_chunk
        # Downmix stereo to mono by averaging the channels.
        if y.ndim > 1:
            y = y.mean(axis=1)
        # AudioSegment below is built as 16-bit PCM.
        # NOTE(review): this assumes gradio streams int16 samples here; if the
        # samples were floats in [-1, 1] this cast would zero the signal —
        # confirm the mic component's dtype.
        y_int16 = y.astype(np.int16)
        # Create AudioSegment from raw PCM data
        audio_segment = AudioSegment(
            data=y_int16.tobytes(),
            sample_width=2,   # 2 bytes == 16-bit samples
            frame_rate=sr,
            channels=1
        )
        transcription = translate_audio(audio_segment, SARVAM_API_KEY)
        latency = time.time() - start_time
        # BUGFIX: don't prepend '\n' to the very first chunk — the original
        # always did `history + '\n' + ...`, leaving a leading blank line.
        history = transcription if not history else history + '\n' + transcription
        return history, history, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e), "Error"
def clear():
    """Return the empty string used to blank the transcription textbox."""
    return str()
def clear_state():
    """Reset the accumulated-transcript state component (None clears it)."""
    return None
def clear_api_key():
    """Blank out the API-key textbox."""
    empty = ""
    return empty
# Load the custom stylesheet for the UI. A missing gradio.css previously
# crashed the whole app at import time; fall back to no custom styling.
try:
    with open("gradio.css", "r") as f:
        custom_css = f.read()
except FileNotFoundError:
    custom_css = ""
# ---------------------------------------------------------------------------
# UI definition.
# BUGFIX: pass `css=custom_css` — the stylesheet was read at the top of the
# file but never handed to gr.Blocks, so it was never applied.
# (The Markdown emoji below were also repaired from mojibake.)
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Glass(), css=custom_css) as microphone:
    with gr.Column():
        gr.Markdown(
            """
### 🔑 Sarvam AI API Key Required
To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
👉 **Step 2:** Sign up or log in
👉 **Step 3:** Generate your API key and paste it below
Your key stays on your device and is not stored.
"""
        )
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")
        # Accumulated transcript carried across streamed chunks.
        state = gr.State(value="")

        # Stream mic audio to the transcriber every 5 s, 30 s per session.
        # (The original wrapped stream_transcribe in a pass-through function
        # that only forwarded its arguments; call it directly.)
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone, api_key_box],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

demo = microphone
demo.launch()