# Hugging Face Space: live voice input for transcription and response (runs on ZeroGPU)
import transformers
import gradio as gr
import torch
import numpy as np
from typing import Dict, List, Tuple
import spaces
import librosa

MODEL_NAME = 'sarvamai/shuka_v1'
SAMPLE_RATE = 16000
MAX_NEW_TOKENS = 256

def load_pipeline():
    return transformers.pipeline(
        model=MODEL_NAME,
        trust_remote_code=True,
        device=0,
        torch_dtype=torch.bfloat16,
    )


pipe = load_pipeline()
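

# Optional offline sanity check: a minimal sketch that mirrors the example usage
# from the shuka_v1 model card, feeding the pipeline a local audio file instead
# of live microphone input. The helper name and the file path passed to it are
# illustrative, not part of the original app.
def respond_to_file(path: str) -> str:
    # librosa decodes the file and resamples it to the model's 16 kHz rate
    audio, _ = librosa.load(path, sr=SAMPLE_RATE)
    # '<|audio|>' is the placeholder the pipeline replaces with audio embeddings
    turns = [
        {'role': 'system', 'content': 'Respond naturally and informatively.'},
        {'role': 'user', 'content': '<|audio|>'},
    ]
    output = pipe(
        {'audio': audio, 'turns': turns, 'sampling_rate': SAMPLE_RATE},
        max_new_tokens=MAX_NEW_TOKENS,
    )
    return str(output)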


def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
    return [
        {'role': 'system', 'content': 'Respond naturally and informatively.'},
        {'role': 'user', 'content': prompt},
    ]
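# For reference, create_conversation_turns('<|audio|>') returns:
#   [{'role': 'system', 'content': 'Respond naturally and informatively.'},
#    {'role': 'user', 'content': '<|audio|>'}]
# where '<|audio|>' marks the spot the Shuka pipeline fills with audio
# embeddings, per the model card's example usage.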


@spaces.GPU  # on ZeroGPU hardware, a GPU is attached for the duration of each call
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    try:
        # Unpack the (sample_rate, samples) tuple from gr.Audio(type="numpy")
        sample_rate, audio = audio_input

        # Gradio delivers int16 PCM; scale to float32 in [-1.0, 1.0]
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32) / 32768.0

        # Down-mix stereo to mono: the model expects a 1-D waveform
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Resample to the 16 kHz rate the model was trained on
        if sample_rate != SAMPLE_RATE:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)

        # Prepare the inputs for the model; the '<|audio|>' placeholder marks
        # where the audio embeddings are spliced into the conversation
        turns = create_conversation_turns('<|audio|>')
        inputs = {
            'audio': audio,
            'turns': turns,
            'sampling_rate': SAMPLE_RATE,
        }

        response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS)
        return str(response)
    except Exception as e:
        return f"Error processing audio: {str(e)}"


iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Voice Input for Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True,
)

# Launch the app
if __name__ == "__main__":
    iface.launch()