Spaces:

archit11
/

shuka_demo

Running on Zero

shuka_demo / app.py

Update app.py

70351e3 verified 10 months ago

1.57 kB

	from typing import Dict, List, Optional, Tuple, Union

	import gradio as gr
	import numpy as np
	import spaces
	import torch
	import transformers

	# Model identifier and generation settings shared by the rest of the app.
	MODEL_NAME = 'sarvamai/shuka_v1'
	SAMPLE_RATE = 16000
	MAX_NEW_TOKENS = 256

	# Build the inference pipeline once at import time so every request reuses it.
	# NOTE: trust_remote_code=True lets transformers run code shipped in the
	# model repository.
	pipe = transformers.pipeline(
	    model=MODEL_NAME,
	    torch_dtype='bfloat16',
	    device=0,
	    trust_remote_code=True,
	)

	def create_conversation_turns(
	    prompt: str,
	    system_prompt: str = 'Respond naturally and informatively.',
	) -> List[Dict[str, str]]:
	    """Build the two-message chat history that is sent to the model.

	    Args:
	        prompt: Content of the user turn.
	        system_prompt: Content of the system turn. Defaults to the
	            instruction this demo has always used, so existing callers that
	            pass only ``prompt`` get the same result as before.

	    Returns:
	        A list holding the system message followed by the user message, each
	        a dict with ``'role'`` and ``'content'`` keys.
	    """
	    return [
	        {'role': 'system', 'content': system_prompt},
	        {'role': 'user', 'content': prompt},
	    ]

	def _prepare_audio(
	    audio: Union[np.ndarray, Tuple[int, np.ndarray], None],
	    target_rate: int,
	) -> Optional[np.ndarray]:
	    """Turn a Gradio audio value into mono float32 samples at ``target_rate``.

	    Args:
	        audio: Either ``None``, a ``(sample_rate, samples)`` pair (the value
	            ``gr.Audio(type="numpy")`` delivers), or a bare sample array.
	            A bare array is assumed to already be at ``target_rate``.
	        target_rate: Sample rate, in Hz, of the returned samples.

	    Returns:
	        A 1-D float32 array, or ``None`` when there is nothing to process
	        (no recording, or an empty one).

	    Raises:
	        ValueError: If the reported source sample rate is not positive.
	    """
	    if audio is None:
	        return None

	    if isinstance(audio, (tuple, list)):
	        source_rate, samples = audio
	    else:
	        source_rate, samples = target_rate, audio

	    samples = np.asarray(samples)
	    if samples.size == 0:
	        return None
	    if source_rate <= 0:
	        raise ValueError(f"Invalid sample rate: {source_rate}")

	    if np.issubdtype(samples.dtype, np.signedinteger):
	        # Integer PCM (e.g. int16 from the browser) must be scaled into
	        # [-1, 1); a plain cast would leave values in the tens of thousands.
	        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
	        samples = samples.astype(np.float32) / np.float32(full_scale)
	    else:
	        samples = samples.astype(np.float32)

	    if samples.ndim > 1:
	        # Multi-channel recordings arrive as (samples, channels); downmix.
	        samples = samples.mean(axis=1)

	    if source_rate != target_rate:
	        # Simple linear-interpolation resampling: dependency-free, though a
	        # dedicated resampler would give better quality when downsampling.
	        n_in = samples.shape[0]
	        n_out = max(1, int(round(n_in * target_rate / float(source_rate))))
	        positions = np.arange(n_out) * (float(source_rate) / target_rate)
	        samples = np.interp(positions, np.arange(n_in), samples)

	    return samples.astype(np.float32)


	@spaces.GPU(duration=120)
	def transcribe_and_respond(
	    audio: Union[np.ndarray, Tuple[int, np.ndarray], None],
	) -> str:
	    """Run the speech model on a recording and return its reply.

	    Args:
	        audio: The value produced by the microphone input. With
	            ``gr.Audio(type="numpy")`` this is a ``(sample_rate, samples)``
	            tuple, or ``None`` when nothing has been recorded. A bare sample
	            array is still accepted for backward compatibility.

	    Returns:
	        The pipeline's output for the recording, a short notice when no
	        audio was supplied, or an ``"Error processing audio: ..."`` message
	        if anything raised while preparing or running the request.
	    """
	    try:
	        samples = _prepare_audio(audio, SAMPLE_RATE)
	        if samples is None:
	            return "No audio received. Please record something and try again."

	        # "<|audio|>" marks where the audio is inserted in the user turn.
	        inputs = {
	            'audio': samples,
	            'turns': create_conversation_turns("<|audio|>"),
	            'sampling_rate': SAMPLE_RATE,
	        }

	        return pipe(inputs, max_new_tokens=MAX_NEW_TOKENS)
	    except Exception as e:
	        # UI boundary: report the failure as text instead of crashing the app.
	        return f"Error processing audio: {str(e)}"

	# Assemble the Gradio UI: microphone audio in, text out, re-run on new input.
	iface = gr.Interface(
	    title="Live Voice Input for Transcription and Response",
	    description="Speak into your microphone, and the model will respond naturally and informatively.",
	    fn=transcribe_and_respond,
	    inputs=gr.Audio(type="numpy", sources="microphone"),
	    outputs="text",
	    live=True,
	)

	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()