shuka_demo / app.py
archit11's picture
Update app.py
05dddc6 verified
raw
history blame
2.05 kB
import transformers
import gradio as gr
import torch
import numpy as np
from typing import Dict, List, Tuple
import spaces
import librosa
import soundfile as sf
# Model identifier passed as ``model=`` to ``transformers.pipeline`` in load_pipeline().
MODEL_NAME = 'sarvamai/shuka_v1'
# Sample rate (Hz) that incoming audio is resampled to and that is reported
# to the pipeline as ``sampling_rate``.
SAMPLE_RATE = 16000
# Passed to the pipeline call as ``max_new_tokens``.
MAX_NEW_TOKENS = 256
def load_pipeline():
    """Build and return the ``transformers`` pipeline for ``MODEL_NAME``.

    The pipeline is created with ``trust_remote_code=True``, placed on
    device ``0`` and uses ``torch.bfloat16`` as its dtype.
    """
    pipeline_options = {
        "model": MODEL_NAME,
        "trust_remote_code": True,
        "device": 0,
        "torch_dtype": torch.bfloat16,
    }
    return transformers.pipeline(**pipeline_options)
# Build the pipeline once at import time; transcribe_and_respond() reuses
# this module-level instance for every request.
pipe = load_pipeline()
def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
    """Return a fresh two-turn conversation: a system turn, then a user turn.

    Args:
        prompt: Text placed verbatim in the user turn's ``content``.

    Returns:
        A new list ``[system_turn, user_turn]`` of ``role``/``content`` dicts.
    """
    system_turn = {
        'role': 'system',
        'content': 'Respond naturally and informatively.',
    }
    user_turn = {'role': 'user', 'content': prompt}
    return [system_turn, user_turn]
def _prepare_audio(sample_rate: int, audio: np.ndarray) -> np.ndarray:
    """Return *audio* as mono float32 samples at ``SAMPLE_RATE``."""
    samples = audio.astype(np.float32, copy=False)
    if np.issubdtype(audio.dtype, np.signedinteger):
        # Signed integer PCM (e.g. int16) covers the full integer range;
        # scale it into [-1.0, 1.0) instead of keeping raw integer magnitudes.
        full_scale = np.float32(-np.iinfo(audio.dtype).min)
        samples = samples / full_scale
    if samples.ndim > 1:
        # Multi-channel data is laid out as (samples, channels): down-mix so
        # that resampling runs along the time axis of a 1-D signal.
        samples = samples.mean(axis=1)
    if sample_rate != SAMPLE_RATE:
        samples = librosa.resample(
            samples, orig_sr=sample_rate, target_sr=SAMPLE_RATE
        )
    return samples


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    """Run the speech pipeline on one recording and return its response.

    Args:
        audio_input: ``(sample_rate, samples)`` as delivered by the audio
            component. ``None`` or an empty recording is tolerated and
            yields a short notice instead of an error.

    Returns:
        The pipeline output, unchanged, or a human-readable message when no
        audio was supplied or processing failed.
    """
    no_audio_message = "No audio received. Please record something and try again."
    if audio_input is None:
        return no_audio_message

    # This is the UI boundary: any failure is reported as text to the user
    # rather than crashing the interface callback.
    try:
        sample_rate, audio = audio_input
        audio = np.asarray(audio)
        if audio.size == 0:
            return no_audio_message

        samples = _prepare_audio(sample_rate, audio)

        # The user turn carries the audio placeholder so the model knows
        # where the recording belongs in the conversation.
        turns = create_conversation_turns("<|audio|>")
        inputs = {
            'audio': samples,
            'turns': turns,
            'sampling_rate': SAMPLE_RATE
        }
        # The samples are handed over in memory; no intermediate file is
        # written, so concurrent requests cannot clobber each other's audio.
        return pipe(inputs, max_new_tokens=MAX_NEW_TOKENS)
    except Exception as e:
        return f"Error processing audio: {str(e)}"
# Microphone-only audio input; ``type="numpy"`` makes the callback receive
# the recording as a ``(sample_rate, samples)`` pair.
_microphone_input = gr.Audio(sources="microphone", type="numpy")

# ``live=True`` re-runs the callback whenever the input changes.
iface = gr.Interface(
    title="Live Voice Input for Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    fn=transcribe_and_respond,
    inputs=_microphone_input,
    outputs="text",
    live=True,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()