Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,045 Bytes
2ed7223 c621812 b268601 2ed7223 ee83532 05dddc6 62dda31 c621812 b268601 c621812 2ed7223 b268601 ab07d9e b268601 c621812 ab07d9e b268601 05dddc6 c621812 05dddc6 c621812 b268601 c621812 ab07d9e 4269171 ab07d9e 2ed7223 70351e3 c621812 2ed7223 c621812 05dddc6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import transformers
import gradio as gr
import torch
import numpy as np
from typing import Dict, List, Tuple
import spaces
import librosa
import soundfile as sf
# Hugging Face model id handed to transformers.pipeline() in load_pipeline().
MODEL_NAME = 'sarvamai/shuka_v1'
# Sample rate that recorded audio is resampled to and that is reported to the
# pipeline as 'sampling_rate'.
SAMPLE_RATE = 16000
# Passed as max_new_tokens on every pipeline call.
MAX_NEW_TOKENS = 256
def load_pipeline():
    """Create and return the transformers pipeline for ``MODEL_NAME``.

    The pipeline is built with ``trust_remote_code=True``, placed on device
    index 0 and loaded with bfloat16 weights.
    """
    pipeline_kwargs = {
        "model": MODEL_NAME,
        "trust_remote_code": True,
        "device": 0,
        "torch_dtype": torch.bfloat16,
    }
    return transformers.pipeline(**pipeline_kwargs)
# Build the pipeline once at import time; transcribe_and_respond() reuses this
# module-level instance for every request.
pipe = load_pipeline()
def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
    """Return a two-turn conversation: a fixed system turn, then the user turn.

    Args:
        prompt: Text used verbatim as the content of the user turn.

    Returns:
        A list ``[system_turn, user_turn]`` of ``{'role', 'content'}`` dicts.
    """
    system_turn = {
        'role': 'system',
        'content': 'Respond naturally and informatively.',
    }
    user_turn = {'role': 'user', 'content': prompt}
    return [system_turn, user_turn]
def _prepare_audio(audio: np.ndarray, sample_rate: int) -> np.ndarray:
    """Return *audio* as a mono float32 waveform sampled at ``SAMPLE_RATE``."""
    if np.issubdtype(audio.dtype, np.signedinteger):
        # Integer PCM must be scaled to [-1, 1]; a bare astype() would keep the
        # raw integer magnitudes (e.g. +/-32768 for int16).
        full_scale = float(np.iinfo(audio.dtype).max) + 1.0
        audio = audio.astype(np.float32) / np.float32(full_scale)
    else:
        audio = audio.astype(np.float32, copy=False)

    if audio.ndim > 1:
        # Gradio documents multi-channel data as (samples, channels); down-mix
        # before resampling so librosa does not resample across the channel axis.
        audio = audio.mean(axis=1)

    if sample_rate != SAMPLE_RATE:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)
    return audio


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    """Run the speech model on a microphone recording and return its reply.

    Args:
        audio_input: ``(sample_rate, samples)`` as produced by
            ``gr.Audio(type="numpy")``. May be ``None`` when the component has
            no recording (for example after it is cleared).

    Returns:
        The pipeline's response, or a human-readable message when there is no
        audio or processing fails. Exceptions are never propagated to the UI.
    """
    no_audio_message = "No audio received. Please record something and try again."
    if audio_input is None:
        return no_audio_message

    try:
        sample_rate, audio = audio_input
        waveform = _prepare_audio(np.asarray(audio), sample_rate)
        if waveform.size == 0:
            return no_audio_message

        # The waveform is handed to the pipeline in memory. The previous
        # buf_to_float() call reinterpreted float32 bytes as int16 and corrupted
        # the signal, and the fixed-name temp WAV was never read back.
        inputs = {
            'audio': waveform,
            # The model card's usage example places the '<|audio|>' placeholder
            # in the user turn to mark where the audio is injected.
            'turns': create_conversation_turns("<|audio|>"),
            'sampling_rate': SAMPLE_RATE,
        }
        return pipe(inputs, max_new_tokens=MAX_NEW_TOKENS)
    except Exception as e:  # UI boundary: report the failure as text.
        return f"Error processing audio: {e}"
# Gradio UI: a microphone recording (delivered as a numpy tuple) goes to
# transcribe_and_respond and the result is shown as text. live=True re-runs the
# function whenever the input changes instead of waiting for a submit click.
iface = gr.Interface(
    title="Live Voice Input for Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources="microphone", type="numpy"),
    outputs="text",
    live=True,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|