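"""Gradio Space that records microphone audio and sends it to the
sarvamai/shuka_v1 audio language model, returning the model's text
response."""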
import transformers
import gradio as gr
import torch
import numpy as np
from typing import Dict, List, Tuple
import spaces
import librosa

MODEL_NAME = 'sarvamai/shuka_v1'  # Sarvam's audio language model
SAMPLE_RATE = 16000               # sampling rate the model expects
MAX_NEW_TOKENS = 256              # cap on generated tokens per response

# Build the pipeline once at startup; trust_remote_code is required because
# the model ships its own pipeline implementation.
def load_pipeline():
    return transformers.pipeline(
        model=MODEL_NAME,
        trust_remote_code=True,
        device=0,
        torch_dtype=torch.bfloat16
    )

pipe = load_pipeline()

def create_conversation_turns(prompt: str) -> List[Dict[str, str]]:
    # The user turn should carry the '<|audio|>' placeholder, which tells the
    # pipeline where to splice the audio into the conversation.
    return [
        {'role': 'system', 'content': 'Respond naturally and informatively.'},
        {'role': 'user', 'content': prompt}
    ]

# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for each call; duration=120
# caps the allocation at 120 seconds.
@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    try:
        if audio_input is None:
            return "No audio received."

        # Unpack the (sample_rate, samples) tuple Gradio provides
        sample_rate, audio = audio_input

        # Gradio delivers integer PCM (usually int16); normalize to float32
        # in [-1, 1] rather than casting the raw sample values
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Downmix stereo to mono
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Resample to the rate the model expects
        if sample_rate != SAMPLE_RATE:
            audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=SAMPLE_RATE)

        # Prepare the inputs for the model
        turns = create_conversation_turns('<|audio|>')
        inputs = {
            'audio': audio,
            'turns': turns,
            'sampling_rate': SAMPLE_RATE
        }

        response = pipe(inputs, max_new_tokens=MAX_NEW_TOKENS)

        return str(response)
    except Exception as e:
        return f"Error processing audio: {e}"

# live=True re-runs inference automatically whenever a new recording is captured
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs="text",
    title="Live Voice Input for Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
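
# A minimal sketch for exercising the handler locally, assuming a recording
# exists at the hypothetical path 'sample.wav' (the int16 cast mimics the
# numpy payload Gradio's microphone component produces):
#
#   audio, sr = librosa.load('sample.wav', sr=SAMPLE_RATE)
#   pcm = (audio * 32767).astype(np.int16)
#   print(transcribe_and_respond((SAMPLE_RATE, pcm)))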