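"""Gradio Space: record audio from the microphone and generate a natural,
informative reply with the sarvamai/shuka_v1 audio-language model."""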
from typing import Tuple

import gradio as gr
import librosa
import numpy as np
import spaces  # required for the @spaces.GPU decorator on ZeroGPU hardware
import torch
import transformers


@spaces.GPU(duration=120)
def transcribe_and_respond(audio_input: Tuple[int, np.ndarray]) -> str:
    try:
        # Live mode can invoke the callback before any audio is recorded
        if audio_input is None:
            return ""

        # Build the pipeline inside the GPU-decorated function so the model
        # loads on the GPU that ZeroGPU allocates for this call
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )

        # Gradio delivers numpy audio as a (sample_rate, data) tuple
        sr, audio = audio_input
        
        # Downmix stereo to mono if necessary
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # Ensure audio is float32 in [-1, 1]; Gradio records int16 PCM
        if audio.dtype == np.int16:
            audio = audio.astype(np.float32) / 32768.0
        elif audio.dtype != np.float32:
            audio = audio.astype(np.float32)
        
        # Resample if necessary
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        
        # Define conversation turns; the '<|audio|>' placeholder marks where
        # the audio clip is inserted into the user turn
        turns = [
            {'role': 'system', 'content': 'Respond naturally and informatively.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]

        # Run the pipeline with the audio and conversation turns
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': 16000}, max_new_tokens=512)

        # Return the model's response as text
        return str(output)

    except Exception as e:
        return f"Error processing audio: {str(e)}"

iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),  # Gradio 4.x uses `sources`
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will respond naturally and informatively.",
    live=True  # Re-run automatically whenever a new recording arrives
)

if __name__ == "__main__":
    iface.launch()