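"""Voice-enabled chat demo: Mistral-7B-Instruct (OpenVINO, int4) with streaming
responses and Whisper-tiny (OpenVINO) microphone transcription, served via Gradio."""
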
from huggingface_hub import snapshot_download

# Download models from Hugging Face to local folders
snapshot_download(
    repo_id="OpenVINO/Mistral-7B-Instruct-v0.2-int4-ov",
    local_dir="mistral-ov"
)
snapshot_download(
    repo_id="OpenVINO/whisper-tiny-fp16-ov",
    local_dir="whisper-ov-model"
)
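
# snapshot_download skips files already present in local_dir, so re-running
# the script reuses previously downloaded weights instead of re-fetching them.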

import gradio as gr
import openvino_genai
import librosa
import numpy as np
from threading import Thread, Lock, Event
from scipy.ndimage import uniform_filter1d
from queue import Queue, Empty

# Initialize Mistral pipeline
mistral_pipe = openvino_genai.LLMPipeline("mistral-ov", device="CPU")
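
# Greedy decoding: with do_sample=False, the temperature/top_p/top_k values are
# inert and are kept only to make switching to sampling easier later.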
config = openvino_genai.GenerationConfig(
    max_new_tokens=100,
    num_beams=1,
    do_sample=False,
    temperature=0.0,
    top_p=1.0,
    top_k=50
)
pipe_lock = Lock()  # serialize generate() calls; the shared pipeline handles one request at a time

# Initialize Whisper pipeline 
whisper_pipe = openvino_genai.WhisperPipeline("whisper-ov-model", device="CPU")
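# WhisperPipeline consumes raw float32 PCM at 16 kHz; transcribe() resamples as needed.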

def process_audio(data, sr):
    """Audio processing with silence trimming"""
    data = librosa.to_mono(data.T) if data.ndim > 1 else data
    data = data.astype(np.float32)
    peak = np.max(np.abs(data))
    if peak == 0:  # completely silent clip; nothing to normalize or transcribe
        return None
    data /= peak  # peak-normalize so the RMS threshold below is level-independent
    
    # Voice activity detection
    frame_length, hop_length = 2048, 512
    rms = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)[0]
    smoothed_rms = uniform_filter1d(rms, size=5)
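    # 0.025 is an empirical energy threshold on the peak-normalized signal;
    # raise it if a noisy microphone produces false detections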
    speech_frames = np.where(smoothed_rms > 0.025)[0]
    
    if not speech_frames.size:
        return None
    
    # Keep ~100 ms (0.1 * sr samples) of padding on each side of the detected speech
    start = max(0, int(speech_frames[0] * hop_length - 0.1 * sr))
    end = min(len(data), int((speech_frames[-1] + 1) * hop_length + 0.1 * sr))
    return data[start:end]

def transcribe(audio):
    """Audio to text transcription"""
    if audio is None:  # nothing was recorded
        return ""
    sr, data = audio
    processed = process_audio(data, sr)
    if processed is None or len(processed) < 0.1 * sr:  # skip clips under ~100 ms
        return ""

    # Resample to the 16 kHz rate Whisper expects
    if sr != 16000:
        processed = librosa.resample(processed, orig_sr=sr, target_sr=16000)

    return str(whisper_pipe.generate(processed))

def stream_generator(message, history):
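    """Producer/consumer streaming: generate() runs in a daemon thread and this
    generator drains the token queue so Gradio can update the chat incrementally."""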
    response_queue = Queue()
    completion_event = Event()
    error_message = [None]

    def callback(token):
        response_queue.put(token)
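        # RUNNING keeps generation going; returning STOP would cancel it mid-stream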
        return openvino_genai.StreamingStatus.RUNNING

    def generate():
        try:
            with pipe_lock:
                mistral_pipe.generate(message, config, callback)
        except Exception as e:
            error_message[0] = str(e)
        finally:
            completion_event.set()

    Thread(target=generate, daemon=True).start()

    accumulated = []
    while not completion_event.is_set() or not response_queue.empty():
        if error_message[0]:
            yield f"Error: {error_message[0]}"
            return

        try:
            # Block briefly instead of spinning; avoids pegging a CPU core while waiting
            token = response_queue.get(timeout=0.05)
            accumulated.append(token)
            yield "".join(accumulated)
        except Empty:
            continue

    yield "".join(accumulated)

with gr.Blocks() as demo:
    chat_interface = gr.ChatInterface(
        stream_generator,
        textbox=gr.Textbox(placeholder="Ask Mistral...", container=False),
        title="EDU CHAT BY PHANINDRA REDDY K",
        examples=[
            "Explain quantum physics simply",
            "Write a haiku about technology",
            "What's the meaning of life?"
        ],
        cache_examples=False,
    )
    
    with gr.Row():
        audio = gr.Audio(sources=["microphone"], type="numpy", label="Voice Input")
        transcribe_btn = gr.Button("Send Transcription")
    
    transcribe_btn.click(
        transcribe,
        inputs=audio,
        outputs=chat_interface.textbox
    )

if __name__ == "__main__":
    demo.launch(share=True, debug=True)