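"""Voice-enabled chat demo for Hugging Face Spaces.

Combines an int4 OpenVINO build of Mistral-7B-Instruct for streaming chat
with an fp16 OpenVINO build of Whisper-tiny for microphone transcription,
served through a Gradio interface and running entirely on CPU.
"""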
from huggingface_hub import snapshot_download

# Download the OpenVINO-optimized models from Hugging Face to local folders
snapshot_download(
    repo_id="OpenVINO/Mistral-7B-Instruct-v0.2-int4-ov",
    local_dir="mistral-ov",
)
snapshot_download(
    repo_id="OpenVINO/whisper-tiny-fp16-ov",
    local_dir="whisper-ov-model",
)
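# Note: snapshot_download() skips files that are already present and
# up to date, so relaunching the app does not re-fetch the weights.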
import gradio as gr
import openvino_genai
import librosa
import numpy as np
from threading import Thread, Lock, Event
from scipy.ndimage import uniform_filter1d
from queue import Queue, Empty
# Initialize the Mistral text-generation pipeline on CPU
mistral_pipe = openvino_genai.LLMPipeline("mistral-ov", device="CPU")
config = openvino_genai.GenerationConfig(
    max_new_tokens=100,
    num_beams=1,
    do_sample=False,  # greedy decoding; temperature/top_p/top_k are inert here
    temperature=0.0,
    top_p=1.0,
    top_k=50,
)
# Serialize access to the pipeline: generations must not run concurrently
pipe_lock = Lock()
# Initialize Whisper pipeline
whisper_pipe = openvino_genai.WhisperPipeline("whisper-ov-model", device="CPU")
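# whisper_pipe.generate() takes raw mono float32 samples at 16 kHz, so
# transcribe() below resamples microphone audio before passing it in.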
def process_audio(data, sr):
    """Convert to mono float32, normalize, and trim leading/trailing silence."""
    data = librosa.to_mono(data.T) if data.ndim > 1 else data
    data = data.astype(np.float32)
    peak = np.max(np.abs(data))
    if peak == 0:
        return None  # all-zero buffer: nothing to transcribe
    data /= peak
    # Simple energy-based voice activity detection on smoothed RMS
    frame_length, hop_length = 2048, 512
    rms = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)[0]
    smoothed_rms = uniform_filter1d(rms, size=5)
    speech_frames = np.where(smoothed_rms > 0.025)[0]
    if not speech_frames.size:
        return None
    # Keep ~0.1 s of padding on each side of the detected speech
    start = max(0, int(speech_frames[0] * hop_length - 0.1 * sr))
    end = min(len(data), int((speech_frames[-1] + 1) * hop_length + 0.1 * sr))
    return data[start:end]
def transcribe(audio):
    """Transcribe a Gradio (sample_rate, ndarray) tuple to text."""
    sr, data = audio
    processed = process_audio(data, sr)
    if processed is None:
        return ""
    # Whisper expects 16 kHz mono input
    if sr != 16000:
        processed = librosa.resample(processed, orig_sr=sr, target_sr=16000)
    if len(processed) < 1600:  # skip clips shorter than ~0.1 s at 16 kHz
        return ""
    # WhisperDecodedResults stringifies to the transcript text
    return str(whisper_pipe.generate(processed))
def stream_generator(message, history):
    """Stream Mistral tokens to the chat UI as they are generated."""
    response_queue = Queue()
    completion_event = Event()
    error_message = [None]

    def callback(token):
        # Called by the pipeline for each new token; hand it to the UI thread
        response_queue.put(token)
        return openvino_genai.StreamingStatus.RUNNING

    def generate():
        try:
            with pipe_lock:
                mistral_pipe.generate(message, config, callback)
        except Exception as e:
            error_message[0] = str(e)
        finally:
            completion_event.set()

    Thread(target=generate, daemon=True).start()

    accumulated = []
    while not completion_event.is_set() or not response_queue.empty():
        if error_message[0]:
            yield f"Error: {error_message[0]}"
            return
        try:
            # Block briefly instead of get_nowait() to avoid a busy-wait spin
            token = response_queue.get(timeout=0.05)
            accumulated.append(token)
            yield "".join(accumulated)
        except Empty:
            continue
    yield "".join(accumulated)
with gr.Blocks() as demo:
    chat_interface = gr.ChatInterface(
        stream_generator,
        textbox=gr.Textbox(placeholder="Ask Mistral...", container=False),
        title="EDU CHAT BY PHANINDRA REDDY K",
        examples=[
            "Explain quantum physics simply",
            "Write a haiku about technology",
            "What's the meaning of life?",
        ],
        cache_examples=False,
    )
    with gr.Row():
        audio = gr.Audio(sources=["microphone"], type="numpy", label="Voice Input")
        transcribe_btn = gr.Button("Send Transcription")
    # Route the transcription into the chat textbox so the user can send it
    transcribe_btn.click(
        transcribe,
        inputs=audio,
        outputs=chat_interface.textbox,
    )
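# share=True exposes a temporary public gradio.live URL; debug=True keeps
# the process attached so errors surface in the console.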
if __name__ == "__main__":
    demo.launch(share=True, debug=True)