Commit · f4c79df
1 Parent(s): 0a7cc32
Add application file
app.py
ADDED
@@ -0,0 +1,122 @@
import gradio as gr
import openvino_genai
import librosa
import numpy as np
from threading import Thread, Lock, Event
from scipy.ndimage import uniform_filter1d
from queue import Queue, Empty


# Initialize Mistral pipeline
mistral_pipe = openvino_genai.LLMPipeline("mistral-ov", device="CPU")
config = openvino_genai.GenerationConfig(
    max_new_tokens=100,   # Increased for better context
    num_beams=1,          # Keep greedy search for speed
    do_sample=False,      # Disable sampling for faster generation
    temperature=0.0,      # Set to 0 since sampling is disabled
    top_p=1.0,            # Disable top-p filtering
    top_k=50
)
pipe_lock = Lock()

# Initialize Whisper pipeline
whisper_pipe = openvino_genai.WhisperPipeline("whisper-ov-model", device="CPU")

def process_audio(data, sr):
    """Audio processing with silence trimming"""
    data = librosa.to_mono(data.T) if data.ndim > 1 else data
    data = data.astype(np.float32)
    data /= np.max(np.abs(data))

    # Voice activity detection
    frame_length, hop_length = 2048, 512
    rms = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)[0]
    smoothed_rms = uniform_filter1d(rms, size=5)
    speech_frames = np.where(smoothed_rms > 0.025)[0]

    if not speech_frames.size:
        return None

    start = max(0, int(speech_frames[0] * hop_length - 0.1 * sr))
    end = min(len(data), int((speech_frames[-1] + 1) * hop_length + 0.1 * sr))
    return data[start:end]

def transcribe(audio):
    """Audio to text transcription"""
    sr, data = audio
    processed = process_audio(data, sr)
    if processed is None or len(processed) < 1600:
        return ""

    if sr != 16000:
        processed = librosa.resample(processed, orig_sr=sr, target_sr=16000)

    return whisper_pipe.generate(processed)

def stream_generator(message, history):
    """Original Mistral streaming function (unchanged)"""
    response_queue = Queue()
    completion_event = Event()
    error_message = [None]

    def callback(token):
        response_queue.put(token)
        return openvino_genai.StreamingStatus.RUNNING

    def generate():
        try:
            with pipe_lock:
                mistral_pipe.generate(message, config, callback)
        except Exception as e:
            error_message[0] = str(e)
        finally:
            completion_event.set()

    Thread(target=generate, daemon=True).start()

    accumulated = []
    while not completion_event.is_set() or not response_queue.empty():
        if error_message[0]:
            yield f"Error: {error_message[0]}"
            return

        try:
            token = response_queue.get_nowait()
            accumulated.append(token)
            yield "".join(accumulated)
        except Empty:
            continue

    yield "".join(accumulated)

# Create interface with added voice input
with gr.Blocks() as demo:
    # Original chat interface
    chat_interface = gr.ChatInterface(
        stream_generator,
        textbox=gr.Textbox(placeholder="Ask Mistral...", container=False),
        title="EDU CHAT BY PHANINDRA REDDY K",
        examples=[
            "Explain quantum physics simply",
            "Write a haiku about technology",
            "What's the meaning of life?"
        ],
        cache_examples=False,
    )

    # Add voice input below examples
    with gr.Row():
        audio = gr.Audio(sources=["microphone"], type="numpy", label="Voice Input")
        transcribe_btn = gr.Button("Send Transcription")

    # Connect transcription to chat input
    transcribe_btn.click(
        transcribe,
        inputs=audio,
        outputs=chat_interface.textbox
    )

if __name__ == "__main__":
    demo.launch(share=True, debug=True)
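
The token streaming in stream_generator follows a standard producer/consumer pattern: a background thread pushes tokens from the pipeline callback into a Queue while the Gradio generator drains the queue and yields growing partial responses. Below is a minimal sketch of that pattern in isolation, using a hypothetical fake_generate in place of mistral_pipe.generate so the control flow can be exercised without any model files; names here are illustrative, not part of the committed app.

# Sketch only: stand-in token source, same thread + Queue streaming shape as app.py.
from queue import Queue, Empty
from threading import Event, Thread
import time


def fake_generate(prompt, callback):
    # Stand-in for mistral_pipe.generate(): emits a few tokens with a small delay.
    for token in ["Hello", " ", "world", "!"]:
        time.sleep(0.05)
        callback(token)


def stream(prompt):
    queue, done = Queue(), Event()

    def worker():
        try:
            fake_generate(prompt, queue.put)
        finally:
            done.set()  # Signal completion even if the producer raised.

    Thread(target=worker, daemon=True).start()

    accumulated = []
    # Drain the queue until the worker has finished and nothing is left to read.
    while not done.is_set() or not queue.empty():
        try:
            accumulated.append(queue.get(timeout=0.1))
            yield "".join(accumulated)
        except Empty:
            continue


if __name__ == "__main__":
    for partial in stream("demo"):
        print(partial)

Using queue.get(timeout=...) instead of get_nowait() avoids the busy loop in the original consumer; both approaches work, the timeout variant simply burns fewer cycles while waiting for the next token.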