Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,209 @@
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import subprocess
|
3 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
LANGUAGE_CODES = {
|
7 |
"English": "eng",
|
8 |
"Spanish": "spa",
|
@@ -12,58 +213,107 @@ LANGUAGE_CODES = {
|
|
12 |
"Chinese": "cmn"
|
13 |
}
|
14 |
|
15 |
-
def transcribe(audio):
|
16 |
-
if audio is None:
|
17 |
-
return "No audio input detected. Please record or upload an audio file."
|
18 |
-
|
19 |
-
try:
|
20 |
-
text = model.stt_file(audio)[0]
|
21 |
-
return text
|
22 |
-
except Exception as e:
|
23 |
-
return f"Error transcribing audio: {str(e)}"
|
24 |
-
|
25 |
def translate_speech(audio_file, target_language):
|
|
|
|
|
|
|
26 |
if audio_file is None:
|
27 |
-
return
|
28 |
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
"--output_path", output_file
|
41 |
-
]
|
42 |
-
|
43 |
-
subprocess.run(command, check=True)
|
44 |
-
|
45 |
-
if os.path.exists(output_file):
|
46 |
-
print(f"File created successfully: {output_file}")
|
47 |
-
return output_file
|
48 |
-
else:
|
49 |
-
return "Error: Translated audio file not found."
|
50 |
-
except Exception as e:
|
51 |
-
return f"Error translating speech: {str(e)}"
|
52 |
-
|
53 |
-
def create_interface():
|
54 |
-
inputs = [
|
55 |
-
gr.Audio(label="User", sources=["microphone"], type="filepath"),
|
56 |
-
gr.Dropdown(list(LANGUAGE_CODES.keys()), label="Target Language")
|
57 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
inputs=inputs,
|
62 |
-
outputs=gr.Audio(label="Translated Audio", interactive=False, autoplay=True, elem_classes="audio"),
|
63 |
-
title="Seamless Expressive Speech-To-Speech Translator",
|
64 |
-
description="Hear how you sound in another language.",
|
65 |
-
)
|
66 |
|
67 |
if __name__ == "__main__":
|
68 |
-
|
69 |
-
iface.launch()
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import edge_tts
|
3 |
+
import asyncio
|
4 |
+
import tempfile
|
5 |
import os
|
6 |
+
from huggingface_hub import InferenceClient
|
7 |
+
import re
|
8 |
+
from streaming_stt_nemo import Model
|
9 |
+
import torch
|
10 |
+
import random
|
11 |
+
from openai import OpenAI
|
12 |
import subprocess
|
13 |
+
import threading
|
14 |
+
import queue
|
15 |
+
import sounddevice as sd
|
16 |
+
import numpy as np
|
17 |
+
import wave
|
18 |
+
import sys
|
19 |
+
|
20 |
+
# Language code used to key the speech-to-text engine registry.
default_lang = "en"

# Speech-to-text engines keyed by language code; the default-language engine
# is constructed once, when this module is imported.
engines = {
    default_lang: Model(default_lang),
}
|
23 |
+
|
24 |
+
def transcribe(audio):
    """Return the transcript for an audio file, or "" when no audio is given.

    Args:
        audio: Value handed to the engine's ``stt_file`` (callers in this
            file pass a path to a WAV file), or ``None``.

    Returns:
        The first element of the engine's ``stt_file`` result, or an empty
        string when ``audio`` is ``None``.
    """
    if audio is None:
        return ""

    # Only the English engine is consulted here.
    recognizer = engines["en"]
    results = recognizer.stt_file(audio)
    return results[0]
|
31 |
+
|
32 |
+
# Hugging Face token taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
|
33 |
+
|
34 |
+
def client_fn(model):
    """Build the inference client that matches a model label.

    Args:
        model: Display label of the model; matched by substring.

    Returns:
        An ``OpenAI`` client for the "Llama 3 8B Service" label, otherwise an
        ``InferenceClient`` for the first matching hosted model. Labels that
        match nothing fall back to the Phi-3 mini repository.
    """
    # The self-hosted service must be tested first: its label also contains
    # "Llama", which would otherwise select the hosted Llama model.
    if "Llama 3 8B Service" in model:
        return OpenAI(
            base_url="http://52.76.81.56:60002/v1",
            api_key="token-abc123",
        )

    # Ordered (substring, repo id) pairs; the first match wins.
    hosted_models = (
        ("Llama", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ("Mistral", "mistralai/Mistral-7B-Instruct-v0.2"),
        ("Phi", "microsoft/Phi-3-mini-4k-instruct"),
        ("Mixtral", "mistralai/Mixtral-8x7B-Instruct-v0.1"),
    )
    repo_id = next(
        (repo for marker, repo in hosted_models if marker in model),
        "microsoft/Phi-3-mini-4k-instruct",
    )
    return InferenceClient(repo_id)
|
50 |
+
|
51 |
+
def randomize_seed_fn(seed: int) -> int:
    """Return a fresh random seed in the inclusive range 0..999999.

    The ``seed`` argument is accepted but its value is never used.
    """
    return random.randint(0, 999999)
|
54 |
+
|
55 |
+
# System prompt sent with every request. It is runtime text: keep the wording
# exactly as-is, because it is sent verbatim to the model.
system_instructions1 = """
[SYSTEM] You are OPTIMUS Prime a personal AI voice assistant, Created by Jaward.
Keep conversation friendly, short, clear, and concise.
Avoid unnecessary introductions and answer the user's questions directly.
Respond in a normal, conversational manner while being friendly and helpful.
Remember previous parts of the conversation and use that context in your responses.
Your creator Jaward is an AI Research Engineer at Linksoul AI. He is currently specializing in Artificial Intelligence (AI) research more specifically training and optimizing advance AI systems. He aspires to build not just human-like intelligence but AI Systems that augment human intelligence. He has contributed greatly to the opensource community with first-principles code implementations of AI/ML research papers. He did his first internship at Beijing Academy of Artificial Intelligence as an AI Researher where he contributed in cutting-edge AI research leading to him contributing to an insightful paper (AUTOAGENTS - A FRAMEWORK FOR AUTOMATIC AGENT GENERATION). The paper got accepted this year at IJCAI(International Joint Conference On AI). He is currently doing internship at LinkSoul AI - a small opensource AI Research startup in Beijing.
[USER]
"""

# Shared chat transcript: a list of {"role": ..., "content": ...} dicts that
# is module-global, so every caller in this process sees the same history.
conversation_history = []
|
66 |
+
|
67 |
+
def _remember_exchange(user_text, assistant_text, max_messages=20):
    """Append one user/assistant exchange to the shared history and trim it."""
    global conversation_history
    conversation_history.append({"role": "user", "content": user_text})
    conversation_history.append({"role": "assistant", "content": assistant_text})

    # 20 messages are the 10 most recent exchanges; older ones are dropped to
    # avoid token limit issues.
    if len(conversation_history) > max_messages:
        conversation_history = conversation_history[-max_messages:]


def models(text, model="Llama 3 8B Service", seed=42):
    """Generate the assistant's reply to ``text`` and record the exchange.

    Args:
        text: The user's message.
        model: Display label of the model to query (see ``client_fn``).
        seed: Accepted for interface compatibility; it is replaced by the
            result of ``randomize_seed_fn`` before use.

    Returns:
        The assistant's reply as a string.
    """
    seed = int(randomize_seed_fn(seed))
    client = client_fn(model)

    if "Llama 3 8B Service" in model:
        # Chat-style endpoint: send the system prompt, prior turns, new message.
        messages = [
            {"role": "system", "content": system_instructions1},
            *conversation_history,
            {"role": "user", "content": text},
        ]
        completion = client.chat.completions.create(
            model="/data/shared/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/c4a54320a52ed5f88b7a2f84496903ea4ff07b45/",
            messages=messages,
        )
        reply = completion.choices[0].message.content
    else:
        # For other models, the conversation history is concatenated into a
        # single prompt string.
        history_text = "\n".join(
            f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
            for msg in conversation_history
        )
        formatted_prompt = (
            f"{system_instructions1}\n\nConversation history:\n{history_text}"
            f"\n\nUser: {text}\nOPTIMUS:"
        )
        stream = client.text_generation(
            formatted_prompt,
            max_new_tokens=300,
            seed=seed,
            stream=True,
            details=True,
            return_full_text=False,
        )
        # Collect streamed tokens, skipping the end-of-sequence marker.
        reply = "".join(
            piece.token.text for piece in stream if piece.token.text != "</s>"
        )

    _remember_exchange(text, reply)
    return reply
|
120 |
+
|
121 |
+
# Module-level state shared by the audio capture and processing code.
RATE = 16000  # sample rate in Hz
CHUNK = RATE // 10  # samples per block: 100 ms of audio
audio_queue = queue.Queue()  # captured chunks waiting to be processed
is_listening = False  # flag toggled by start_listening / stop_listening
|
126 |
+
|
127 |
+
def audio_callback(indata, frames, time, status):
    """Input-stream callback: queue a copy of each captured chunk.

    Registered as the ``callback`` of the ``sd.InputStream`` opened by
    ``start_listening``. A truthy ``status`` is printed to stderr; ``frames``
    and ``time`` are unused.
    """
    if status:
        print(status, file=sys.stderr)

    # A copy is queued rather than ``indata`` itself -- presumably because
    # the stream reuses its buffer between calls; confirm against sounddevice.
    captured = indata.copy()
    audio_queue.put(captured)
|
131 |
+
|
132 |
+
def _drain_audio_queue():
    """Discard every pending chunk through the queue's thread-safe API."""
    while True:
        try:
            audio_queue.get_nowait()
        except queue.Empty:
            return


def _transcribe_samples(samples):
    """Write samples to a temporary mono 16-bit WAV, transcribe it, delete it."""
    # Close the handle before wave reopens the path, so the file is not held
    # open twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    try:
        with wave.open(tmp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(RATE)
            # Scale to the int16 range -- assumes float samples in [-1, 1];
            # TODO confirm the input stream's dtype.
            wf.writeframes((samples * 32767).astype(np.int16).tobytes())
        return transcribe(tmp_path)
    finally:
        os.remove(tmp_path)


def process_audio_stream(model, seed):
    """Worker loop: buffer microphone chunks and answer after a pause.

    Chunks are read from ``audio_queue``. Once more than two seconds of
    consecutive quiet chunks have accumulated, the buffered audio is
    transcribed; a non-empty transcript is sent to ``models`` and the reply
    is spoken with ``respond_and_play``. The loop never returns.

    Args:
        model: Model label forwarded to ``models``.
        seed: Seed forwarded to ``models``.
    """
    global is_listening
    audio_buffer = []
    silence_threshold = 0.01
    silence_duration = 0
    max_silence = 2  # seconds

    while True:
        if not is_listening:
            audio_buffer.clear()
            silence_duration = 0
            _drain_audio_queue()
            # Sleep while idle instead of spinning at full CPU.
            sd.sleep(100)
            continue

        try:
            chunk = audio_queue.get(timeout=1)
        except queue.Empty:
            continue

        audio_buffer.append(chunk)

        # Track how long the input has been continuously quiet.
        if np.abs(chunk).mean() < silence_threshold:
            silence_duration += CHUNK / RATE
        else:
            silence_duration = 0

        if silence_duration <= max_silence:
            continue

        # The pause is long enough: take the utterance and reset the buffer.
        audio_data = np.concatenate(audio_buffer)
        audio_buffer.clear()
        silence_duration = 0

        paused = False
        try:
            user_input = _transcribe_samples(audio_data)
            if user_input:
                # Stop capturing while the assistant is answering.
                # NOTE(review): start_listening loops on this same flag, so
                # clearing it also ends that loop and closes its input
                # stream; a separate "speaking" flag may be needed.
                is_listening = False
                paused = True
                reply = models(user_input, model, seed)
                asyncio.run(respond_and_play(reply))
        except Exception as exc:
            # Thread boundary: report the failed turn and keep the worker
            # alive instead of letting one error end all processing.
            print(f"Voice assistant turn failed: {exc}", file=sys.stderr)
        finally:
            if paused:
                is_listening = True
|
181 |
+
|
182 |
+
async def respond_and_play(text):
    """Synthesize ``text`` with edge-tts and play it through sounddevice.

    Blocks (via ``sd.wait``) until playback finishes. The temporary audio
    file is removed afterwards, whether or not playback succeeded.

    Args:
        text: The sentence(s) to speak.
    """
    communicate = edge_tts.Communicate(text, voice="en-US-ChristopherNeural")

    # Reserve a path and close the handle before edge-tts writes to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)

        # NOTE(review): edge-tts appears to emit MP3 data regardless of the
        # file suffix, in which case wave.open() raises wave.Error here --
        # confirm the output format and decode accordingly.
        with wave.open(tmp_path, 'rb') as wf:
            frame_rate = wf.getframerate()
            data = wf.readframes(wf.getnframes())

        # Play the audio
        sd.play(np.frombuffer(data, dtype=np.int16), frame_rate)
        sd.wait()
    finally:
        os.remove(tmp_path)
|
193 |
+
|
194 |
+
def start_listening(model, seed):
    """Start the processing thread and capture audio until listening stops.

    Sets ``is_listening``, launches ``process_audio_stream`` in a daemon
    thread, then keeps a microphone input stream open, polling every 100 ms,
    for as long as ``is_listening`` stays true. This call blocks until then.

    Args:
        model: Model label handed to the processing thread.
        seed: Seed handed to the processing thread.
    """
    global is_listening
    is_listening = True

    worker = threading.Thread(
        target=process_audio_stream,
        args=(model, seed),
        daemon=True,
    )
    worker.start()

    capture = sd.InputStream(
        callback=audio_callback,
        channels=1,
        samplerate=RATE,
        blocksize=CHUNK,
    )
    with capture:
        while is_listening:
            sd.sleep(100)
|
201 |
+
|
202 |
+
def stop_listening():
    """Clear the module-level ``is_listening`` flag."""
    global is_listening
    is_listening = False
|
205 |
+
|
206 |
+
# Supported languages for seamless-expressive
|
207 |
LANGUAGE_CODES = {
|
208 |
"English": "eng",
|
209 |
"Spanish": "spa",
|
|
|
213 |
"Chinese": "cmn"
|
214 |
}
|
215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
def translate_speech(audio_file, target_language):
    """
    Translate input speech (audio file) to the specified target language.

    Runs the ``expressivity_predict`` command-line tool on ``audio_file``.

    Args:
        audio_file: Path to the recorded audio, or ``None``.
        target_language: A key of ``LANGUAGE_CODES`` (e.g. "English").

    Returns:
        Path to the translated WAV file, or ``None`` when there is no input,
        the language is not supported, or the tool produced no output file.

    Raises:
        subprocess.CalledProcessError: If the tool exits with a non-zero
            status.
    """
    if audio_file is None:
        return None

    language_code = LANGUAGE_CODES.get(target_language)
    if language_code is None:
        print(f"Unsupported target language: {target_language}")
        return None

    # Write into a fresh directory per call. With a fixed path in the working
    # directory, a file left by an earlier run would pass the existence check
    # below and be returned as if it were this call's result.
    output_file = os.path.join(
        tempfile.mkdtemp(prefix="translated_"), "translated_audio.wav"
    )

    command = [
        "expressivity_predict",
        audio_file,
        "--tgt_lang", language_code,
        "--model_name", "seamless_expressivity",
        "--vocoder_name", "vocoder_pretssel",
        "--gated-model-dir", "models",
        "--output_path", output_file,
    ]

    subprocess.run(command, check=True)

    if os.path.exists(output_file):
        print(f"File created successfully: {output_file}")
        return output_file

    print(f"File not found: {output_file}")
    return None
|
244 |
+
|
245 |
+
def clear_history():
    """Reset the shared conversation history.

    Returns:
        A tuple of four ``None`` values.
    """
    global conversation_history
    conversation_history = []
    return (None,) * 4
|
249 |
+
|
250 |
+
def voice_assistant_tab():
    """Return the Markdown heading for the Voice Assistant tab."""
    heading = "# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>"
    return heading
|
252 |
+
|
253 |
+
def speech_translation_tab():
    """Return the Markdown heading for the Speech Translation tab."""
    heading = "# <center><b>Hear how you sound in another language</b></center>"
    return heading
|
255 |
+
|
256 |
+
def _listen_and_report(model, seed):
    """Drive the status line around the blocking ``start_listening`` call.

    Written as a generator so the "Listening" status is shown while
    ``start_listening`` is still running.
    """
    yield "Status: Listening"
    start_listening(model, seed)
    yield "Status: Not listening"


def _stop_and_report():
    """Stop listening and return the status text to display."""
    stop_listening()
    return "Status: Not listening"


# Two-tab UI: a hands-free voice assistant and a speech-to-speech translator.
with gr.Blocks(css="style.css") as demo:
    description = gr.Markdown("# <center><b>Hello, I am Optimus Prime your personal AI voice assistant</b></center>")

    with gr.Tabs() as tabs:
        with gr.TabItem("Voice Assistant") as voice_assistant:
            select = gr.Dropdown(
                [
                    'Llama 3 8B Service',
                    'Mixtral 8x7B',
                    'Llama 3 8B',
                    'Mistral 7B v0.3',
                    'Phi 3 mini',
                ],
                value="Llama 3 8B Service",
                label="Model",
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=999999,
                step=1,
                value=0,
                visible=False,
            )
            start_button = gr.Button("Start Listening")
            stop_button = gr.Button("Stop Listening")
            status = gr.Markdown("Status: Not listening")

            # The status text comes from the handlers' return values. The
            # previous JavaScript snippets looked up an element id ("status")
            # that no component declares, and returned nothing.
            start_button.click(
                fn=_listen_and_report,
                inputs=[select, seed],
                outputs=[status],
            )
            stop_button.click(
                fn=_stop_and_report,
                inputs=[],
                outputs=[status],
            )

        with gr.TabItem("Speech Translation") as speech_translation:
            input_audio = gr.Audio(label="User", sources=["microphone"], type="filepath")
            target_lang = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="German",
                label="Target Language",
            )
            output_audio = gr.Audio(
                label="Translated Audio",
                interactive=False,
                autoplay=True,
                elem_classes="audio",
            )

            gr.Interface(
                fn=translate_speech,
                inputs=[input_audio, target_lang],
                outputs=[output_audio],
                live=True,
            )

    # Swap the page heading to match whichever tab is selected.
    voice_assistant.select(fn=voice_assistant_tab, inputs=None, outputs=description)
    speech_translation.select(fn=speech_translation_tab, inputs=None, outputs=description)
|
|
|
|
|
|
|
|
|
|
|
317 |
|
318 |
if __name__ == "__main__":
    # Enable request queuing with a capacity of 200, then start the app.
    demo.queue(max_size=200)
    demo.launch()
|
|