Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -7,6 +7,7 @@ def install_parler_tts():
|
|
7 |
# Call the function to install parler-tts
|
8 |
install_parler_tts()
|
9 |
|
|
|
10 |
import gradio as gr
|
11 |
import requests
|
12 |
import os
|
@@ -24,7 +25,6 @@ from googlemaps import Client as GoogleMapsClient
|
|
24 |
from gtts import gTTS
|
25 |
from diffusers import StableDiffusion3Pipeline
|
26 |
import soundfile as sf
|
27 |
-
import numpy as np
|
28 |
|
29 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
30 |
from langchain_pinecone import PineconeVectorStore
|
@@ -173,7 +173,7 @@ def fetch_local_weather():
|
|
173 |
}}
|
174 |
.weather-content {{
|
175 |
display: flex;
|
176 |
-
align-items: center
|
177 |
}}
|
178 |
.weather-icon {{
|
179 |
flex: 1;
|
@@ -278,84 +278,23 @@ def generate_answer(message, choice):
|
|
278 |
def bot(history, choice, tts_model):
|
279 |
if not history:
|
280 |
return history
|
281 |
-
|
282 |
response, addresses = generate_answer(history[-1][0], choice)
|
283 |
history[-1][1] = ""
|
284 |
-
|
285 |
-
# Generate audio
|
286 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
287 |
-
|
288 |
-
|
|
|
|
|
289 |
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
yield history,
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
if tts_model == "ElevenLabs":
|
298 |
-
return generate_audio_elevenlabs_chunks(text)
|
299 |
-
else:
|
300 |
-
return generate_audio_parler_tts_chunks(text)
|
301 |
-
|
302 |
-
def generate_audio_elevenlabs_chunks(text):
|
303 |
-
XI_API_KEY = os.environ['ELEVENLABS_API']
|
304 |
-
VOICE_ID = 'd9MIrwLnvDeH7aZb61E9' # Replace with your voice ID
|
305 |
-
tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
|
306 |
-
headers = {
|
307 |
-
"Accept": "application/json",
|
308 |
-
"xi-api-key": XI_API_KEY
|
309 |
-
}
|
310 |
-
data = {
|
311 |
-
"text": str(text),
|
312 |
-
"model_id": "eleven_multilingual_v2",
|
313 |
-
"voice_settings": {
|
314 |
-
"stability": 1.0,
|
315 |
-
"similarity_boost": 0.0,
|
316 |
-
"style": 0.60, # Adjust style for more romantic tone
|
317 |
-
"use_speaker_boost": False
|
318 |
-
}
|
319 |
-
}
|
320 |
-
response = requests.post(tts_url, headers=headers, json=data, stream=True)
|
321 |
-
audio_chunks = []
|
322 |
-
if response.ok:
|
323 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
324 |
-
for chunk in response.iter_content(chunk_size=1024):
|
325 |
-
f.write(chunk)
|
326 |
-
audio_chunks.append(f.name)
|
327 |
-
return audio_chunks
|
328 |
-
else:
|
329 |
-
logging.error(f"Error generating audio: {response.text}")
|
330 |
-
return []
|
331 |
-
|
332 |
-
def generate_audio_parler_tts_chunks(text):
|
333 |
-
model_id = 'parler-tts/parler_tts_mini_v0.1'
|
334 |
-
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
335 |
-
try:
|
336 |
-
model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
|
337 |
-
except torch.cuda.OutOfMemoryError:
|
338 |
-
print("CUDA out of memory. Switching to CPU.")
|
339 |
-
device = "cpu"
|
340 |
-
model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
|
341 |
-
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
342 |
-
|
343 |
-
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
|
344 |
-
|
345 |
-
input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
|
346 |
-
prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
|
347 |
-
|
348 |
-
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids, max_new_tokens=200)
|
349 |
-
audio_arr = generation.cpu().numpy().squeeze()
|
350 |
-
|
351 |
-
chunk_size = 16000 # Define a chunk size (adjust as needed)
|
352 |
-
audio_chunks = []
|
353 |
-
for i in range(0, len(audio_arr), chunk_size):
|
354 |
-
audio_chunk = audio_arr[i:i + chunk_size]
|
355 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
|
356 |
-
sf.write(f.name, audio_chunk, model.config.sampling_rate)
|
357 |
-
audio_chunks.append(f.name)
|
358 |
-
return audio_chunks
|
359 |
|
360 |
def add_message(history, message):
|
361 |
history.append((message, None))
|
@@ -409,7 +348,7 @@ def fetch_local_news():
|
|
409 |
api_key = os.environ['SERP_API']
|
410 |
url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
|
411 |
response = requests.get(url)
|
412 |
-
if response.status_code == 200:
|
413 |
results = response.json().get("news_results", [])
|
414 |
news_html = """
|
415 |
<h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
|
@@ -490,6 +429,7 @@ model_id = 'openai/whisper-large-v3'
|
|
490 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
491 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
492 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype,
|
|
|
493 |
use_safetensors=True).to(device)
|
494 |
processor = AutoProcessor.from_pretrained(model_id)
|
495 |
|
@@ -536,6 +476,62 @@ def show_map_if_details(history,choice):
|
|
536 |
else:
|
537 |
return gr.update(visible=False), ""
|
538 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
539 |
# Stable Diffusion setup
|
540 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
541 |
pipe = pipe.to("cuda")
|
|
|
7 |
# Call the function to install parler-tts
|
8 |
install_parler_tts()
|
9 |
|
10 |
+
|
11 |
import gradio as gr
|
12 |
import requests
|
13 |
import os
|
|
|
25 |
from gtts import gTTS
|
26 |
from diffusers import StableDiffusion3Pipeline
|
27 |
import soundfile as sf
|
|
|
28 |
|
29 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
30 |
from langchain_pinecone import PineconeVectorStore
|
|
|
173 |
}}
|
174 |
.weather-content {{
|
175 |
display: flex;
|
176 |
+
align-items: center;
|
177 |
}}
|
178 |
.weather-icon {{
|
179 |
flex: 1;
|
|
|
278 |
def bot(history, choice, tts_model):
    """Stream the assistant's reply into the chat history, then attach audio.

    This is a generator. Every item it yields is a ``(history, audio_path)``
    pair: ``audio_path`` is ``None`` while the text is still being revealed
    and is the result of the selected TTS function on the final item.

    Args:
        history: Chat history; the last entry's first element is the user
            message and its second element is overwritten with the reply.
            NOTE(review): entries must be mutable (lists) for the item
            assignment below to work -- confirm against the caller.
        choice: Passed through unchanged to ``generate_answer``.
        tts_model: ``"ElevenLabs"`` selects ``generate_audio_elevenlabs``;
            any other value selects ``generate_audio_parler_tts``.

    Yields:
        ``(history, None)`` after each revealed character, then
        ``(history, audio_path)`` once audio generation has finished.
    """
    if not history:
        # Inside a generator, ``return history`` would silently drop the
        # value. Yield one pair so the caller still receives both outputs.
        yield history, None
        return

    # Only the answer text is used here; the second value is ignored.
    response, _addresses = generate_answer(history[-1][0], choice)
    history[-1][1] = ""

    # Generate audio for the entire response in a separate thread
    with concurrent.futures.ThreadPoolExecutor() as executor:
        if tts_model == "ElevenLabs":
            synthesize = generate_audio_elevenlabs
        else:
            synthesize = generate_audio_parler_tts
        audio_future = executor.submit(synthesize, response)

        # Reveal the reply one character at a time while audio is generated.
        for character in response:
            history[-1][1] += character
            time.sleep(0.05)  # Adjust the speed of text appearance
            yield history, None

        # Blocks until the background synthesis has finished.
        audio_path = audio_future.result()
        yield history, audio_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
def add_message(history, message):
|
300 |
history.append((message, None))
|
|
|
348 |
api_key = os.environ['SERP_API']
|
349 |
url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
|
350 |
response = requests.get(url)
|
351 |
+
if response.status_code == 200:
|
352 |
results = response.json().get("news_results", [])
|
353 |
news_html = """
|
354 |
<h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
|
|
|
429 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
430 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
431 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype,
|
432 |
+
#low_cpu_mem_usage=True,
|
433 |
use_safetensors=True).to(device)
|
434 |
processor = AutoProcessor.from_pretrained(model_id)
|
435 |
|
|
|
476 |
else:
|
477 |
return gr.update(visible=False), ""
|
478 |
|
479 |
+
def generate_audio_elevenlabs(text):
    """Synthesize ``text`` with the ElevenLabs streaming API.

    The response body is streamed into a temporary ``.mp3`` file that is NOT
    deleted automatically (``delete=False``); the caller owns the file.

    Args:
        text: Text to speak; it is converted with ``str()`` before sending.

    Returns:
        Path of the written temporary file, or ``None`` when the request
        fails (non-OK HTTP status, connection error, or timeout).

    Raises:
        KeyError: If the ``ELEVENLABS_API`` environment variable is not set.
    """
    XI_API_KEY = os.environ['ELEVENLABS_API']
    VOICE_ID = 'd9MIrwLnvDeH7aZb61E9'  # Replace with your voice ID
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": XI_API_KEY
    }
    data = {
        "text": str(text),
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 1.0,
            "similarity_boost": 0.0,
            "style": 0.60,  # Adjust style for more romantic tone
            "use_speaker_boost": False
        }
    }

    temp_audio_path = None
    try:
        # (connect, read) timeouts: without them a stalled connection would
        # block the calling thread forever. The read timeout applies to each
        # wait for data, not to the whole download.
        with requests.post(tts_url, headers=headers, json=data,
                           stream=True, timeout=(10, 60)) as response:
            if not response.ok:
                logging.error(f"Error generating audio: {response.text}")
                return None
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
                temp_audio_path = f.name
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
    except requests.RequestException as exc:
        # Transport failures take the same "log and return None" path as
        # HTTP errors, so callers have a single failure signal to handle.
        logging.error(f"Error generating audio: {exc}")
        if temp_audio_path is not None:
            # Do not leave a truncated audio file behind.
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass  # best-effort cleanup; the failure is already logged
        return None

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
|
508 |
+
|
509 |
+
def generate_audio_parler_tts(text):
    """Synthesize ``text`` with Parler-TTS and write it to a temporary WAV.

    The model and tokenizer are loaded from the
    ``parler-tts/parler_tts_mini_v0.1`` checkpoint on every call. CUDA is
    used when available; if moving the model to the GPU raises a CUDA
    out-of-memory error, the model is reloaded on the CPU instead.

    Args:
        text: The prompt to be spoken.

    Returns:
        Path of the temporary ``.wav`` file, which is not deleted
        automatically (``delete=False``).
    """
    checkpoint = 'parler-tts/parler_tts_mini_v0.1'

    target = "cpu"
    if torch.cuda.is_available():
        target = "cuda:0"

    try:
        tts_model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint).to(target)
    except torch.cuda.OutOfMemoryError:
        print("CUDA out of memory. Switching to CPU.")
        target = "cpu"
        tts_model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint).to(target)
    tok = AutoTokenizer.from_pretrained(checkpoint)

    # Natural-language description of the voice, tokenized as `input_ids`.
    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    voice_ids = tok(description, return_tensors="pt").input_ids.to(target)
    text_ids = tok(text, return_tensors="pt").input_ids.to(target)

    generated = tts_model.generate(input_ids=voice_ids, prompt_input_ids=text_ids)
    # Bring the result back to host memory and drop singleton dimensions.
    waveform = generated.cpu().numpy().squeeze()

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
        temp_audio_path = wav_file.name
        # The sample rate comes from the loaded model's configuration.
        sf.write(temp_audio_path, waveform, tts_model.config.sampling_rate)

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
|
534 |
+
|
535 |
# Stable Diffusion setup
|
536 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
537 |
pipe = pipe.to("cuda")
|