Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ def install_parler_tts():
|
|
| 7 |
# Call the function to install parler-tts
|
| 8 |
install_parler_tts()
|
| 9 |
|
|
|
|
| 10 |
import gradio as gr
|
| 11 |
import requests
|
| 12 |
import os
|
|
@@ -24,7 +25,6 @@ from googlemaps import Client as GoogleMapsClient
|
|
| 24 |
from gtts import gTTS
|
| 25 |
from diffusers import StableDiffusion3Pipeline
|
| 26 |
import soundfile as sf
|
| 27 |
-
import numpy as np
|
| 28 |
|
| 29 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 30 |
from langchain_pinecone import PineconeVectorStore
|
|
@@ -173,7 +173,7 @@ def fetch_local_weather():
|
|
| 173 |
}}
|
| 174 |
.weather-content {{
|
| 175 |
display: flex;
|
| 176 |
-
align-items: center
|
| 177 |
}}
|
| 178 |
.weather-icon {{
|
| 179 |
flex: 1;
|
|
@@ -278,84 +278,23 @@ def generate_answer(message, choice):
|
|
| 278 |
def bot(history, choice, tts_model):
|
| 279 |
if not history:
|
| 280 |
return history
|
| 281 |
-
|
| 282 |
response, addresses = generate_answer(history[-1][0], choice)
|
| 283 |
history[-1][1] = ""
|
| 284 |
-
|
| 285 |
-
# Generate audio
|
| 286 |
with concurrent.futures.ThreadPoolExecutor() as executor:
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
yield history,
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
if tts_model == "ElevenLabs":
|
| 298 |
-
return generate_audio_elevenlabs_chunks(text)
|
| 299 |
-
else:
|
| 300 |
-
return generate_audio_parler_tts_chunks(text)
|
| 301 |
-
|
| 302 |
-
def generate_audio_elevenlabs_chunks(text):
    """Synthesize *text* with the ElevenLabs streaming endpoint.

    The streamed response body is written to a single temporary file that is
    not deleted automatically; the caller owns it.

    Args:
        text: The text to speak. It is converted with ``str()`` before sending.

    Returns:
        A one-element list holding the path of the saved audio file, or an
        empty list when the request fails.

    Raises:
        KeyError: If the ``ELEVENLABS_API`` environment variable is not set.
    """
    XI_API_KEY = os.environ['ELEVENLABS_API']
    VOICE_ID = 'd9MIrwLnvDeH7aZb61E9'  # Replace with your voice ID
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": XI_API_KEY
    }
    data = {
        "text": str(text),
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 1.0,
            "similarity_boost": 0.0,
            "style": 0.60,  # Adjust style for more romantic tone
            "use_speaker_boost": False
        }
    }
    audio_chunks = []
    try:
        # A timeout keeps a stalled connection from blocking the caller
        # forever; the context manager releases the streamed connection.
        with requests.post(tts_url, headers=headers, json=data, stream=True,
                           timeout=(10, 60)) as response:
            if not response.ok:
                logging.error(f"Error generating audio: {response.text}")
                return []
            # NOTE(review): the endpoint is understood to stream MPEG audio by
            # default, so the file is saved as .mp3 rather than .wav — confirm
            # against the ElevenLabs API reference.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
                audio_chunks.append(f.name)
    except requests.RequestException as exc:
        # Transport failures are reported the same way as HTTP errors.
        logging.error(f"Error generating audio: {exc}")
        return []
    return audio_chunks
|
| 331 |
-
|
| 332 |
-
def generate_audio_parler_tts_chunks(text):
    """Synthesize *text* with Parler-TTS and save it as fixed-size WAV chunks.

    The model and tokenizer are loaded on every call. The generated waveform
    is split into slices of ``chunk_size`` samples, and each slice is written
    to its own temporary ``.wav`` file that is not deleted automatically.

    Args:
        text: The prompt text to be spoken.

    Returns:
        A list of file paths, one per chunk, in playback order.
    """
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    except torch.cuda.OutOfMemoryError:
        # Fall back to the CPU when the GPU cannot hold the model.
        print("CUDA out of memory. Switching to CPU.")
        device = "cpu"
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids, max_new_tokens=200)
    audio_arr = generation.cpu().numpy().squeeze()

    chunk_size = 16000  # Define a chunk size (adjust as needed)
    audio_chunks = []
    for i in range(0, len(audio_arr), chunk_size):
        audio_chunk = audio_arr[i:i + chunk_size]
        # Reserve a unique path, then close the handle before soundfile
        # reopens it by name: reopening a still-open NamedTemporaryFile is
        # not supported on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            chunk_path = f.name
        sf.write(chunk_path, audio_chunk, model.config.sampling_rate)
        audio_chunks.append(chunk_path)
    return audio_chunks
|
| 359 |
|
| 360 |
def add_message(history, message):
|
| 361 |
history.append((message, None))
|
|
@@ -409,7 +348,7 @@ def fetch_local_news():
|
|
| 409 |
api_key = os.environ['SERP_API']
|
| 410 |
url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
|
| 411 |
response = requests.get(url)
|
| 412 |
-
if response.status_code == 200:
|
| 413 |
results = response.json().get("news_results", [])
|
| 414 |
news_html = """
|
| 415 |
<h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
|
|
@@ -490,6 +429,7 @@ model_id = 'openai/whisper-large-v3'
|
|
| 490 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 491 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 492 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype,
|
|
|
|
| 493 |
use_safetensors=True).to(device)
|
| 494 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 495 |
|
|
@@ -536,6 +476,62 @@ def show_map_if_details(history,choice):
|
|
| 536 |
else:
|
| 537 |
return gr.update(visible=False), ""
|
| 538 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
# Stable Diffusion setup
|
| 540 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
| 541 |
pipe = pipe.to("cuda")
|
|
|
|
| 7 |
# Call the function to install parler-tts
|
| 8 |
install_parler_tts()
|
| 9 |
|
| 10 |
+
|
| 11 |
import gradio as gr
|
| 12 |
import requests
|
| 13 |
import os
|
|
|
|
| 25 |
from gtts import gTTS
|
| 26 |
from diffusers import StableDiffusion3Pipeline
|
| 27 |
import soundfile as sf
|
|
|
|
| 28 |
|
| 29 |
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 30 |
from langchain_pinecone import PineconeVectorStore
|
|
|
|
| 173 |
}}
|
| 174 |
.weather-content {{
|
| 175 |
display: flex;
|
| 176 |
+
align-items: center;
|
| 177 |
}}
|
| 178 |
.weather-icon {{
|
| 179 |
flex: 1;
|
|
|
|
| 278 |
def bot(history, choice, tts_model):
    """Stream the reply to the latest message into the chat, then attach audio.

    This is a generator. Each item it yields is a ``(history, audio_path)``
    pair: ``audio_path`` is ``None`` while the reply text is being revealed
    character by character, and the final item carries the synthesized audio
    path returned by the selected TTS backend.

    Args:
        history: Chat history as a sequence of ``(user_message, bot_reply)``
            turns; the last turn is the one being answered.
        choice: Passed through to ``generate_answer``.
        tts_model: ``"ElevenLabs"`` selects ``generate_audio_elevenlabs``;
            any other value selects ``generate_audio_parler_tts``.

    Yields:
        ``(history, audio_path)`` pairs as described above.
    """
    if not history:
        # A bare ``return history`` in a generator yields nothing, so emit
        # one explicit, unchanged update instead.
        yield history, None
        return

    response, _addresses = generate_answer(history[-1][0], choice)
    # add_message() appends turns as tuples, which do not support item
    # assignment; rebuild the last turn as a mutable list.
    history[-1] = [history[-1][0], ""]

    if tts_model == "ElevenLabs":
        synthesize = generate_audio_elevenlabs
    else:
        synthesize = generate_audio_parler_tts

    # Generate audio for the entire response in a separate thread so it
    # overlaps with the simulated typing below.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        audio_future = executor.submit(synthesize, response)

        for character in response:
            history[-1][1] += character
            time.sleep(0.05)  # Adjust the speed of text appearance
            yield history, None

        audio_path = audio_future.result()
        yield history, audio_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
def add_message(history, message):
|
| 300 |
history.append((message, None))
|
|
|
|
| 348 |
api_key = os.environ['SERP_API']
|
| 349 |
url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
|
| 350 |
response = requests.get(url)
|
| 351 |
+
if response.status_code == 200:
|
| 352 |
results = response.json().get("news_results", [])
|
| 353 |
news_html = """
|
| 354 |
<h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
|
|
|
|
| 429 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 430 |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 431 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype,
|
| 432 |
+
#low_cpu_mem_usage=True,
|
| 433 |
use_safetensors=True).to(device)
|
| 434 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 435 |
|
|
|
|
| 476 |
else:
|
| 477 |
return gr.update(visible=False), ""
|
| 478 |
|
| 479 |
+
def generate_audio_elevenlabs(text):
    """Synthesize *text* with the ElevenLabs streaming endpoint.

    The streamed response body is written to a temporary ``.mp3`` file that
    is not deleted automatically; the caller owns it.

    Args:
        text: The text to speak. It is converted with ``str()`` before sending.

    Returns:
        The path of the saved audio file, or ``None`` when the request fails.

    Raises:
        KeyError: If the ``ELEVENLABS_API`` environment variable is not set.
    """
    XI_API_KEY = os.environ['ELEVENLABS_API']
    VOICE_ID = 'd9MIrwLnvDeH7aZb61E9'  # Replace with your voice ID
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "application/json",
        "xi-api-key": XI_API_KEY
    }
    data = {
        "text": str(text),
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 1.0,
            "similarity_boost": 0.0,
            "style": 0.60,  # Adjust style for more romantic tone
            "use_speaker_boost": False
        }
    }
    try:
        # A timeout keeps a stalled connection from blocking the caller
        # forever; the context manager releases the streamed connection.
        with requests.post(tts_url, headers=headers, json=data, stream=True,
                           timeout=(10, 60)) as response:
            if not response.ok:
                logging.error(f"Error generating audio: {response.text}")
                return None
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
                temp_audio_path = f.name
    except requests.RequestException as exc:
        # Transport failures are reported the same way as HTTP errors.
        logging.error(f"Error generating audio: {exc}")
        return None
    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
|
| 508 |
+
|
| 509 |
+
def generate_audio_parler_tts(text):
    """Synthesize *text* with Parler-TTS and save it as a WAV file.

    The model and tokenizer are loaded on every call. The generated waveform
    is written to a temporary ``.wav`` file that is not deleted automatically;
    the caller owns it.

    Args:
        text: The prompt text to be spoken.

    Returns:
        The path of the saved ``.wav`` file.
    """
    model_id = 'parler-tts/parler_tts_mini_v0.1'
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    try:
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    except torch.cuda.OutOfMemoryError:
        # Fall back to the CPU when the GPU cannot hold the model.
        print("CUDA out of memory. Switching to CPU.")
        device = "cpu"
        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
    audio_arr = generation.cpu().numpy().squeeze()

    # Reserve a unique path, then close the handle before soundfile reopens
    # it by name: reopening a still-open NamedTemporaryFile is not supported
    # on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        temp_audio_path = f.name
    sf.write(temp_audio_path, audio_arr, model.config.sampling_rate)

    logging.debug(f"Audio saved to {temp_audio_path}")
    return temp_audio_path
|
| 534 |
+
|
| 535 |
# Stable Diffusion setup
|
| 536 |
pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
|
| 537 |
pipe = pipe.to("cuda")
|