Pijush2023 committed
Commit c49ac12 · verified · 1 Parent(s): 4743a3f

Update app.py

Files changed (1):
  app.py +73 -77

app.py CHANGED
@@ -7,6 +7,7 @@ def install_parler_tts():
 # Call the function to install parler-tts
 install_parler_tts()
 
+
 import gradio as gr
 import requests
 import os
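Note: install_parler_tts() is defined just above this hunk, so its body is not part of the diff. A minimal sketch of what such a bootstrap helper typically looks like, assuming the standard pip-from-GitHub install for parler-tts (the actual body is not shown in this commit):

import subprocess
import sys

def install_parler_tts():
    # Install parler-tts from its GitHub repo if it is not already importable.
    try:
        import parler_tts  # noqa: F401
    except ImportError:
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "git+https://github.com/huggingface/parler-tts.git",
        ])

Installing at import time like this is a common workaround in hosted environments (such as Hugging Face Spaces) when a dependency cannot go into requirements.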
@@ -24,7 +25,6 @@ from googlemaps import Client as GoogleMapsClient
 from gtts import gTTS
 from diffusers import StableDiffusion3Pipeline
 import soundfile as sf
-import numpy as np
 
 from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_pinecone import PineconeVectorStore
@@ -173,7 +173,7 @@ def fetch_local_weather():
     }}
     .weather-content {{
         display: flex;
-        align-items: center.
+        align-items: center;
     }}
     .weather-icon {{
         flex: 1;
@@ -278,84 +278,23 @@ def generate_answer(message, choice):
 def bot(history, choice, tts_model):
     if not history:
         return history
-
     response, addresses = generate_answer(history[-1][0], choice)
     history[-1][1] = ""
-
-    # Generate audio and process output prompt in parallel
+
+    # Generate audio for the entire response in a separate thread
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        audio_chunks_future = executor.submit(generate_audio_chunks, tts_model, response)
-        text_chunks = [response[i:i + 100] for i in range(0, len(response), 100)]  # Chunk the text for streaming
-
-        audio_chunks = audio_chunks_future.result()
-        for text_chunk, audio_chunk in zip(text_chunks, audio_chunks):
-            history[-1][1] += text_chunk
-            yield history, audio_chunk
-            time.sleep(0.2)  # Adjust this to synchronize text and audio appearance
+        if tts_model == "ElevenLabs":
+            audio_future = executor.submit(generate_audio_elevenlabs, response)
+        else:
+            audio_future = executor.submit(generate_audio_parler_tts, response)
+
+        for character in response:
+            history[-1][1] += character
+            time.sleep(0.05)  # Adjust the speed of text appearance
+            yield history, None
+
+        audio_path = audio_future.result()
+        yield history, audio_path
-
-def generate_audio_chunks(tts_model, text):
-    if tts_model == "ElevenLabs":
-        return generate_audio_elevenlabs_chunks(text)
-    else:
-        return generate_audio_parler_tts_chunks(text)
-
-def generate_audio_elevenlabs_chunks(text):
-    XI_API_KEY = os.environ['ELEVENLABS_API']
-    VOICE_ID = 'd9MIrwLnvDeH7aZb61E9'  # Replace with your voice ID
-    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
-    headers = {
-        "Accept": "application/json",
-        "xi-api-key": XI_API_KEY
-    }
-    data = {
-        "text": str(text),
-        "model_id": "eleven_multilingual_v2",
-        "voice_settings": {
-            "stability": 1.0,
-            "similarity_boost": 0.0,
-            "style": 0.60,  # Adjust style for more romantic tone
-            "use_speaker_boost": False
-        }
-    }
-    response = requests.post(tts_url, headers=headers, json=data, stream=True)
-    audio_chunks = []
-    if response.ok:
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            for chunk in response.iter_content(chunk_size=1024):
-                f.write(chunk)
-            audio_chunks.append(f.name)
-        return audio_chunks
-    else:
-        logging.error(f"Error generating audio: {response.text}")
-        return []
-
-def generate_audio_parler_tts_chunks(text):
-    model_id = 'parler-tts/parler_tts_mini_v0.1'
-    device = "cuda:0" if torch.cuda.is_available() else "cpu"
-    try:
-        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-    except torch.cuda.OutOfMemoryError:
-        print("CUDA out of memory. Switching to CPU.")
-        device = "cpu"
-        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
-
-    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
-    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
-
-    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids, max_new_tokens=200)
-    audio_arr = generation.cpu().numpy().squeeze()
-
-    chunk_size = 16000  # Define a chunk size (adjust as needed)
-    audio_chunks = []
-    for i in range(0, len(audio_arr), chunk_size):
-        audio_chunk = audio_arr[i:i + chunk_size]
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
-            sf.write(f.name, audio_chunk, model.config.sampling_rate)
-            audio_chunks.append(f.name)
-    return audio_chunks
 
 def add_message(history, message):
     history.append((message, None))
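Note: the rewritten bot is a generator that yields (history, audio) pairs — it streams the reply character by character with a None audio slot, then yields the finished audio path once the background TTS future resolves. This trades the removed chunk-by-chunk synchronization for simpler code: text appears immediately, audio arrives once fully synthesized. A minimal sketch of how a Gradio UI could consume it; the component names below are assumptions, since the UI wiring is outside this hunk:

import gradio as gr

# Hypothetical wiring; chatbot, audio_output, msg, choice, and tts_model are
# illustrative names, and add_message is assumed to return the updated history
# plus an empty textbox value.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    audio_output = gr.Audio(autoplay=True)
    choice = gr.Radio(["Details", "Conversational"], value="Details")
    tts_model = gr.Radio(["ElevenLabs", "Parler-TTS"], value="ElevenLabs")
    msg = gr.Textbox()

    # add_message records the user turn; bot then streams its
    # (history, audio) pairs into the chatbot and audio components.
    msg.submit(add_message, [chatbot, msg], [chatbot, msg]).then(
        bot, [chatbot, choice, tts_model], [chatbot, audio_output]
    )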
@@ -409,7 +348,7 @@ def fetch_local_news():
     api_key = os.environ['SERP_API']
     url = f'https://serpapi.com/search.json?engine=google_news&q=birmingham headline&api_key={api_key}'
    response = requests.get(url)
-    if response.status_code == 200:
+    if response.status_code == 200:
         results = response.json().get("news_results", [])
         news_html = """
        <h2 style="font-family: 'Georgia', serif; color: #ff0000; background-color: #f8f8f8; padding: 10px; border-radius: 10px;">Birmingham Today</h2>
@@ -490,6 +429,7 @@ model_id = 'openai/whisper-large-v3'
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype,
+                                                  #low_cpu_mem_usage=True,
                                                   use_safetensors=True).to(device)
 processor = AutoProcessor.from_pretrained(model_id)
 
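Note: low_cpu_mem_usage=True is a genuine from_pretrained option (it reduces peak RAM during loading but requires accelerate); the commit adds it commented out. The Whisper model and processor loaded here are the usual inputs to a transformers ASR pipeline; the assembly itself is not part of this hunk, but the standard pattern looks like this:

from transformers import pipeline

# Build an automatic-speech-recognition pipeline from the objects loaded above.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
# transcription = asr_pipe("recording.wav")["text"]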
@@ -536,6 +476,62 @@ def show_map_if_details(history,choice):
     else:
         return gr.update(visible=False), ""
 
+def generate_audio_elevenlabs(text):
+    XI_API_KEY = os.environ['ELEVENLABS_API']
+    VOICE_ID = 'd9MIrwLnvDeH7aZb61E9'  # Replace with your voice ID
+    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
+    headers = {
+        "Accept": "application/json",
+        "xi-api-key": XI_API_KEY
+    }
+    data = {
+        "text": str(text),
+        "model_id": "eleven_multilingual_v2",
+        "voice_settings": {
+            "stability": 1.0,
+            "similarity_boost": 0.0,
+            "style": 0.60,  # Adjust style for more romantic tone
+            "use_speaker_boost": False
+        }
+    }
+    response = requests.post(tts_url, headers=headers, json=data, stream=True)
+    if response.ok:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
+            for chunk in response.iter_content(chunk_size=1024):
+                f.write(chunk)
+            temp_audio_path = f.name
+        logging.debug(f"Audio saved to {temp_audio_path}")
+        return temp_audio_path
+    else:
+        logging.error(f"Error generating audio: {response.text}")
+        return None
+
+def generate_audio_parler_tts(text):
+    model_id = 'parler-tts/parler_tts_mini_v0.1'
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    try:
+        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+    except torch.cuda.OutOfMemoryError:
+        print("CUDA out of memory. Switching to CPU.")
+        device = "cpu"
+        model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(device)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+
+    input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+    generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    audio_arr = generation.cpu().numpy().squeeze()
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+        sf.write(f.name, audio_arr, model.config.sampling_rate)
+        temp_audio_path = f.name
+
+    logging.debug(f"Audio saved to {temp_audio_path}")
+    return temp_audio_path
+
 # Stable Diffusion setup
 pipe = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
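Note: for reference, image generation with the SD3 pipeline configured above follows the standard diffusers call; the prompt and parameter values below are illustrative, not from the commit:

# Illustrative usage; prompt and parameters are assumptions.
image = pipe(
    "a photo of downtown Birmingham at sunset",
    num_inference_steps=28,
    guidance_scale=7.0,
).images[0]
image.save("birmingham.png")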