Pijush2023 committed
Commit da4dbd8 · verified · 1 Parent(s): 7e81093

Update app.py

Files changed (1):
  app.py +85 -106
app.py CHANGED
@@ -713,54 +713,29 @@ def generate_map(location_names):
     # return image_1, image_2, image_3
 
 
-
-
-# Clear any cached memory
-torch.cuda.empty_cache()
-
-
-import gradio as gr
 import torch
 from diffusers import FluxPipeline
 
-# Check if CUDA (GPU) is available, otherwise fallback to CPU
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# Function to initialize Flux bot model
-def initialize_flux_bot():
-    try:
-        torch.cuda.empty_cache()  # Clear GPU memory cache
-        pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.float16)  # Use FP16
-        pipe.to(device)  # Move the model to the correct device (GPU/CPU)
-    except torch.cuda.OutOfMemoryError:
-        print("CUDA out of memory, switching to CPU.")
-        pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.float32)  # Use FP32 for CPU
-        pipe.to("cpu")
+def initialize_flux_pipeline():
+    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+    pipe.enable_model_cpu_offload()  # Offload to CPU to save VRAM
     return pipe
 
-# Function to generate image using Flux bot on the specified device
-def generate_image_flux(prompt):
-    pipe = initialize_flux_bot()
-    image = pipe(
+# Initialize the model
+flux_pipe = initialize_flux_pipeline()
+
+
+def generate_flux_image(prompt):
+    # Use the initialized flux_pipe to generate an image based on the input prompt
+    image = flux_pipe(
         prompt,
         guidance_scale=0.0,
-        num_inference_steps=2,
-        max_sequence_length=128,
-        generator=torch.Generator(device).manual_seed(0)
+        num_inference_steps=4,
+        max_sequence_length=256,
+        generator=torch.Generator("cpu").manual_seed(0)
     ).images[0]
     return image
 
-# Hardcoded prompts for the images
-hardcoded_prompt_1 = "A high quality cinematic image for Toyota Truck in Birmingham skyline shot in the style of Michael Mann"
-hardcoded_prompt_2 = "A high quality cinematic image for Alabama Quarterback close up emotional shot in the style of Michael Mann"
-hardcoded_prompt_3 = "A high quality cinematic image for Taylor Swift concert in Birmingham skyline style of Michael Mann"
-
-# Function to update images
-def update_images():
-    image_1 = generate_image_flux(hardcoded_prompt_1)
-    image_2 = generate_image_flux(hardcoded_prompt_2)
-    image_3 = generate_image_flux(hardcoded_prompt_3)
-    return image_1, image_2, image_3
 
 
 
@@ -949,76 +924,76 @@ def generate_audio_elevenlabs(text):
 
 # chunking audio and then Process
 
-# import concurrent.futures
-# import tempfile
-# import os
-# import numpy as np
-# import logging
-# from queue import Queue
-# from threading import Thread
-# from scipy.io.wavfile import write as write_wav
-# from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
-# from transformers import AutoTokenizer
-
-# # Ensure your device is set to CUDA
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# repo_id = "parler-tts/parler-tts-mini-v1"
-
-# def generate_audio_parler_tts(text):
-#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-#     chunk_size_in_s = 0.5
-
-#     # Initialize the tokenizer and model
-#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
-#     frame_rate = parler_model.audio_encoder.config.frame_rate
-
-#     def generate(text, description, play_steps_in_s=0.5):
-#         play_steps = int(frame_rate * play_steps_in_s)
-#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
-#         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
-#         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
-#         generation_kwargs = dict(
-#             input_ids=inputs.input_ids,
-#             prompt_input_ids=prompt.input_ids,
-#             attention_mask=inputs.attention_mask,
-#             prompt_attention_mask=prompt.attention_mask,
-#             streamer=streamer,
-#             do_sample=True,
-#             temperature=1.0,
-#             min_new_tokens=10,
-#         )
-
-#         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
-#         thread.start()
-
-#         for new_audio in streamer:
-#             if new_audio.shape[0] == 0:
-#                 break
-#             # Save or process each audio chunk as it is generated
-#             yield sampling_rate, new_audio
-
-#     audio_segments = []
-#     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
-#         audio_segments.append(audio_chunk)
-
-#         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
-#         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
-#         logging.debug(f"Saved chunk to {temp_audio_path}")
-
-#     # Combine all the audio chunks into one audio file
-#     combined_audio = np.concatenate(audio_segments)
-#     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
-
-#     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
-
-#     logging.debug(f"Combined audio saved to {combined_audio_path}")
-#     return combined_audio_path
+import concurrent.futures
+import tempfile
+import os
+import numpy as np
+import logging
+from queue import Queue
+from threading import Thread
+from scipy.io.wavfile import write as write_wav
+from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+from transformers import AutoTokenizer
+
+# Ensure your device is set to CUDA
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+repo_id = "parler-tts/parler-tts-mini-v1"
+
+def generate_audio_parler_tts(text):
+    description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+    chunk_size_in_s = 0.5
+
+    # Initialize the tokenizer and model
+    parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+    sampling_rate = parler_model.audio_encoder.config.sampling_rate
+    frame_rate = parler_model.audio_encoder.config.frame_rate
+
+    def generate(text, description, play_steps_in_s=0.5):
+        play_steps = int(frame_rate * play_steps_in_s)
+        streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+        inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+        prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+        generation_kwargs = dict(
+            input_ids=inputs.input_ids,
+            prompt_input_ids=prompt.input_ids,
+            attention_mask=inputs.attention_mask,
+            prompt_attention_mask=prompt.attention_mask,
+            streamer=streamer,
+            do_sample=True,
+            temperature=1.0,
+            min_new_tokens=10,
+        )
+
+        thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        for new_audio in streamer:
+            if new_audio.shape[0] == 0:
+                break
+            # Save or process each audio chunk as it is generated
+            yield sampling_rate, new_audio
+
+    audio_segments = []
+    for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+        audio_segments.append(audio_chunk)
+
+        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+        logging.debug(f"Saved chunk to {temp_audio_path}")
+
+    # Combine all the audio chunks into one audio file
+    combined_audio = np.concatenate(audio_segments)
+    combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+
+    write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+
+    logging.debug(f"Combined audio saved to {combined_audio_path}")
+    return combined_audio_path
 
 
 def fetch_local_events():
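Review note: the restored `generate_audio_parler_tts` runs `parler_model.generate` on a background thread, yields `(sampling_rate, chunk)` pairs from the inner `generate()` as `ParlerTTSStreamer` produces them, then writes and returns one combined WAV; the `concurrent.futures` and `Queue` imports appear unused in this block. A minimal caller sketch under that reading (the sample text is illustrative):

# Illustrative caller: the function returns the path of the combined WAV file.
audio_path = generate_audio_parler_tts("Welcome to Birmingham!")
print(f"Combined Parler-TTS audio written to {audio_path}")

# The inner generate() also yields (sampling_rate, chunk) pairs, so a streaming
# consumer (e.g., a Gradio gr.Audio(streaming=True) output) could play chunks
# as they arrive instead of waiting for the combined file.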
@@ -1503,15 +1478,19 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
         news_output = gr.HTML(value=fetch_local_news())
         events_output = gr.HTML(value=fetch_local_events())
 
+
         with gr.Column():
-            # image_output_1 = gr.Image(value=generate_image(hardcoded_prompt_1), width=400, height=400)
-            # image_output_2 = gr.Image(value=generate_image(hardcoded_prompt_2), width=400, height=400)
-            # image_output_3 = gr.Image(value=generate_image(hardcoded_prompt_3), width=400, height=400)
-
-            # Display images
-            image_output_1 = gr.Image(value=generate_image_flux(hardcoded_prompt_1), width=400, height=400)
-            image_output_2 = gr.Image(value=generate_image_flux(hardcoded_prompt_2), width=400, height=400)
-            image_output_3 = gr.Image(value=generate_image_flux(hardcoded_prompt_3), width=400, height=400)
+
+            flux_prompt = gr.Textbox(show_copy_button=True, label="Flux Prompt", placeholder="Enter prompt for Flux image generation")
+            flux_image_output = gr.Image()
+            flux_generate_button = gr.Button("Generate Flux Image")
+
+            # When the button is clicked, the image generation function is triggered
+            flux_generate_button.click(fn=generate_flux_image, inputs=flux_prompt, outputs=flux_image_output)
+
+
+
+
 
         # Refresh button to update images
         refresh_button = gr.Button("Refresh Images")
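Review note: the column now takes a free-form prompt instead of three hardcoded images. A self-contained sketch of the same wiring for testing the UI in isolation, with a stub standing in for the app's Flux-backed function (the stub and the bare Blocks layout are illustrative, not the app's actual structure):

import gradio as gr
from PIL import Image

def generate_flux_image(prompt):
    # Stub standing in for the app's Flux-backed generator.
    return Image.new("RGB", (400, 400), "lightgray")

with gr.Blocks() as demo:
    flux_prompt = gr.Textbox(label="Flux Prompt", placeholder="Enter prompt for Flux image generation")
    flux_image_output = gr.Image()
    flux_generate_button = gr.Button("Generate Flux Image")
    # Same event wiring as the commit: click -> generate -> display.
    flux_generate_button.click(fn=generate_flux_image, inputs=flux_prompt, outputs=flux_image_output)

demo.launch()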
 