Update app.py
app.py CHANGED
@@ -713,54 +713,29 @@ def generate_map(location_names):
     # return image_1, image_2, image_3


-
-
-# Clear any cached memory
-torch.cuda.empty_cache()
-
-
-import gradio as gr
 import torch
 from diffusers import FluxPipeline

-
-
-
-# Function to initialize Flux bot model
-def initialize_flux_bot():
-    try:
-        torch.cuda.empty_cache()  # Clear GPU memory cache
-        pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.float16)  # Use FP16
-        pipe.to(device)  # Move the model to the correct device (GPU/CPU)
-    except torch.cuda.OutOfMemoryError:
-        print("CUDA out of memory, switching to CPU.")
-        pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.float32)  # Use FP32 for CPU
-        pipe.to("cpu")
+def initialize_flux_pipeline():
+    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+    pipe.enable_model_cpu_offload()  # Offload to CPU to save VRAM
     return pipe

-#
-
-
-
+# Initialize the model
+flux_pipe = initialize_flux_pipeline()
+
+
+def generate_flux_image(prompt):
+    # Use the initialized flux_pipe to generate an image based on the input prompt
+    image = flux_pipe(
         prompt,
         guidance_scale=0.0,
-        num_inference_steps=
-        max_sequence_length=
-        generator=torch.Generator(
+        num_inference_steps=4,
+        max_sequence_length=256,
+        generator=torch.Generator("cpu").manual_seed(0)
     ).images[0]
     return image

-# Hardcoded prompts for the images
-hardcoded_prompt_1 = "A high quality cinematic image for Toyota Truck in Birmingham skyline shot in the style of Michael Mann"
-hardcoded_prompt_2 = "A high quality cinematic image for Alabama Quarterback close up emotional shot in the style of Michael Mann"
-hardcoded_prompt_3 = "A high quality cinematic image for Taylor Swift concert in Birmingham skyline style of Michael Mann"
-
-# Function to update images
-def update_images():
-    image_1 = generate_image_flux(hardcoded_prompt_1)
-    image_2 = generate_image_flux(hardcoded_prompt_2)
-    image_3 = generate_image_flux(hardcoded_prompt_3)
-    return image_1, image_2, image_3


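This hunk replaces the FP16/FP32 try-except fallback (and the hardcoded prompt trio) with a single bfloat16 pipeline that uses enable_model_cpu_offload(), so the full model never has to sit in VRAM at once. A minimal sketch of exercising the same calls on their own, assuming torch and diffusers are installed and the FLUX.1-schnell checkpoint can be downloaded; the prompt and output filename are illustrative, not part of the commit:

import torch
from diffusers import FluxPipeline

# Same initialization as the committed initialize_flux_pipeline():
# bfloat16 weights with CPU offload to save VRAM.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()

# Same call as the committed generate_flux_image(); the prompt and
# filename below are illustrative only.
image = pipe(
    "A watercolor lighthouse at dusk",
    guidance_scale=0.0,
    num_inference_steps=4,
    max_sequence_length=256,
    generator=torch.Generator("cpu").manual_seed(0),
).images[0]
image.save("flux_sample.png")  # PIL.Image returned by the pipeline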
@@ -949,76 +924,76 @@ def generate_audio_elevenlabs(text):

 # chunking audio and then Process

-
-
-
-
-
-
-
-
-
-
-
-# # Ensure your device is set to CUDA
-# device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-# repo_id = "parler-tts/parler-tts-mini-v1"
-
-# def generate_audio_parler_tts(text):
-#     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
-#     chunk_size_in_s = 0.5
-
-#     # Initialize the tokenizer and model
-#     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
-#     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
-#     sampling_rate = parler_model.audio_encoder.config.sampling_rate
-#     frame_rate = parler_model.audio_encoder.config.frame_rate
-
-#     def generate(text, description, play_steps_in_s=0.5):
-#         play_steps = int(frame_rate * play_steps_in_s)
-#         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-# … (remaining commented-out lines truncated in the page capture)
+import concurrent.futures
+import tempfile
+import os
+import numpy as np
+import logging
+from queue import Queue
+from threading import Thread
+from scipy.io.wavfile import write as write_wav
+from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+from transformers import AutoTokenizer

+# Ensure your device is set to CUDA
+device = "cuda:0" if torch.cuda.is_available() else "cpu"

+repo_id = "parler-tts/parler-tts-mini-v1"
+
+def generate_audio_parler_tts(text):
+    description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+    chunk_size_in_s = 0.5
+
+    # Initialize the tokenizer and model
+    parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+    sampling_rate = parler_model.audio_encoder.config.sampling_rate
+    frame_rate = parler_model.audio_encoder.config.frame_rate
+
+    def generate(text, description, play_steps_in_s=0.5):
+        play_steps = int(frame_rate * play_steps_in_s)
+        streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+        inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+        prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+        generation_kwargs = dict(
+            input_ids=inputs.input_ids,
+            prompt_input_ids=prompt.input_ids,
+            attention_mask=inputs.attention_mask,
+            prompt_attention_mask=prompt.attention_mask,
+            streamer=streamer,
+            do_sample=True,
+            temperature=1.0,
+            min_new_tokens=10,
+        )

+        thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+        thread.start()

+        for new_audio in streamer:
+            if new_audio.shape[0] == 0:
+                break
+            # Save or process each audio chunk as it is generated
+            yield sampling_rate, new_audio

+    audio_segments = []
+    for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+        audio_segments.append(audio_chunk)

+        temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+        write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+        logging.debug(f"Saved chunk to {temp_audio_path}")


+    # Combine all the audio chunks into one audio file
+    combined_audio = np.concatenate(audio_segments)
+    combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")

+    write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))

+    logging.debug(f"Combined audio saved to {combined_audio_path}")
+    return combined_audio_path


 def fetch_local_events():
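This hunk re-enables the previously commented-out Parler-TTS path: the nested generate() helper runs parler_model.generate on a background thread and yields (sampling_rate, chunk) pairs from ParlerTTSStreamer as they arrive, while the outer loop writes each chunk to a temp WAV and finally concatenates them into one file. Note that the tokenizer and model are re-loaded on every call; hoisting them to module scope, as repo_id already is, would avoid repeated loads. A minimal sketch of calling the new function, assuming parler_tts, transformers, scipy, and torch are installed and generate_audio_parler_tts is in scope as defined above; the sample text is illustrative:

import logging

logging.basicConfig(level=logging.DEBUG)  # surface the per-chunk debug logs

# Returns the path of the combined WAV written to the temp directory.
wav_path = generate_audio_parler_tts("Welcome to Birmingham, Alabama!")
print("Combined Parler-TTS audio written to", wav_path)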
@@ -1503,15 +1478,19 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:
         news_output = gr.HTML(value=fetch_local_news())
         events_output = gr.HTML(value=fetch_local_events())

+
         with gr.Column():
-
-
-
-
-
-
-
-
+
+            flux_prompt = gr.Textbox(show_copy_button=True, label="Flux Prompt", placeholder="Enter prompt for Flux image generation")
+            flux_image_output = gr.Image()
+            flux_generate_button = gr.Button("Generate Flux Image")
+
+            # When the button is clicked, the image generation function is triggered
+            flux_generate_button.click(fn=generate_flux_image, inputs=flux_prompt, outputs=flux_image_output)
+
+
+
+

             # Refresh button to update images
             refresh_button = gr.Button("Refresh Images")
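The UI hunk fills the empty placeholder lines inside with gr.Column(): with the new Flux controls and wires them through the standard Gradio Button.click pattern. A self-contained sketch of the same wiring, assuming gradio and Pillow are installed; the stub generator below stands in for the real pipeline so the snippet runs without downloading the model (the stub and its placeholder image are illustrative only):

import gradio as gr
from PIL import Image

def generate_flux_image(prompt):           # stub standing in for the real pipeline
    return Image.new("RGB", (256, 256))    # placeholder image

with gr.Blocks() as demo:
    with gr.Column():
        flux_prompt = gr.Textbox(show_copy_button=True, label="Flux Prompt",
                                 placeholder="Enter prompt for Flux image generation")
        flux_image_output = gr.Image()
        flux_generate_button = gr.Button("Generate Flux Image")
        # Clicking the button routes the textbox value through the generator
        flux_generate_button.click(fn=generate_flux_image,
                                   inputs=flux_prompt,
                                   outputs=flux_image_output)

demo.launch()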