Pijush2023 committed
Commit ed253d8 · verified · 1 Parent(s): a4a37b4

Update app.py

Files changed (1): app.py (+95 -144)

app.py CHANGED
@@ -713,80 +713,31 @@ def generate_map(location_names):
    # return image_1, image_2, image_3


- # import torch
- # from diffusers import DiffusionPipeline
- # import numpy as np
- # import random
- # import gradio as gr
-
- # # Constants for device and dtype
- # dtype = torch.bfloat16
- # device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # # Clear CUDA memory
- # torch.cuda.empty_cache()
-
- # # Load the Flux pipeline model
- # pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)
-
- # MAX_SEED = np.iinfo(np.int32).max
- # MAX_IMAGE_SIZE = 2048
-
- # # Function for inference using the Flux pipeline
- # def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=4):
- #     if randomize_seed:
- #         seed = random.randint(0, MAX_SEED)
- #     generator = torch.Generator(device).manual_seed(seed)
-
- #     # Generate the image using the pipeline
- #     image = pipe(
- #         prompt=prompt,
- #         width=width,
- #         height=height,
- #         num_inference_steps=num_inference_steps,
- #         generator=generator,
- #         guidance_scale=0.0
- #     ).images[0]
-
- #     return image, seed
+



import torch
- from diffusers import DiffusionPipeline
+ from diffusers import FluxPipeline
import os

# Set PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Check GPU memory and fallback to CPU if necessary
- if torch.cuda.is_available():
-     free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)
-     if free_memory < 100 * 1024 * 1024:  # If less than 100 MB is free
-         print("Low GPU memory, switching to CPU.")
-         device = "cpu"
-     else:
-         device = "cuda"
- else:
-     device = "cpu"
-
- dtype = torch.float16 if device == "cuda" else torch.float32  # Use float16 for GPU and float32 for CPU
+ pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+ pipe.enable_model_cpu_offload()  # save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

- # Clear any existing GPU memory cache
- torch.cuda.empty_cache()

- # Load the pipeline
- pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)

# Reduce the inference steps and image dimensions
- def generate_image_flux(prompt, seed=42, width=400, height=400, num_inference_steps=1):
-     generator = torch.Generator(device).manual_seed(seed)
+ def generate_image_flux(prompt, width=400, height=400, num_inference_steps=4):
    image = pipe(
        prompt=prompt,
        width=width,
        height=height,
        num_inference_steps=num_inference_steps,  # Reduce steps to save memory
-         generator=generator,
+         generator=torch.Generator("cpu").manual_seed(0),
        guidance_scale=0.0
    ).images[0]
    return image
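For quick verification of the new image path, here is a minimal standalone sketch of the same FLUX.1-schnell setup; it assumes a diffusers version with FluxPipeline support and accelerate installed for CPU offload, and the prompt and output filename are illustrative, not taken from app.py.

import torch
from diffusers import FluxPipeline

# Load the distilled schnell checkpoint in bfloat16 and offload submodules to CPU between passes
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # lowers peak VRAM at the cost of speed; requires accelerate

image = pipe(
    prompt="A scenic view of a botanical garden at sunset",  # placeholder prompt
    width=400,
    height=400,
    num_inference_steps=4,                            # schnell is distilled for very few steps
    generator=torch.Generator("cpu").manual_seed(0),  # fixed seed for reproducible output
    guidance_scale=0.0,                               # schnell does not use classifier-free guidance
).images[0]
image.save("flux_smoke_test.png")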
@@ -885,41 +836,41 @@ def fetch_local_news():
    else:
        return "<p>Failed to fetch local news</p>"

- # import numpy as np
- # import torch
- # from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+ import numpy as np
+ import torch
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

- # model_id = 'openai/whisper-large-v3'
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
- # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
- # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
- # processor = AutoProcessor.from_pretrained(model_id)
+ model_id = 'openai/whisper-large-v3'
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+ processor = AutoProcessor.from_pretrained(model_id)

- # pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
+ pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)

- # base_audio_drive = "/data/audio"
+ base_audio_drive = "/data/audio"

- # # Normal code with a sample rate of 44100 Hz
+ # Normal code with a sample rate of 44100 Hz

- # def transcribe_function(stream, new_chunk):
- #     try:
- #         sr, y = new_chunk[0], new_chunk[1]
- #     except TypeError:
- #         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
- #         return stream, "", None
+ def transcribe_function(stream, new_chunk):
+     try:
+         sr, y = new_chunk[0], new_chunk[1]
+     except TypeError:
+         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+         return stream, "", None

- #     y = y.astype(np.float32) / np.max(np.abs(y))
+     y = y.astype(np.float32) / np.max(np.abs(y))

- #     if stream is not None:
- #         stream = np.concatenate([stream, y])
- #     else:
- #         stream = y
+     if stream is not None:
+         stream = np.concatenate([stream, y])
+     else:
+         stream = y

- #     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)

- #     full_text = result.get("text","")
+     full_text = result.get("text","")

- #     return stream, full_text, result
+     return stream, full_text, result


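The hunk above re-enables the streaming Whisper transcriber but does not show its UI wiring. A hedged sketch of how transcribe_function is typically attached to a streaming Gradio microphone input follows; the component names and the Gradio 4.x streaming API usage are assumptions, not part of this commit.

import gradio as gr

with gr.Blocks() as asr_demo:
    stream_state = gr.State(None)   # persistent audio buffer passed back in as `stream`
    mic = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
    transcript = gr.Textbox(label="Transcript")
    raw_result = gr.JSON(label="Raw ASR output")
    # Each chunk arrives as (sample_rate, numpy_array); transcribe_function appends it to the
    # buffered stream and re-runs Whisper over the full buffer on every update.
    mic.stream(transcribe_function, inputs=[stream_state, mic], outputs=[stream_state, transcript, raw_result])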
@@ -994,76 +945,76 @@ def generate_audio_elevenlabs(text):

# chunking audio and then Process

- # import concurrent.futures
- # import tempfile
- # import os
- # import numpy as np
- # import logging
- # from queue import Queue
- # from threading import Thread
- # from scipy.io.wavfile import write as write_wav
- # from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
- # from transformers import AutoTokenizer
-
- # # Ensure your device is set to CUDA
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- # repo_id = "parler-tts/parler-tts-mini-v1"
-
- # def generate_audio_parler_tts(text):
- #     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
- #     chunk_size_in_s = 0.5
-
- #     # Initialize the tokenizer and model
- #     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
- #     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
- #     sampling_rate = parler_model.audio_encoder.config.sampling_rate
- #     frame_rate = parler_model.audio_encoder.config.frame_rate
-
- #     def generate(text, description, play_steps_in_s=0.5):
- #         play_steps = int(frame_rate * play_steps_in_s)
- #         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
- #         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
- #         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
- #         generation_kwargs = dict(
- #             input_ids=inputs.input_ids,
- #             prompt_input_ids=prompt.input_ids,
- #             attention_mask=inputs.attention_mask,
- #             prompt_attention_mask=prompt.attention_mask,
- #             streamer=streamer,
- #             do_sample=True,
- #             temperature=1.0,
- #             min_new_tokens=10,
- #         )
+ import concurrent.futures
+ import tempfile
+ import os
+ import numpy as np
+ import logging
+ from queue import Queue
+ from threading import Thread
+ from scipy.io.wavfile import write as write_wav
+ from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+ from transformers import AutoTokenizer
+
+ # Ensure your device is set to CUDA
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ repo_id = "parler-tts/parler-tts-mini-v1"
+
+ def generate_audio_parler_tts(text):
+     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+     chunk_size_in_s = 0.5
+
+     # Initialize the tokenizer and model
+     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+     def generate(text, description, play_steps_in_s=0.5):
+         play_steps = int(frame_rate * play_steps_in_s)
+         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+         generation_kwargs = dict(
+             input_ids=inputs.input_ids,
+             prompt_input_ids=prompt.input_ids,
+             attention_mask=inputs.attention_mask,
+             prompt_attention_mask=prompt.attention_mask,
+             streamer=streamer,
+             do_sample=True,
+             temperature=1.0,
+             min_new_tokens=10,
+         )

- #         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
- #         thread.start()
+         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+         thread.start()

- #         for new_audio in streamer:
- #             if new_audio.shape[0] == 0:
- #                 break
- #             # Save or process each audio chunk as it is generated
- #             yield sampling_rate, new_audio
+         for new_audio in streamer:
+             if new_audio.shape[0] == 0:
+                 break
+             # Save or process each audio chunk as it is generated
+             yield sampling_rate, new_audio

- #     audio_segments = []
- #     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
- #         audio_segments.append(audio_chunk)
+     audio_segments = []
+     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+         audio_segments.append(audio_chunk)

- #         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
- #         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
- #         logging.debug(f"Saved chunk to {temp_audio_path}")
+         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+         logging.debug(f"Saved chunk to {temp_audio_path}")


- #     # Combine all the audio chunks into one audio file
- #     combined_audio = np.concatenate(audio_segments)
- #     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+     # Combine all the audio chunks into one audio file
+     combined_audio = np.concatenate(audio_segments)
+     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")

- #     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))

- #     logging.debug(f"Combined audio saved to {combined_audio_path}")
- #     return combined_audio_path
+     logging.debug(f"Combined audio saved to {combined_audio_path}")
+     return combined_audio_path


def fetch_local_events():
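The re-enabled generate_audio_parler_tts above returns the filesystem path of the combined WAV, so it can back a Gradio audio output directly. A brief, hedged usage sketch; the input text is illustrative and not taken from app.py.

# Generate speech for a sample sentence and print where the combined file was written
combined_path = generate_audio_parler_tts("Here are a few spots worth visiting this weekend.")
print(combined_path)  # expected to end with parler_tts_combined_audio_stream.wav in the temp directory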
@@ -1553,8 +1504,8 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:

    # Display generated images
    image_output_1 = gr.Image(value=generate_image_flux(hardcoded_prompt_1), width=400, height=400)
-     # image_output_2 = gr.Image(value=generate_image_flux(hardcoded_prompt_2), width=400, height=400)
-     # image_output_3 = gr.Image(value=generate_image_flux(hardcoded_prompt_3), width=400, height=400)
+     image_output_2 = gr.Image(value=generate_image_flux(hardcoded_prompt_2), width=400, height=400)
+     image_output_3 = gr.Image(value=generate_image_flux(hardcoded_prompt_3), width=400, height=400)

    # Refresh button to update images
    refresh_button = gr.Button("Refresh Images")
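The click handler for the refresh button sits outside this hunk; one plausible wiring is sketched below as an assumption rather than the app's actual code, reusing the three image components and hardcoded prompts defined elsewhere in app.py.

def refresh_images():
    # Re-run the Flux generator for each hardcoded prompt
    return (
        generate_image_flux(hardcoded_prompt_1),
        generate_image_flux(hardcoded_prompt_2),
        generate_image_flux(hardcoded_prompt_3),
    )

refresh_button.click(fn=refresh_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])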
 