Pijush2023 committed
Commit ed253d8 · verified · 1 Parent(s): a4a37b4

Update app.py

Files changed (1): app.py (+95 -144)

app.py CHANGED
@@ -713,80 +713,31 @@ def generate_map(location_names):
    # return image_1, image_2, image_3


- # import torch
- # from diffusers import DiffusionPipeline
- # import numpy as np
- # import random
- # import gradio as gr
-
- # # Constants for device and dtype
- # dtype = torch.bfloat16
- # device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # # Clear CUDA memory
- # torch.cuda.empty_cache()
-
- # # Load the Flux pipeline model
- # pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)
-
- # MAX_SEED = np.iinfo(np.int32).max
- # MAX_IMAGE_SIZE = 2048
-
- # # Function for inference using the Flux pipeline
- # def infer(prompt, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=4):
- #     if randomize_seed:
- #         seed = random.randint(0, MAX_SEED)
- #     generator = torch.Generator(device).manual_seed(seed)
-
- #     # Generate the image using the pipeline
- #     image = pipe(
- #         prompt=prompt,
- #         width=width,
- #         height=height,
- #         num_inference_steps=num_inference_steps,
- #         generator=generator,
- #         guidance_scale=0.0
- #     ).images[0]
-
- #     return image, seed
+



import torch
- from diffusers import DiffusionPipeline
+ from diffusers import FluxPipeline
import os

# Set PYTORCH_CUDA_ALLOC_CONF to handle memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Check GPU memory and fallback to CPU if necessary
- if torch.cuda.is_available():
-     free_memory = torch.cuda.memory_reserved(0) - torch.cuda.memory_allocated(0)
-     if free_memory < 100 * 1024 * 1024:  # If less than 100 MB is free
-         print("Low GPU memory, switching to CPU.")
-         device = "cpu"
-     else:
-         device = "cuda"
- else:
-     device = "cpu"
-
- dtype = torch.float16 if device == "cuda" else torch.float32  # Use float16 for GPU and float32 for CPU
+ pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+ pipe.enable_model_cpu_offload()  # save some VRAM by offloading the model to CPU. Remove this if you have enough GPU power

- # Clear any existing GPU memory cache
- torch.cuda.empty_cache()

- # Load the pipeline
- pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)

# Reduce the inference steps and image dimensions
- def generate_image_flux(prompt, seed=42, width=400, height=400, num_inference_steps=1):
-     generator = torch.Generator(device).manual_seed(seed)
+ def generate_image_flux(prompt, width=400, height=400, num_inference_steps=4):
    image = pipe(
        prompt=prompt,
        width=width,
        height=height,
        num_inference_steps=num_inference_steps,  # Reduce steps to save memory
-         generator=generator,
+         generator=torch.Generator("cpu").manual_seed(0),
        guidance_scale=0.0
    ).images[0]
    return image
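For quick verification of the new image path, here is a minimal standalone sketch of the same FLUX.1-schnell setup; it assumes a diffusers version with FluxPipeline support and accelerate installed for CPU offload, and the prompt and output filename are illustrative, not taken from app.py.

import torch
from diffusers import FluxPipeline

# Load the distilled schnell checkpoint in bfloat16 and offload submodules to CPU between passes
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # lowers peak VRAM at the cost of speed; requires accelerate

image = pipe(
    prompt="A scenic view of a botanical garden at sunset",  # placeholder prompt
    width=400,
    height=400,
    num_inference_steps=4,                            # schnell is distilled for very few steps
    generator=torch.Generator("cpu").manual_seed(0),  # fixed seed for reproducible output
    guidance_scale=0.0,                               # schnell does not use classifier-free guidance
).images[0]
image.save("flux_smoke_test.png")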
@@ -885,41 +836,41 @@ def fetch_local_news():
    else:
        return "<p>Failed to fetch local news</p>"

- # import numpy as np
- # import torch
- # from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
+ import numpy as np
+ import torch
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

- # model_id = 'openai/whisper-large-v3'
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
- # torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
- # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
- # processor = AutoProcessor.from_pretrained(model_id)
+ model_id = 'openai/whisper-large-v3'
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
+ processor = AutoProcessor.from_pretrained(model_id)

- # pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)
+ pipe_asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device, return_timestamps=True)

- # base_audio_drive = "/data/audio"
+ base_audio_drive = "/data/audio"

- # # Normal code with a sample rate of 44100 Hz
+ # Normal code with a sample rate of 44100 Hz

- # def transcribe_function(stream, new_chunk):
- #     try:
- #         sr, y = new_chunk[0], new_chunk[1]
- #     except TypeError:
- #         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
- #         return stream, "", None
+ def transcribe_function(stream, new_chunk):
+     try:
+         sr, y = new_chunk[0], new_chunk[1]
+     except TypeError:
+         print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
+         return stream, "", None

- #     y = y.astype(np.float32) / np.max(np.abs(y))
+     y = y.astype(np.float32) / np.max(np.abs(y))

- #     if stream is not None:
- #         stream = np.concatenate([stream, y])
- #     else:
- #         stream = y
+     if stream is not None:
+         stream = np.concatenate([stream, y])
+     else:
+         stream = y

- #     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)
+     result = pipe_asr({"array": stream, "sampling_rate": sr}, return_timestamps=False)

- #     full_text = result.get("text","")
+     full_text = result.get("text","")

- #     return stream, full_text, result
+     return stream, full_text, result


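The hunk above re-enables the streaming Whisper transcriber but does not show its UI wiring. A hedged sketch of how transcribe_function is typically attached to a streaming Gradio microphone input follows; the component names and the Gradio 4.x streaming API usage are assumptions, not part of this commit.

import gradio as gr

with gr.Blocks() as asr_demo:
    stream_state = gr.State(None)   # persistent audio buffer passed back in as `stream`
    mic = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
    transcript = gr.Textbox(label="Transcript")
    raw_result = gr.JSON(label="Raw ASR output")
    # Each chunk arrives as (sample_rate, numpy_array); transcribe_function appends it to the
    # buffered stream and re-runs Whisper over the full buffer on every update.
    mic.stream(transcribe_function, inputs=[stream_state, mic], outputs=[stream_state, transcript, raw_result])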
@@ -994,76 +945,76 @@ def generate_audio_elevenlabs(text):

# chunking audio and then Process

- # import concurrent.futures
- # import tempfile
- # import os
- # import numpy as np
- # import logging
- # from queue import Queue
- # from threading import Thread
- # from scipy.io.wavfile import write as write_wav
- # from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
- # from transformers import AutoTokenizer
-
- # # Ensure your device is set to CUDA
- # device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- # repo_id = "parler-tts/parler-tts-mini-v1"
-
- # def generate_audio_parler_tts(text):
- #     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
- #     chunk_size_in_s = 0.5
-
- #     # Initialize the tokenizer and model
- #     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
- #     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
- #     sampling_rate = parler_model.audio_encoder.config.sampling_rate
- #     frame_rate = parler_model.audio_encoder.config.frame_rate
-
- #     def generate(text, description, play_steps_in_s=0.5):
- #         play_steps = int(frame_rate * play_steps_in_s)
- #         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
-
- #         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
- #         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
-
- #         generation_kwargs = dict(
- #             input_ids=inputs.input_ids,
- #             prompt_input_ids=prompt.input_ids,
- #             attention_mask=inputs.attention_mask,
- #             prompt_attention_mask=prompt.attention_mask,
- #             streamer=streamer,
- #             do_sample=True,
- #             temperature=1.0,
- #             min_new_tokens=10,
- #         )
+ import concurrent.futures
+ import tempfile
+ import os
+ import numpy as np
+ import logging
+ from queue import Queue
+ from threading import Thread
+ from scipy.io.wavfile import write as write_wav
+ from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
+ from transformers import AutoTokenizer
+
+ # Ensure your device is set to CUDA
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ repo_id = "parler-tts/parler-tts-mini-v1"
+
+ def generate_audio_parler_tts(text):
+     description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
+     chunk_size_in_s = 0.5
+
+     # Initialize the tokenizer and model
+     parler_tokenizer = AutoTokenizer.from_pretrained(repo_id)
+     parler_model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+     sampling_rate = parler_model.audio_encoder.config.sampling_rate
+     frame_rate = parler_model.audio_encoder.config.frame_rate
+
+     def generate(text, description, play_steps_in_s=0.5):
+         play_steps = int(frame_rate * play_steps_in_s)
+         streamer = ParlerTTSStreamer(parler_model, device=device, play_steps=play_steps)
+
+         inputs = parler_tokenizer(description, return_tensors="pt").to(device)
+         prompt = parler_tokenizer(text, return_tensors="pt").to(device)
+
+         generation_kwargs = dict(
+             input_ids=inputs.input_ids,
+             prompt_input_ids=prompt.input_ids,
+             attention_mask=inputs.attention_mask,
+             prompt_attention_mask=prompt.attention_mask,
+             streamer=streamer,
+             do_sample=True,
+             temperature=1.0,
+             min_new_tokens=10,
+         )

- #         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
- #         thread.start()
+         thread = Thread(target=parler_model.generate, kwargs=generation_kwargs)
+         thread.start()

- #         for new_audio in streamer:
- #             if new_audio.shape[0] == 0:
- #                 break
- #             # Save or process each audio chunk as it is generated
- #             yield sampling_rate, new_audio
+         for new_audio in streamer:
+             if new_audio.shape[0] == 0:
+                 break
+             # Save or process each audio chunk as it is generated
+             yield sampling_rate, new_audio

- #     audio_segments = []
- #     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
- #         audio_segments.append(audio_chunk)
+     audio_segments = []
+     for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
+         audio_segments.append(audio_chunk)

- #         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
- #         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
- #         logging.debug(f"Saved chunk to {temp_audio_path}")
+         temp_audio_path = os.path.join(tempfile.gettempdir(), f"parler_tts_audio_chunk_{len(audio_segments)}.wav")
+         write_wav(temp_audio_path, sampling_rate, audio_chunk.astype(np.float32))
+         logging.debug(f"Saved chunk to {temp_audio_path}")


- #     # Combine all the audio chunks into one audio file
- #     combined_audio = np.concatenate(audio_segments)
- #     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")
+     # Combine all the audio chunks into one audio file
+     combined_audio = np.concatenate(audio_segments)
+     combined_audio_path = os.path.join(tempfile.gettempdir(), "parler_tts_combined_audio_stream.wav")

- #     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))
+     write_wav(combined_audio_path, sampling_rate, combined_audio.astype(np.float32))

- #     logging.debug(f"Combined audio saved to {combined_audio_path}")
- #     return combined_audio_path
+     logging.debug(f"Combined audio saved to {combined_audio_path}")
+     return combined_audio_path


def fetch_local_events():
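The re-enabled generate_audio_parler_tts above returns the filesystem path of the combined WAV, so it can back a Gradio audio output directly. A brief, hedged usage sketch; the input text is illustrative and not taken from app.py.

# Generate speech for a sample sentence and print where the combined file was written
combined_path = generate_audio_parler_tts("Here are a few spots worth visiting this weekend.")
print(combined_path)  # expected to end with parler_tts_combined_audio_stream.wav in the temp directory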
@@ -1553,8 +1504,8 @@ with gr.Blocks(theme='Pijush2023/scikit-learn-pijush') as demo:

    # Display generated images
    image_output_1 = gr.Image(value=generate_image_flux(hardcoded_prompt_1), width=400, height=400)
-     # image_output_2 = gr.Image(value=generate_image_flux(hardcoded_prompt_2), width=400, height=400)
-     # image_output_3 = gr.Image(value=generate_image_flux(hardcoded_prompt_3), width=400, height=400)
+     image_output_2 = gr.Image(value=generate_image_flux(hardcoded_prompt_2), width=400, height=400)
+     image_output_3 = gr.Image(value=generate_image_flux(hardcoded_prompt_3), width=400, height=400)

    # Refresh button to update images
    refresh_button = gr.Button("Refresh Images")
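The click handler for the refresh button sits outside this hunk; one plausible wiring is sketched below as an assumption rather than the app's actual code, reusing the three image components and hardcoded prompts defined elsewhere in app.py.

def refresh_images():
    # Re-run the Flux generator for each hardcoded prompt
    return (
        generate_image_flux(hardcoded_prompt_1),
        generate_image_flux(hardcoded_prompt_2),
        generate_image_flux(hardcoded_prompt_3),
    )

refresh_button.click(fn=refresh_images, inputs=None, outputs=[image_output_1, image_output_2, image_output_3])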
 