ford442 committed
Commit fcb861a · verified · 1 Parent(s): ec46a0b

Update app.py

Files changed (1):
  1. app.py +94 -68

app.py CHANGED
@@ -6,51 +6,9 @@
 # copies of the Software, and to permit persons to whom the Software is
 import spaces
 import os
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-
 import subprocess
-
 import re
 
-def find_cuda_directories(search_paths=None):
-    """Finds directories that contain "cuda" and a version number in their name.
-
-    Args:
-        search_paths: A list of directories to search. If None, uses common paths.
-
-    Returns:
-        A dictionary where keys are directory paths and values are extracted versions.
-        Returns an empty dictionary if no CUDA directories are found.
-    """
-
-    if search_paths is None:
-        # Common CUDA installation locations (customize as needed)
-        search_paths = [
-            "/usr/local",  # Linux
-            "/usr/lib",  # Linux
-            "/opt",  # Linux
-            "/Program Files",  # Windows
-            "/Applications",  # macOS (less common)
-            os.path.expanduser("~")  # Check user's home directory
-        ]
-        if os.name == 'nt': #Windows
-            search_paths.append("C:\\Program Files")
-            search_paths.append("C:\\Program Files (x86)")
-
-    cuda_dirs = {}
-
-    for path in search_paths:
-        if os.path.exists(path):  # Check if the path exists
-            for root, dirs, files in os.walk(path):  # Walk recursively
-                for dir_name in dirs:
-                    match = re.search(r"cuda(\d+(\.\d+)*)", dir_name, re.IGNORECASE)  # Regex for cuda and version
-                    if match:
-                        full_path = os.path.join(root, dir_name)
-                        version = match.group(1)
-                        cuda_dirs[full_path] = version
-
-    return cuda_dirs
-
 #subprocess.run(['sh', './torch.sh'])
 
 #import sys
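
The helper removed above walks a set of well-known install roots and returns a {path: version} map. For reference, a minimal sketch of how it could be exercised (the search root here is illustrative):

# Illustrative call to the removed find_cuda_directories() helper;
# "/usr/local" is just an example root.
found = find_cuda_directories(search_paths=["/usr/local"])
for path, version in found.items():
    print(f"{path}: CUDA {version}")
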
@@ -168,11 +126,6 @@ FTP_PASS = os.getenv("FTP_PASS")
 FTP_DIR = os.getenv("FTP_DIR")
 
 # os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
-os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-
-upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
 
 def scheduler_swap_callback(pipeline, step_index, timestep, callback_kwargs):
     # adjust the batch_size of prompt_embeds according to guidance_scale
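
One caveat on the removed lines: os.putenv() writes to the process environment without updating os.environ, so a later os.getenv() in the same interpreter may not see the value. Assigning through os.environ keeps both in sync; a minimal sketch:

import os

# Assignment to os.environ also calls putenv() under the hood, so the
# Python-level mapping and the C-level environment stay consistent.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
os.environ['SAFETENSORS_FAST_GPU'] = '1'
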
@@ -211,6 +164,13 @@ def scheduler_swap_callback(pipeline, step_index, timestep, callback_kwargs):
     # pipeline.scheduler._step_index = pipeline.num_timesteps * 0.9
     return {"latents": callback_kwargs["latents"]}
 
+'''
+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+
+upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
+
 def load_and_prepare_model():
     sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1 ,use_karras_sigmas=True)
     vaeXL = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", safety_checker=None, use_safetensors=False, device_map='cpu') #.to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
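
The scheduler built here restates every beta parameter by hand. The same override is often written with the from_config idiom, which reuses the config shipped with the pipeline; a minimal sketch under that assumption (model id taken from the diff):

from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler

# Sketch: derive the ancestral scheduler from the pipeline's own scheduler
# config instead of re-listing beta_start/beta_end/steps_offset manually.
pipe = StableDiffusionXLPipeline.from_pretrained('ford442/RealVisXL_V5.0_BF16', add_watermarker=False)
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
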
@@ -242,14 +202,14 @@
     pipe.watermark=None
     pipe.safety_checker=None
 
-    ''' # Freeze vae and unet
+    ''' ''' # Freeze vae and unet
     pipe.vae.requires_grad_(False)
     pipe.unet.requires_grad_(False)
     pipe.text_encoder.requires_grad_(False)
     pipe.unet.eval()
     pipe.vae.eval()
     pipe.text_encoder.eval()
-    '''
+    ''' '''
     #pipe.unet = pipe.unet.to(memory_format=torch.contiguous_format)
     #pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/FLUX-dev-lora-add_details.safetensors", low_cpu_mem_usage=False)
     #pipe.unet.to(memory_format=torch.channels_last)
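
The ''' to ''' ''' edit re-balances the triple quotes around this block: a bare string expression at module or function scope is evaluated and discarded, so paired ''' markers act as a block comment. A minimal illustration:

# A bare triple-quoted string is parsed and thrown away at run time,
# so the statements inside the quotes never execute.
'''
print("inside a string literal, never executed")
'''
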
@@ -284,10 +244,11 @@
 # Preload and compile both models
 
 pipe = load_and_prepare_model()
+neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
 
-MAX_SEED = np.iinfo(np.int64).max
+'''
 
-neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
+MAX_SEED = np.iinfo(np.int64).max
 
 def upload_to_ftp(filename):
     try:
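
MAX_SEED bounds the seeds handed to the generator. A typical randomize-seed helper built on it (the helper below is illustrative, not part of this commit):

import random
import numpy as np

MAX_SEED = np.iinfo(np.int64).max

def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    # Illustrative: draw a fresh seed in [0, MAX_SEED] when requested.
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed
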
@@ -412,6 +373,88 @@ import time
 import gc
 
 
+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+
+upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
+
+def load_and_prepare_model():
+    sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1 ,use_karras_sigmas=True)
+    vaeXL = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", safety_checker=None, use_safetensors=False, device_map='cpu') #.to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
+    #vaeRV = AutoencoderKL.from_pretrained("SG161222/RealVisXL_V5.0", subfolder='vae', safety_checker=None, use_safetensors=False).to(device).to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear")
+    #txt_1 = CLIPTextModel.from_pretrained(device_map??)
+    #txt_2 = CLIPTextModel.from_pretrained(vae too?)
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler')
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        'ford442/RealVisXL_V5.0_BF16',
+        #torch_dtype=torch.bfloat16,
+        add_watermarker=False,
+        # low_cpu_mem_usage = False,
+        token = HF_TOKEN,
+        # scheduler = sched,
+    )
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1) #,use_karras_sigmas=True)
+    pipe.vae = vaeXL #.to(torch.bfloat16)
+    pipe.scheduler = sched
+
+    pipe.vae.do_resize = False
+    #pipe.vae.vae_scale_factor = 8
+    pipe.vae.do_convert_rgb = True
+
+    pipe.vae.set_default_attn_processor()
+    #pipe.to(device)
+    #pipe.to(torch.bfloat16)
+    print(f'init noise scale: {pipe.scheduler.init_noise_sigma}')
+    pipe.watermark=None
+    pipe.safety_checker=None
+
+    ''' # Freeze vae and unet
+    pipe.vae.requires_grad_(False)
+    pipe.unet.requires_grad_(False)
+    pipe.text_encoder.requires_grad_(False)
+    pipe.unet.eval()
+    pipe.vae.eval()
+    pipe.text_encoder.eval()
+    '''
+    #pipe.unet = pipe.unet.to(memory_format=torch.contiguous_format)
+    #pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/FLUX-dev-lora-add_details.safetensors", low_cpu_mem_usage=False)
+    #pipe.unet.to(memory_format=torch.channels_last)
+    #pipe.enable_vae_tiling()
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, mode='max-autotune') #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, mode='max-autotune-no-cudagraphs') #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, options={'epilogue_fusion': True, 'shape_padding': True}) #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, dynamic=False)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, options={"search_space": 0})
+    #pipe.unet = torch.compile(pipe.unet, backend="torch_tensorrt", dynamic=False, options={"precision": torch.bfloat16,"optimization_level": 4,})
+    pipe.to(torch.device('cuda:0'), torch.bfloat16)
+
+    return pipe
+
+#hidet.option.parallel_build(False)
+#hidet.option.parallel_tune(2,2.0)
+#torch._dynamo.config.suppress_errors = True
+#torch._dynamo.disallow_in_graph(diffusers.models.attention.BasicTransformerBlock)
+
+# more search
+#hidet.torch.dynamo_config.search_space(0)
+#hidet.torch.dynamo_config.dump_graph_ir("./local_graph")
+# hidet.option.cache_dir("local_cache")
+# automatically transform the model to use float16 data type
+#hidet.torch.dynamo_config.use_fp16(True)
+# use float16 data type as the accumulate data type in operators with reduction
+#hidet.torch.dynamo_config.use_fp16_reduction(True)
+# use tensorcore
+#hidet.torch.dynamo_config.use_tensor_core()
+#hidet.torch.dynamo_config.steal_weights(False)
+
+# Preload and compile both models
+
+pipe = load_and_prepare_model()
+neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
+
+
 @spaces.GPU(duration=40)
 def generate_30c(
     prompt: str,
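
Once pipe = load_and_prepare_model() has run, generation follows the standard diffusers SDXL call; a minimal sketch (prompt, step count, and guidance value are illustrative):

# Illustrative generation call; parameter values are examples only.
image = pipe(
    prompt="a photorealistic portrait, studio lighting",
    negative_prompt_2=neg_prompt_2,
    num_inference_steps=30,
    guidance_scale=5.0,
).images[0]
image.save("output.png")
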
@@ -856,23 +899,6 @@ description = "Text Generator Application by ecarbo"
 
 if __name__ == "__main__":
 
-    cuda_directories = find_cuda_directories()
-
-    if cuda_directories:
-        print("Found CUDA directories:")
-        for directory, version in cuda_directories.items():
-            print(f"- {directory}: Version {version}")
-    else:
-        print("No CUDA directories found in the specified paths.")
-
-
-
-    # Example of how to find the "best" CUDA path (customize logic)
-    if cuda_directories:
-        # Simple example: just pick the first one. You might have more sophisticated selection criteria
-        best_cuda_path = list(cuda_directories.keys())
-        print(f"Using CUDA path: {best_cuda_path}")
-
     demo_interface = demo.queue(max_size=50) # Remove .launch() here
 
     text_gen_interface = gr.Interface(
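
With the filesystem scan gone from __main__, the same information is available from torch itself; a minimal sketch:

import torch

# Report the CUDA toolkit torch was built against and the active GPU, if any.
if torch.cuda.is_available():
    print(f"CUDA (torch build): {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device available.")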
 