Update app.py

app.py CHANGED
@@ -6,51 +6,9 @@
# copies of the Software, and to permit persons to whom the Software is
import spaces
import os
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-
import subprocess
-
import re

-def find_cuda_directories(search_paths=None):
-    """Finds directories that contain "cuda" and a version number in their name.
-
-    Args:
-        search_paths: A list of directories to search. If None, uses common paths.
-
-    Returns:
-        A dictionary where keys are directory paths and values are extracted versions.
-        Returns an empty dictionary if no CUDA directories are found.
-    """
-
-    if search_paths is None:
-        # Common CUDA installation locations (customize as needed)
-        search_paths = [
-            "/usr/local",            # Linux
-            "/usr/lib",              # Linux
-            "/opt",                  # Linux
-            "/Program Files",        # Windows
-            "/Applications",         # macOS (less common)
-            os.path.expanduser("~")  # Check user's home directory
-        ]
-        if os.name == 'nt':  # Windows
-            search_paths.append("C:\\Program Files")
-            search_paths.append("C:\\Program Files (x86)")
-
-    cuda_dirs = {}
-
-    for path in search_paths:
-        if os.path.exists(path):  # Check if the path exists
-            for root, dirs, files in os.walk(path):  # Walk recursively
-                for dir_name in dirs:
-                    match = re.search(r"cuda(\d+(\.\d+)*)", dir_name, re.IGNORECASE)  # Regex for cuda and version
-                    if match:
-                        full_path = os.path.join(root, dir_name)
-                        version = match.group(1)
-                        cuda_dirs[full_path] = version
-
-    return cuda_dirs
-
#subprocess.run(['sh', './torch.sh'])

#import sys
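The deleted find_cuda_directories helper recursively walked /usr/local, /opt, Program Files, the user's home directory, and more on every startup, which is slow on large volumes. For comparison only (a sketch, not part of this commit; detect_cuda_toolkit is a hypothetical name), the same question can usually be answered through the CUDA_HOME/CUDA_PATH convention and nvcc on PATH:

import os
import shutil

def detect_cuda_toolkit():
    # Best-effort CUDA toolkit lookup without a filesystem walk.
    # Most installers export CUDA_HOME or CUDA_PATH.
    for var in ("CUDA_HOME", "CUDA_PATH"):
        root = os.environ.get(var)
        if root and os.path.isdir(root):
            return root
    # Fall back to nvcc on PATH; the toolkit root is two levels above bin/nvcc.
    nvcc = shutil.which("nvcc")
    if nvcc:
        return os.path.dirname(os.path.dirname(nvcc))
    return None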
@@ -168,11 +126,6 @@ FTP_PASS = os.getenv("FTP_PASS")
FTP_DIR = os.getenv("FTP_DIR")

# os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
-os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-
-upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))

def scheduler_swap_callback(pipeline, step_index, timestep, callback_kwargs):
    # adjust the batch_size of prompt_embeds according to guidance_scale
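One subtlety in the removed lines: os.putenv writes to the process environment but does not update the os.environ mapping, so code that later reads os.environ (including huggingface_hub, which snapshots HF_HUB_ENABLE_HF_TRANSFER into its constants at import time) may never see the flag. Assigning through os.environ does both, which is presumably why the SAFETENSORS_FAST_GPU line already uses that form. A minimal sketch of the safer ordering:

import os

# Assigning via os.environ updates the mapping *and* calls putenv under the
# hood; os.putenv alone leaves os.environ stale for in-process readers.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Import after setting the flag, since the library reads it at import time.
import huggingface_hub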
@@ -211,6 +164,13 @@ def scheduler_swap_callback(pipeline, step_index, timestep, callback_kwargs):
    # pipeline.scheduler._step_index = pipeline.num_timesteps * 0.9
    return {"latents": callback_kwargs["latents"]}

+'''
+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+
+upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
+
def load_and_prepare_model():
    sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1 ,use_karras_sigmas=True)
    vaeXL = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", safety_checker=None, use_safetensors=False, device_map='cpu') #.to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
@@ -242,14 +202,14 @@ def load_and_prepare_model():
    pipe.watermark=None
    pipe.safety_checker=None

-    '''
+    ''' ''' # Freeze vae and unet
    pipe.vae.requires_grad_(False)
    pipe.unet.requires_grad_(False)
    pipe.text_encoder.requires_grad_(False)
    pipe.unet.eval()
    pipe.vae.eval()
    pipe.text_encoder.eval()
-    '''
+    ''' '''
    #pipe.unet = pipe.unet.to(memory_format=torch.contiguous_format)
    #pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/FLUX-dev-lora-add_details.safetensors", low_cpu_mem_usage=False)
    #pipe.unet.to(memory_format=torch.channels_last)
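The change from ''' to ''' ''' around the freeze block is a toggle idiom: a lone triple quote turns everything up to the next triple quote into an unused string literal (commenting the block out), while ''' ''' is a complete throwaway string expression, so the six lines between the pair now execute. Illustrated:

# A complete pair on one line is just a discarded string; the next line runs:
''' '''
pipe.vae.requires_grad_(False)

# A lone opening quote absorbs everything up to the next triple quote,
# leaving the block as an unused string (effectively commented out):
'''
pipe.unet.requires_grad_(False)
'''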
@@ -284,10 +244,11 @@ def load_and_prepare_model():
# Preload and compile both models

pipe = load_and_prepare_model()
+neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "

-
+'''

-
+MAX_SEED = np.iinfo(np.int64).max

def upload_to_ftp(filename):
    try:
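MAX_SEED = np.iinfo(np.int64).max is the ceiling diffusers Spaces conventionally use when randomizing seeds; note, though, that the stray ''' added two lines above it opens a string literal, so whether this assignment actually runs depends on where the next triple quote falls in code outside this hunk. The typical call shape (a sketch; generate_30c's body is outside this hunk, and randomize_seed is an illustrative name) is:

import random
import numpy as np
import torch

MAX_SEED = np.iinfo(np.int64).max

def randomize_seed(seed: int, randomize: bool) -> int:
    # Draw a fresh seed on request, otherwise keep the caller's value.
    return random.randint(0, MAX_SEED) if randomize else seed

seed = randomize_seed(0, randomize=True)
generator = torch.Generator(device="cuda").manual_seed(seed)  # this Space assumes CUDA
# image = pipe(prompt, generator=generator).images[0]  # typical pipeline call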
@@ -412,6 +373,88 @@ import time
import gc


+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+
+upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
+
+def load_and_prepare_model():
+    sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1 ,use_karras_sigmas=True)
+    vaeXL = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", safety_checker=None, use_safetensors=False, device_map='cpu') #.to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
+    #vaeRV = AutoencoderKL.from_pretrained("SG161222/RealVisXL_V5.0", subfolder='vae', safety_checker=None, use_safetensors=False).to(device).to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear")
+    #txt_1 = CLIPTextModel.from_pretrained(device_map??)
+    #txt_2 = CLIPTextModel.from_pretrained(vae too?)
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler')
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        'ford442/RealVisXL_V5.0_BF16',
+        #torch_dtype=torch.bfloat16,
+        add_watermarker=False,
+        # low_cpu_mem_usage = False,
+        token = HF_TOKEN,
+        # scheduler = sched,
+    )
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1) #,use_karras_sigmas=True)
+    pipe.vae = vaeXL #.to(torch.bfloat16)
+    pipe.scheduler = sched
+
+    pipe.vae.do_resize = False
+    #pipe.vae.vae_scale_factor = 8
+    pipe.vae.do_convert_rgb = True
+
+    pipe.vae.set_default_attn_processor()
+    #pipe.to(device)
+    #pipe.to(torch.bfloat16)
+    print(f'init noise scale: {pipe.scheduler.init_noise_sigma}')
+    pipe.watermark=None
+    pipe.safety_checker=None
+
+    ''' # Freeze vae and unet
+    pipe.vae.requires_grad_(False)
+    pipe.unet.requires_grad_(False)
+    pipe.text_encoder.requires_grad_(False)
+    pipe.unet.eval()
+    pipe.vae.eval()
+    pipe.text_encoder.eval()
+    '''
+    #pipe.unet = pipe.unet.to(memory_format=torch.contiguous_format)
+    #pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/FLUX-dev-lora-add_details.safetensors", low_cpu_mem_usage=False)
+    #pipe.unet.to(memory_format=torch.channels_last)
+    #pipe.enable_vae_tiling()
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, mode='max-autotune') #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, mode='max-autotune-no-cudagraphs') #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, options={'epilogue_fusion': True, 'shape_padding': True}) #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, dynamic=False)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, options={"search_space": 0})
+    #pipe.unet = torch.compile(pipe.unet, backend="torch_tensorrt", dynamic=False, options={"precision": torch.bfloat16,"optimization_level": 4,})
+    pipe.to(torch.device('cuda:0'), torch.bfloat16)
+
+    return pipe
+
+#hidet.option.parallel_build(False)
+#hidet.option.parallel_tune(2,2.0)
+#torch._dynamo.config.suppress_errors = True
+#torch._dynamo.disallow_in_graph(diffusers.models.attention.BasicTransformerBlock)
+
+# more search
+#hidet.torch.dynamo_config.search_space(0)
+#hidet.torch.dynamo_config.dump_graph_ir("./local_graph")
+# hidet.option.cache_dir("local_cache")
+# automatically transform the model to use float16 data type
+#hidet.torch.dynamo_config.use_fp16(True)
+# use float16 data type as the accumulate data type in operators with reduction
+#hidet.torch.dynamo_config.use_fp16_reduction(True)
+# use tensorcore
+#hidet.torch.dynamo_config.use_tensor_core()
+#hidet.torch.dynamo_config.steal_weights(False)
+
+# Preload and compile both models
+
+pipe = load_and_prepare_model()
+neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
+
+
@spaces.GPU(duration=40)
def generate_30c(
    prompt: str,
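@spaces.GPU(duration=40) is the Hugging Face ZeroGPU decorator: a GPU is attached to the process only while a decorated function executes, for at most the requested number of seconds. A minimal sketch of the pattern (the real generate_30c signature continues past this hunk; pipe is the module-level pipeline built above):

import spaces
import torch

@spaces.GPU(duration=40)  # borrow a ZeroGPU device for up to 40 s per call
def generate(prompt: str):
    # CUDA work belongs inside the decorated function.
    generator = torch.Generator(device="cuda").manual_seed(0)
    return pipe(prompt, num_inference_steps=30, generator=generator).images[0]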
@@ -856,23 +899,6 @@ description = "Text Generator Application by ecarbo"

if __name__ == "__main__":

-    cuda_directories = find_cuda_directories()
-
-    if cuda_directories:
-        print("Found CUDA directories:")
-        for directory, version in cuda_directories.items():
-            print(f"- {directory}: Version {version}")
-    else:
-        print("No CUDA directories found in the specified paths.")
-
-
-
-    # Example of how to find the "best" CUDA path (customize logic)
-    if cuda_directories:
-        # Simple example: just pick the first one. You might have more sophisticated selection criteria
-        best_cuda_path = list(cuda_directories.keys())
-        print(f"Using CUDA path: {best_cuda_path}")
-
    demo_interface = demo.queue(max_size=50) # Remove .launch() here

    text_gen_interface = gr.Interface(
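Incidentally, the removed "best path" logic carried a latent bug that this deletion moots: despite the pick-the-first-one comment, best_cuda_path = list(cuda_directories.keys()) binds the whole list, not its first element; list(cuda_directories.keys())[0] (or next(iter(cuda_directories))) was presumably intended.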