ford442 committed
Commit fcb861a · verified · 1 Parent(s): ec46a0b

Update app.py

Files changed (1):
  1. app.py +94 -68

app.py CHANGED
@@ -6,51 +6,9 @@
 # copies of the Software, and to permit persons to whom the Software is
 import spaces
 import os
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-
 import subprocess
-
 import re
 
-def find_cuda_directories(search_paths=None):
-    """Finds directories that contain "cuda" and a version number in their name.
-
-    Args:
-        search_paths: A list of directories to search. If None, uses common paths.
-
-    Returns:
-        A dictionary where keys are directory paths and values are extracted versions.
-        Returns an empty dictionary if no CUDA directories are found.
-    """
-
-    if search_paths is None:
-        # Common CUDA installation locations (customize as needed)
-        search_paths = [
-            "/usr/local",  # Linux
-            "/usr/lib",  # Linux
-            "/opt",  # Linux
-            "/Program Files",  # Windows
-            "/Applications",  # macOS (less common)
-            os.path.expanduser("~")  # Check user's home directory
-        ]
-        if os.name == 'nt': #Windows
-            search_paths.append("C:\\Program Files")
-            search_paths.append("C:\\Program Files (x86)")
-
-    cuda_dirs = {}
-
-    for path in search_paths:
-        if os.path.exists(path):  # Check if the path exists
-            for root, dirs, files in os.walk(path):  # Walk recursively
-                for dir_name in dirs:
-                    match = re.search(r"cuda(\d+(\.\d+)*)", dir_name, re.IGNORECASE)  # Regex for cuda and version
-                    if match:
-                        full_path = os.path.join(root, dir_name)
-                        version = match.group(1)
-                        cuda_dirs[full_path] = version
-
-    return cuda_dirs
-
 #subprocess.run(['sh', './torch.sh'])
 
 #import sys
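
The helper removed above walks a set of well-known install roots and returns a {path: version} map. For reference, a minimal sketch of how it could be exercised (the search root here is illustrative):

# Illustrative call to the removed find_cuda_directories() helper;
# "/usr/local" is just an example root.
found = find_cuda_directories(search_paths=["/usr/local"])
for path, version in found.items():
    print(f"{path}: CUDA {version}")
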
@@ -168,11 +126,6 @@ FTP_PASS = os.getenv("FTP_PASS")
 FTP_DIR = os.getenv("FTP_DIR")
 
 # os.putenv('TORCH_LINALG_PREFER_CUSOLVER','1')
-os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-os.environ["SAFETENSORS_FAST_GPU"] = "1"
-
-upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
 
 def scheduler_swap_callback(pipeline, step_index, timestep, callback_kwargs):
     # adjust the batch_size of prompt_embeds according to guidance_scale
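
One caveat on the removed lines: os.putenv() writes to the process environment without updating os.environ, so a later os.getenv() in the same interpreter may not see the value. Assigning through os.environ keeps both in sync; a minimal sketch:

import os

# Assignment to os.environ also calls putenv() under the hood, so the
# Python-level mapping and the C-level environment stay consistent.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'
os.environ['SAFETENSORS_FAST_GPU'] = '1'
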
@@ -211,6 +164,13 @@ def scheduler_swap_callback(pipeline, step_index, timestep, callback_kwargs):
     # pipeline.scheduler._step_index = pipeline.num_timesteps * 0.9
     return {"latents": callback_kwargs["latents"]}
 
+'''
+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+
+upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
+
 def load_and_prepare_model():
     sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1 ,use_karras_sigmas=True)
     vaeXL = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", safety_checker=None, use_safetensors=False, device_map='cpu') #.to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
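
The scheduler built here restates every beta parameter by hand. The same override is often written with the from_config idiom, which reuses the config shipped with the pipeline; a minimal sketch under that assumption (model id taken from the diff):

from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler

# Sketch: derive the ancestral scheduler from the pipeline's own scheduler
# config instead of re-listing beta_start/beta_end/steps_offset manually.
pipe = StableDiffusionXLPipeline.from_pretrained('ford442/RealVisXL_V5.0_BF16', add_watermarker=False)
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
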
@@ -242,14 +202,14 @@
     pipe.watermark=None
     pipe.safety_checker=None
 
-    ''' # Freeze vae and unet
+    ''' ''' # Freeze vae and unet
     pipe.vae.requires_grad_(False)
     pipe.unet.requires_grad_(False)
     pipe.text_encoder.requires_grad_(False)
     pipe.unet.eval()
     pipe.vae.eval()
     pipe.text_encoder.eval()
-    '''
+    ''' '''
     #pipe.unet = pipe.unet.to(memory_format=torch.contiguous_format)
     #pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/FLUX-dev-lora-add_details.safetensors", low_cpu_mem_usage=False)
     #pipe.unet.to(memory_format=torch.channels_last)
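
The ''' to ''' ''' edit re-balances the triple quotes around this block: a bare string expression at module or function scope is evaluated and discarded, so paired ''' markers act as a block comment. A minimal illustration:

# A bare triple-quoted string is parsed and thrown away at run time,
# so the statements inside the quotes never execute.
'''
print("inside a string literal, never executed")
'''
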
@@ -284,10 +244,11 @@
 # Preload and compile both models
 
 pipe = load_and_prepare_model()
+neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
 
-MAX_SEED = np.iinfo(np.int64).max
+'''
 
-neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
+MAX_SEED = np.iinfo(np.int64).max
 
 def upload_to_ftp(filename):
     try:
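
MAX_SEED bounds the seeds handed to the generator. A typical randomize-seed helper built on it (the helper below is illustrative, not part of this commit):

import random
import numpy as np

MAX_SEED = np.iinfo(np.int64).max

def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    # Illustrative: draw a fresh seed in [0, MAX_SEED] when requested.
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed
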
@@ -412,6 +373,88 @@ import time
 import gc
 
 
+os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+os.environ["SAFETENSORS_FAST_GPU"] = "1"
+
+upscaler = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(torch.device("cuda:0"))
+
+def load_and_prepare_model():
+    sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1 ,use_karras_sigmas=True)
+    vaeXL = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", safety_checker=None, use_safetensors=False, device_map='cpu') #.to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
+    #vaeRV = AutoencoderKL.from_pretrained("SG161222/RealVisXL_V5.0", subfolder='vae', safety_checker=None, use_safetensors=False).to(device).to(torch.bfloat16) #.to(device=device, dtype=torch.bfloat16)
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear")
+    #txt_1 = CLIPTextModel.from_pretrained(device_map??)
+    #txt_2 = CLIPTextModel.from_pretrained(vae too?)
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler')
+    pipe = StableDiffusionXLPipeline.from_pretrained(
+        'ford442/RealVisXL_V5.0_BF16',
+        #torch_dtype=torch.bfloat16,
+        add_watermarker=False,
+        # low_cpu_mem_usage = False,
+        token = HF_TOKEN,
+        # scheduler = sched,
+    )
+    #sched = EulerAncestralDiscreteScheduler.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='scheduler',beta_schedule="scaled_linear", beta_start=0.00085, beta_end=0.012, steps_offset=1) #,use_karras_sigmas=True)
+    pipe.vae = vaeXL #.to(torch.bfloat16)
+    pipe.scheduler = sched
+
+    pipe.vae.do_resize = False
+    #pipe.vae.vae_scale_factor = 8
+    pipe.vae.do_convert_rgb = True
+
+    pipe.vae.set_default_attn_processor()
+    #pipe.to(device)
+    #pipe.to(torch.bfloat16)
+    print(f'init noise scale: {pipe.scheduler.init_noise_sigma}')
+    pipe.watermark=None
+    pipe.safety_checker=None
+
+    ''' # Freeze vae and unet
+    pipe.vae.requires_grad_(False)
+    pipe.unet.requires_grad_(False)
+    pipe.text_encoder.requires_grad_(False)
+    pipe.unet.eval()
+    pipe.vae.eval()
+    pipe.text_encoder.eval()
+    '''
+    #pipe.unet = pipe.unet.to(memory_format=torch.contiguous_format)
+    #pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/FLUX-dev-lora-add_details.safetensors", low_cpu_mem_usage=False)
+    #pipe.unet.to(memory_format=torch.channels_last)
+    #pipe.enable_vae_tiling()
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, mode='max-autotune') #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, mode='max-autotune-no-cudagraphs') #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, options={'epilogue_fusion': True, 'shape_padding': True}) #.to(device=device, dtype=torch.bfloat16)
+    #pipe.unet = torch.compile(pipe.unet, dynamic=False)
+    #pipe.unet = torch.compile(pipe.unet, backend="hidet", dynamic=False, options={"search_space": 0})
+    #pipe.unet = torch.compile(pipe.unet, backend="torch_tensorrt", dynamic=False, options={"precision": torch.bfloat16,"optimization_level": 4,})
+    pipe.to(torch.device('cuda:0'), torch.bfloat16)
+
+    return pipe
+
+#hidet.option.parallel_build(False)
+#hidet.option.parallel_tune(2,2.0)
+#torch._dynamo.config.suppress_errors = True
+#torch._dynamo.disallow_in_graph(diffusers.models.attention.BasicTransformerBlock)
+
+# more search
+#hidet.torch.dynamo_config.search_space(0)
+#hidet.torch.dynamo_config.dump_graph_ir("./local_graph")
+# hidet.option.cache_dir("local_cache")
+# automatically transform the model to use float16 data type
+#hidet.torch.dynamo_config.use_fp16(True)
+# use float16 data type as the accumulate data type in operators with reduction
+#hidet.torch.dynamo_config.use_fp16_reduction(True)
+# use tensorcore
+#hidet.torch.dynamo_config.use_tensor_core()
+#hidet.torch.dynamo_config.steal_weights(False)
+
+# Preload and compile both models
+
+pipe = load_and_prepare_model()
+neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
+
+
 @spaces.GPU(duration=40)
 def generate_30c(
     prompt: str,
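
Once pipe = load_and_prepare_model() has run, generation follows the standard diffusers SDXL call; a minimal sketch (prompt, step count, and guidance value are illustrative):

# Illustrative generation call; parameter values are examples only.
image = pipe(
    prompt="a photorealistic portrait, studio lighting",
    negative_prompt_2=neg_prompt_2,
    num_inference_steps=30,
    guidance_scale=5.0,
).images[0]
image.save("output.png")
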
@@ -856,23 +899,6 @@ description = "Text Generator Application by ecarbo"
 
 if __name__ == "__main__":
 
-    cuda_directories = find_cuda_directories()
-
-    if cuda_directories:
-        print("Found CUDA directories:")
-        for directory, version in cuda_directories.items():
-            print(f"- {directory}: Version {version}")
-    else:
-        print("No CUDA directories found in the specified paths.")
-
-
-
-    # Example of how to find the "best" CUDA path (customize logic)
-    if cuda_directories:
-        # Simple example: just pick the first one. You might have more sophisticated selection criteria
-        best_cuda_path = list(cuda_directories.keys())
-        print(f"Using CUDA path: {best_cuda_path}")
-
     demo_interface = demo.queue(max_size=50) # Remove .launch() here
 
     text_gen_interface = gr.Interface(
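
With the filesystem scan gone from __main__, the same information is available from torch itself; a minimal sketch:

import torch

# Report the CUDA toolkit torch was built against and the active GPU, if any.
if torch.cuda.is_available():
    print(f"CUDA (torch build): {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device available.")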
 