Update app.py

app.py CHANGED
@@ -53,8 +53,8 @@ torch_dtype = torch.bfloat16
 
 checkpoint = "microsoft/Phi-3.5-mini-instruct"
 #vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-
-vae = AutoencoderKL.from_pretrained("ford442/sdxl-vae-bf16")
+vae = AutoencoderKL.from_pretrained("ford442/sdxl-vae-bf16", torch_dtype=torch.bfloat16).to(torch.device("cuda:0"))
+#vae = AutoencoderKL.from_pretrained("ford442/sdxl-vae-bf16")
 
 pipe = StableDiffusion3Pipeline.from_pretrained("ford442/stable-diffusion-3.5-medium-bf16", torch_dtype=torch.bfloat16).to(torch.device("cuda:0"))
 #pipe = StableDiffusion3Pipeline.from_pretrained("ford442/stable-diffusion-3.5-medium-bf16").to(torch.device("cuda:0"))
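The change loads the VAE with an explicit bfloat16 dtype and moves it to the GPU at load time, matching how the pipeline itself is loaded. A minimal sketch of that pattern, assuming `torch`, `diffusers`, and a CUDA device (model IDs as in the diff):

```python
import torch
from diffusers import AutoencoderKL, StableDiffusion3Pipeline

device = torch.device("cuda:0")

# Passing torch_dtype at from_pretrained time loads the weights directly
# in bfloat16, instead of loading fp32 and casting afterwards.
vae = AutoencoderKL.from_pretrained(
    "ford442/sdxl-vae-bf16", torch_dtype=torch.bfloat16
).to(device)

pipe = StableDiffusion3Pipeline.from_pretrained(
    "ford442/stable-diffusion-3.5-medium-bf16", torch_dtype=torch.bfloat16
).to(device)

print(vae.dtype, pipe.transformer.dtype)  # both report torch.bfloat16
```

The removed line loaded the VAE without a dtype argument, which defaults to fp32 and doubles its memory footprint.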
@@ -131,15 +131,20 @@ def infer(
     user_prompt_rewrite = (
         "Rewrite this prompt to be more descriptive and detailed and only return the rewritten text: "
     )
+    user_prompt_rewrite_2 = (
+        "Rephrase this scene to have more elaborate details: "
+    )
     input_text = f"{system_prompt_rewrite} {user_prompt_rewrite} {prompt}"
+    input_text_2 = f"{system_prompt_rewrite} {user_prompt_rewrite_2} {prompt}"
     print("-- got prompt --")
     # Encode the input text and include the attention mask
-    encoded_inputs = tokenizer(
-        input_text, return_tensors="pt", return_attention_mask=True
-    )
+    encoded_inputs = tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
+    encoded_inputs_2 = tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
     # Ensure all values are on the correct device
     input_ids = encoded_inputs["input_ids"].to(device)
+    input_ids_2 = encoded_inputs_2["input_ids"].to(device)
     attention_mask = encoded_inputs["attention_mask"].to(device)
+    attention_mask_2 = encoded_inputs_2["attention_mask"].to(device)
     print("-- tokenize prompt --")
     # Google T5
     #input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
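The tokenizer is now called with `return_attention_mask=True` and each tensor is moved to the device by hand, once per rewrite instruction. A hypothetical helper (not in app.py) that factors out the repeated pattern; `tokenizer` and `device` are the Phi-3.5 tokenizer and CUDA device the script already defines:

```python
def encode_rewrite(system_prompt, instruction, prompt, tokenizer, device):
    """Tokenize one rewrite request and return device-resident tensors."""
    text = f"{system_prompt} {instruction} {prompt}"
    enc = tokenizer(text, return_tensors="pt", return_attention_mask=True)
    return enc["input_ids"].to(device), enc["attention_mask"].to(device)

# Usage mirroring the two variants in the diff:
# input_ids, attention_mask = encode_rewrite(
#     system_prompt_rewrite, user_prompt_rewrite, prompt, tokenizer, device)
# input_ids_2, attention_mask_2 = encode_rewrite(
#     system_prompt_rewrite, user_prompt_rewrite_2, prompt, tokenizer, device)
```

Passing the attention mask explicitly also avoids the generation-time warning transformers emits when the pad and eos tokens coincide, as they do for many instruct checkpoints.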
@@ -151,12 +156,24 @@ def infer(
         top_p=0.9,
         do_sample=True,
     )
+    outputs_2 = model.generate(
+        input_ids=input_ids_2,
+        attention_mask=attention_mask_2,
+        max_new_tokens=65,
+        temperature=0.2,
+        top_p=0.9,
+        do_sample=True,
+    )
     # Use the encoded tensor 'text_inputs' here
     enhanced_prompt = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    enhanced_prompt_2 = tokenizer.decode(outputs_2[0], skip_special_tokens=True)
     print('-- generated prompt --')
     enhanced_prompt = filter_text(enhanced_prompt,prompt)
+    enhanced_prompt_2 = filter_text(enhanced_prompt_2,prompt)
     print('-- filtered prompt --')
     print(enhanced_prompt)
+    print('-- filtered prompt 2 --')
+    print(enhanced_prompt_2)
     if latent_file: # Check if a latent file is provided
         # initial_latents = pipe.prepare_latents(
         #     batch_size=1,
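`outputs_2` comes from a second `model.generate` call identical to the first except for its inputs, and both results go through the same decode-and-filter steps. A sketch that folds the sequence into one function, assuming the `model`, `tokenizer`, and `filter_text` already defined in app.py:

```python
def rewrite(input_ids, attention_mask, original_prompt):
    # temperature/top_p only take effect because do_sample=True;
    # max_new_tokens=65 caps the length of the rewrite.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=65,
        temperature=0.2,
        top_p=0.9,
        do_sample=True,
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return filter_text(text, original_prompt)

# enhanced_prompt = rewrite(input_ids, attention_mask, prompt)
# enhanced_prompt_2 = rewrite(input_ids_2, attention_mask_2, prompt)
```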
@@ -188,7 +205,7 @@ def infer(
     with torch.no_grad():
         sd_image = pipe(
             prompt=enhanced_prompt, # This conversion is fine
-            prompt_2=
+            prompt_2=enhanced_prompt_2,
             prompt_3=prompt,
             negative_prompt=negative_prompt,
             guidance_scale=guidance_scale,
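`StableDiffusion3Pipeline` exposes three prompt slots because SD3 conditions on three text encoders: in diffusers, `prompt` feeds the CLIP-L encoder, `prompt_2` the OpenCLIP bigG encoder, and `prompt_3` the T5 encoder, with unset slots falling back to `prompt`. The change routes a different rewrite to each one; variable names below are those already defined in `infer()`:

```python
sd_image = pipe(
    prompt=enhanced_prompt,      # first Phi-3.5 rewrite  -> CLIP-L
    prompt_2=enhanced_prompt_2,  # second Phi-3.5 rewrite -> OpenCLIP bigG
    prompt_3=prompt,             # original user prompt   -> T5
    negative_prompt=negative_prompt,
    guidance_scale=guidance_scale,
    # ... remaining arguments as in app.py ...
)
```

Keeping the untouched user prompt on the T5 slot preserves the literal request while the CLIP slots carry the embellished variants.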
@@ -213,13 +230,13 @@ def infer(
     upload_to_ftp(latent_path)
     #refiner.scheduler.set_timesteps(num_inference_steps,device)
     refine = refiner(
-        prompt=f"{
+        prompt=f"{enhanced_prompt_2}, high quality masterpiece, complex details",
         negative_prompt = negative_prompt,
         guidance_scale=7.5,
         num_inference_steps=num_inference_steps,
         image=sd_image,
         generator=generator,
-    ).images[0]
+    ).images[0]
     refine_path = f"sd35m_refine_{seed}.png"
     refine.save(refine_path,optimize=False,compress_level=0)
     upload_to_ftp(refine_path)
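For the refiner pass only the prompt line changed: it now uses the second rewrite plus a fixed quality suffix. The same tail with the reasoning spelled out in comments (`refiner`, `upload_to_ftp`, and the other names come from app.py):

```python
refine = refiner(
    prompt=f"{enhanced_prompt_2}, high quality masterpiece, complex details",
    negative_prompt=negative_prompt,
    guidance_scale=7.5,                       # fixed here, unlike the base pass
    num_inference_steps=num_inference_steps,
    image=sd_image,                           # img2img over the base output
    generator=generator,                      # reuse the seeded generator
).images[0]
refine_path = f"sd35m_refine_{seed}.png"
# compress_level=0 skips zlib compression: fastest save, largest file;
# PNG stays lossless at any level.
refine.save(refine_path, optimize=False, compress_level=0)
upload_to_ftp(refine_path)
```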