RealVis_v5.0_BF16_IP_B

Running on Zero

App Files Files Community

1inkusFace committed on Jan 20

Commit

4167ce8

verified ·

1 Parent(s): 631e75c

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -2

app.py CHANGED Viewed

@@ -4,6 +4,7 @@
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 import spaces
 import os
 import random
@@ -20,7 +21,7 @@ from ip_adapter import IPAdapterXL
 from huggingface_hub import snapshot_download
 import torch
 from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from transformers import CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -165,7 +166,9 @@ captioner_3 = pipeline(model="Salesforce/blip-image-captioning-large",device='cu
  #model5 = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b-coco").to('cuda')
 #processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b").to(torch.bfloat16).to('cuda')
  #processor5 = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
@@ -175,6 +178,27 @@ MAX_SEED = np.iinfo(np.int32).max
 neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
 def upload_to_ftp(filename):
     try:
         transport = paramiko.Transport((FTP_HOST, 22))
@@ -277,6 +301,62 @@ def generate_30(
         filename= f'rv_IP_{timestamp}.png'
         print("-- using image file --")
         print(caption)
         print('-- generating image --')
         sd_image = ip_model.generate(
                 pil_image_1=sd_image_a,

 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 import spaces
 import os
 import random
 from huggingface_hub import snapshot_download
 import torch
 from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
 torch.backends.cuda.matmul.allow_tf32 = False
 torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
  #model5 = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b-coco").to('cuda')
 #processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b").to(torch.bfloat16).to('cuda')
  #processor5 = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")
+txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
+txt_tokenizer.tokenizer_legacy=False
+model = AutoModelForCausalLM.from_pretrained(checkpoint).to('cuda')
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
 neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
def filter_text(text, phraseC):
    """Strip echoed instruction text from a generated prompt.

    The function looks for two case-sensitive markers in order:

    1. ``"rewritten text:"`` -- everything up to and including its first
       occurrence is discarded.
    2. ``"Rewritten Prompt:"`` -- searched for in what remains; everything
       up to and including its first occurrence is discarded as well.

    Args:
        text: The decoded model output to clean up.
        phraseC: A literal substring (the call sites pass the original
            prompt) that is removed from the result. It is only removed
            when BOTH markers were found.

    Returns:
        * ``text`` unchanged when ``"rewritten text:"`` is absent.
        * The part after ``"rewritten text:"`` when ``"Rewritten Prompt:"``
          is absent from that part.
        * Otherwise the part after ``"Rewritten Prompt:"`` with every
          literal occurrence of ``phraseC`` removed.
    """
    outer_marker = "rewritten text:"
    inner_marker = "Rewritten Prompt:"

    # partition() splits on the FIRST occurrence and spans newlines, which
    # is what the previous lazy DOTALL regex did.
    _, outer_hit, after_outer = text.partition(outer_marker)
    if not outer_hit:
        # Marker not present: nothing to filter.
        return text

    _, inner_hit, after_inner = after_outer.partition(inner_marker)
    if not inner_hit:
        return after_outer

    # phraseC is arbitrary user text, so it must be removed literally.
    # Treating it as a regex pattern breaks on characters like "(" or "?".
    if phraseC:
        after_inner = after_inner.replace(phraseC, "")
    return after_inner
 def upload_to_ftp(filename):
     try:
         transport = paramiko.Transport((FTP_HOST, 22))
         filename= f'rv_IP_{timestamp}.png'
         print("-- using image file --")
         print(caption)
+        print("-- generating further caption --")
+        system_prompt_rewrite = (
+            "You are an AI assistant that rewrites image prompts to be more descriptive and detailed."
+        )
+        user_prompt_rewrite = (
+            "Rewrite this prompt to be more descriptive and detailed and only return the rewritten text: "
+        )
+        user_prompt_rewrite_2 = (
+            "Rephrase this scene to have more elaborate details: "
+        )
+        input_text = f"{system_prompt_rewrite} {user_prompt_rewrite} {prompt}"
+        input_text_2 = f"{system_prompt_rewrite} {user_prompt_rewrite_2} {prompt}"
+        print("-- got prompt --")
+        # Encode the input text and include the attention mask
+        encoded_inputs = tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
+        encoded_inputs_2 = tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
+        # Ensure all values are on the correct device
+        input_ids = encoded_inputs["input_ids"].to(device)
+        input_ids_2 = encoded_inputs_2["input_ids"].to(device)
+        attention_mask = encoded_inputs["attention_mask"].to(device)
+        attention_mask_2 = encoded_inputs_2["attention_mask"].to(device)
+        print("-- tokenize prompt --")
+          # Google T5
+        #input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
+        outputs = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=512,
+            temperature=0.2,
+            top_p=0.9,
+            do_sample=True,
+        )
+        outputs_2 = model.generate(
+            input_ids=input_ids_2,
+            attention_mask=attention_mask_2,
+            max_new_tokens=65,
+            temperature=0.2,
+            top_p=0.9,
+            do_sample=True,
+        )
+        # Use the encoded tensor 'text_inputs' here
+        enhanced_prompt = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        enhanced_prompt_2 = tokenizer.decode(outputs_2[0], skip_special_tokens=True)
+        print('-- generated prompt --')
+        enhanced_prompt = filter_text(enhanced_prompt,prompt)
+        enhanced_prompt_2 = filter_text(enhanced_prompt_2,prompt)
+        print('-- filtered prompt --')
+        print(enhanced_prompt)
+        print('-- filtered prompt 2 --')
+        print(enhanced_prompt_2)
         print('-- generating image --')
         sd_image = ip_model.generate(
                 pil_image_1=sd_image_a,