1inkusFace committed on
Commit
4167ce8
·
verified ·
1 Parent(s): 631e75c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -2
app.py CHANGED
@@ -4,6 +4,7 @@
4
  # in the Software without restriction, including without limitation the rights
5
  # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
6
  # copies of the Software, and to permit persons to whom the Software is
 
7
  import spaces
8
  import os
9
  import random
@@ -20,7 +21,7 @@ from ip_adapter import IPAdapterXL
20
  from huggingface_hub import snapshot_download
21
  import torch
22
  from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
23
- from transformers import CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
24
 
25
  torch.backends.cuda.matmul.allow_tf32 = False
26
  torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
@@ -165,7 +166,9 @@ captioner_3 = pipeline(model="Salesforce/blip-image-captioning-large",device='cu
165
  #model5 = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b-coco").to('cuda')
166
  #processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b").to(torch.bfloat16).to('cuda')
167
  #processor5 = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")
168
-
 
 
169
 
170
  ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
171
  text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
@@ -175,6 +178,27 @@ MAX_SEED = np.iinfo(np.int32).max
175
 
176
  neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def upload_to_ftp(filename):
179
  try:
180
  transport = paramiko.Transport((FTP_HOST, 22))
@@ -277,6 +301,62 @@ def generate_30(
277
  filename= f'rv_IP_{timestamp}.png'
278
  print("-- using image file --")
279
  print(caption)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  print('-- generating image --')
281
  sd_image = ip_model.generate(
282
  pil_image_1=sd_image_a,
 
4
  # in the Software without restriction, including without limitation the rights
5
  # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
6
  # copies of the Software, and to permit persons to whom the Software is
7
+
8
  import spaces
9
  import os
10
  import random
 
21
  from huggingface_hub import snapshot_download
22
  import torch
23
  from diffusers import AutoencoderKL, StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
24
+ from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPTextModelWithProjection, CLIPTextModel, Blip2Processor, Blip2ForConditionalGeneration, pipeline
25
 
26
  torch.backends.cuda.matmul.allow_tf32 = False
27
  torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
 
166
  #model5 = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b-coco").to('cuda')
167
  #processor5 = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b").to(torch.bfloat16).to('cuda')
168
  #processor5 = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b-coco")
169
+ txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
170
+ txt_tokenizer.tokenizer_legacy=False
171
+ model = AutoModelForCausalLM.from_pretrained(checkpoint).to('cuda')
172
 
173
  ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
174
  text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
 
178
 
179
  neg_prompt_2 = " 'non-photorealistic':1.5, 'unrealistic skin','unattractive face':1.3, 'low quality':1.1, ('dull color scheme', 'dull colors', 'digital noise':1.2),'amateurish', 'poorly drawn face':1.3, 'poorly drawn', 'distorted face', 'low resolution', 'simplistic' "
180
 
181
def filter_text(text, phraseC):
    """Strip the echoed instruction preamble from a generated prompt.

    The text is cut in two stages:

    1. Everything up to and including the first ``"rewritten text:"`` is
       dropped. If that marker is absent, ``text`` is returned unchanged.
    2. Within the remainder, everything up to and including the first
       ``"Rewritten Prompt:"`` is dropped and every occurrence of
       ``phraseC`` is removed. If that second marker is absent, the
       remainder from stage 1 is returned as-is (``phraseC`` is NOT
       removed in that case).

    Args:
        text: Decoded model output to clean up.
        phraseC: Literal substring to remove from the final result. It is
            matched verbatim, not as a regular expression, so
            characters such as ``(``, ``[`` or ``*`` are safe. An empty or
            ``None`` value removes nothing.

    Returns:
        The filtered string, or ``text`` itself when the first marker is
        not found.
    """
    phrase = "Rewritten Prompt:"
    phraseB = "rewritten text:"

    # str.partition splits on the FIRST occurrence and keeps newlines in the
    # tail, matching the original non-greedy DOTALL regex search.
    _, marker_b, remainder = text.partition(phraseB)
    if not marker_b:
        # Handle the case where no match is found.
        return text

    _, marker, tail = remainder.partition(phrase)
    if not marker:
        return remainder

    # Remove the phrase literally: treating it as a regex pattern would raise
    # re.error or silently mis-match on metacharacters.
    if phraseC:
        tail = tail.replace(phraseC, "")
    return tail
201
+
202
  def upload_to_ftp(filename):
203
  try:
204
  transport = paramiko.Transport((FTP_HOST, 22))
 
301
  filename= f'rv_IP_{timestamp}.png'
302
  print("-- using image file --")
303
  print(caption)
304
+ print("-- generating further caption --")
305
+
306
+
307
+ system_prompt_rewrite = (
308
+ "You are an AI assistant that rewrites image prompts to be more descriptive and detailed."
309
+ )
310
+ user_prompt_rewrite = (
311
+ "Rewrite this prompt to be more descriptive and detailed and only return the rewritten text: "
312
+ )
313
+ user_prompt_rewrite_2 = (
314
+ "Rephrase this scene to have more elaborate details: "
315
+ )
316
+ input_text = f"{system_prompt_rewrite} {user_prompt_rewrite} {prompt}"
317
+ input_text_2 = f"{system_prompt_rewrite} {user_prompt_rewrite_2} {prompt}"
318
+ print("-- got prompt --")
319
+ # Encode the input text and include the attention mask
320
+ encoded_inputs = tokenizer(input_text, return_tensors="pt", return_attention_mask=True)
321
+ encoded_inputs_2 = tokenizer(input_text_2, return_tensors="pt", return_attention_mask=True)
322
+ # Ensure all values are on the correct device
323
+ input_ids = encoded_inputs["input_ids"].to(device)
324
+ input_ids_2 = encoded_inputs_2["input_ids"].to(device)
325
+ attention_mask = encoded_inputs["attention_mask"].to(device)
326
+ attention_mask_2 = encoded_inputs_2["attention_mask"].to(device)
327
+ print("-- tokenize prompt --")
328
+ # Google T5
329
+ #input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
330
+ outputs = model.generate(
331
+ input_ids=input_ids,
332
+ attention_mask=attention_mask,
333
+ max_new_tokens=512,
334
+ temperature=0.2,
335
+ top_p=0.9,
336
+ do_sample=True,
337
+ )
338
+ outputs_2 = model.generate(
339
+ input_ids=input_ids_2,
340
+ attention_mask=attention_mask_2,
341
+ max_new_tokens=65,
342
+ temperature=0.2,
343
+ top_p=0.9,
344
+ do_sample=True,
345
+ )
346
+ # Use the encoded tensor 'text_inputs' here
347
+ enhanced_prompt = tokenizer.decode(outputs[0], skip_special_tokens=True)
348
+ enhanced_prompt_2 = tokenizer.decode(outputs_2[0], skip_special_tokens=True)
349
+ print('-- generated prompt --')
350
+ enhanced_prompt = filter_text(enhanced_prompt,prompt)
351
+ enhanced_prompt_2 = filter_text(enhanced_prompt_2,prompt)
352
+ print('-- filtered prompt --')
353
+ print(enhanced_prompt)
354
+ print('-- filtered prompt 2 --')
355
+ print(enhanced_prompt_2)
356
+
357
+
358
+
359
+
360
  print('-- generating image --')
361
  sd_image = ip_model.generate(
362
  pil_image_1=sd_image_a,