1inkusFace committed
Commit ea0b581 · verified · 1 Parent(s): 4d5c246

Update app.py

Files changed (1)
  1. app.py +12 -12
app.py CHANGED
@@ -158,15 +158,16 @@ def load_and_prepare_model():
 pipe = load_and_prepare_model()
 
 # text models
-checkpoint = "microsoft/Phi-3.5-mini-instruct"
+#checkpoint = "microsoft/Phi-3.5-mini-instruct"
+checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
 captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda', task="image-to-text")
 captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
 captioner_3 = pipeline(model="ford442/blip-image-to-text-large-bf16",device='cuda', task="image-to-text")
 model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')
 processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
-#txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
-#txt_tokenizer.tokenizer_legacy=False
-#model = AutoModelForCausalLM.from_pretrained(checkpoint).to('cuda')
+txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
+txt_tokenizer.tokenizer_legacy=False
+model = AutoModelForCausalLM.from_pretrained(checkpoint,attn_implementation="flash_attention_2").to('cuda')
 
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
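The new model load leans on FlashAttention-2, which transformers only runs in fp16/bf16, while from_pretrained defaults to fp32 unless a dtype is pinned. Below is a minimal sketch of the same load with the dtype made explicit; the torch_dtype argument is an addition here, not part of the commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
# FlashAttention-2 kernels only support fp16/bf16; pinning torch_dtype avoids
# the default fp32 upcast, which transformers would otherwise warn about.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,              # assumption: not passed in the commit
    attn_implementation="flash_attention_2",
).to('cuda')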
@@ -242,14 +243,14 @@ def captioning(img):
     output_prompt=[]
     # Initial caption generation without a prompt:
     inputsa = processor5(images=img, return_tensors="pt").to('cuda')
-    generated_ids = model5.generate(**inputsa, min_length=24, max_length=42)
+    generated_ids = model5.generate(**inputsa, min_length=32, max_length=64)
     generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
     output_prompt.append(generated_text)
     print(generated_text)
     # Loop through prompts array:
     for prompt in prompts_array:
         inputs = processor5(images=img, text=prompt, return_tensors="pt").to('cuda')
-        generated_ids = model5.generate(**inputs, max_length=42) # Adjust max_length if needed
+        generated_ids = model5.generate(**inputs) # Adjust max_length if needed
         generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
         response_text = generated_text.replace(prompt, "").strip() #Or could try .split(prompt, 1)[-1].strip()
         output_prompt.append(response_text)
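For reference, a self-contained sketch of the captioning flow after this change, using the model5/processor5 BLIP-2 pair loaded above; the image path and the single prompt are placeholders for the app's real inputs:

from PIL import Image

img = Image.open("example.jpg")  # hypothetical input image

# Unprompted caption: the widened bounds (min_length=32, max_length=64)
# force a longer base description than the old 24/42 window.
inputsa = processor5(images=img, return_tensors="pt").to('cuda')
ids = model5.generate(**inputsa, min_length=32, max_length=64)
base_caption = processor5.batch_decode(ids, skip_special_tokens=True)[0].strip()

# Prompted caption: with max_length dropped, generate() now falls back to the
# checkpoint's generation_config defaults for output length.
prompt = "a photo of"  # placeholder for one entry of prompts_array
inputs = processor5(images=img, text=prompt, return_tensors="pt").to('cuda')
ids = model5.generate(**inputs)
text = processor5.batch_decode(ids, skip_special_tokens=True)[0].strip()
# The language decoder echoes the conditioning text, so strip the prompt off.
response = text.replace(prompt, "").strip()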
@@ -263,7 +264,7 @@ def captioning(img):
     #output_prompt.append(response_text)
     print(output_prompt)
     return output_prompt
-'''
+
 def expand_prompt(prompt):
     system_prompt_rewrite = (
         "You are an AI assistant that rewrites image prompts to be more descriptive and detailed."
@@ -316,7 +317,7 @@ def expand_prompt(prompt):
     print(enhanced_prompt_2)
     enh_prompt=[enhanced_prompt,enhanced_prompt_2]
     return enh_prompt
-'''
+
 @spaces.GPU(duration=40)
 def generate_30(
     prompt: str = "",
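The two removed ''' markers had wrapped expand_prompt in a module-level string, effectively commenting it out; deleting both re-enables the function. Only its head and tail are visible in this diff, so the following is a minimal sketch of the chat-template rewrite it appears to perform with the Phi-3.5 model loaded above; everything beyond the visible system-prompt text and function name is an assumption:

def expand_prompt_sketch(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are an AI assistant that rewrites image prompts to be more descriptive and detailed."},
        {"role": "user", "content": prompt},
    ]
    input_ids = txt_tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to('cuda')
    output_ids = model.generate(input_ids, max_new_tokens=128)  # assumed token budget
    # Decode only the newly generated tokens, skipping the echoed prompt.
    return txt_tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)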
@@ -398,10 +399,9 @@ def generate_30(
     print(caption_2)
     print("-- generating further caption --")
 
-    #expand_prompt(prompt)
-    #expand_prompt(caption)
-    #expand_prompt(caption_2)
-
+    expand_prompt(prompt)
+    expand_prompt(caption)
+    expand_prompt(caption_2)
 
     print('-- generating image --')
     sd_image = ip_model.generate(
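Worth flagging: the three re-enabled calls are bare statements, while expand_prompt ends with return enh_prompt, so the rewritten prompts are computed (and printed inside the function) but discarded. If they are meant to influence generation, the call site would need to capture them, for example:

enhanced = expand_prompt(prompt)  # returns [enhanced_prompt, enhanced_prompt_2]
prompt = enhanced[0]              # assumption: feed the first rewrite downstream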