Spaces: Running on Zero
Update app.py

app.py CHANGED
@@ -158,15 +158,16 @@ def load_and_prepare_model():
 pipe = load_and_prepare_model()
 
 # text models
-checkpoint = "microsoft/Phi-3.5-mini-instruct"
+#checkpoint = "microsoft/Phi-3.5-mini-instruct"
+checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
 captioner = pipeline(model="ydshieh/vit-gpt2-coco-en",device='cuda', task="image-to-text")
 captioner_2 = pipeline(model="Salesforce/blip-image-captioning-base",device='cuda', task="image-to-text")
 captioner_3 = pipeline(model="ford442/blip-image-to-text-large-bf16",device='cuda', task="image-to-text")
 model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')
 processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
-
-
-
+txt_tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
+txt_tokenizer.tokenizer_legacy=False
+model = AutoModelForCausalLM.from_pretrained(checkpoint,attn_implementation="flash_attention_2").to('cuda')
 
 ip_model = IPAdapterXL(pipe, local_folder, ip_ckpt, device)
 text_encoder=CLIPTextModel.from_pretrained('ford442/RealVisXL_V5.0_BF16', subfolder='text_encoder',token=True).to(device=device, dtype=torch.bfloat16)
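This hunk swaps the stock Phi-3.5 checkpoint for a bf16 mirror and loads it with flash attention. A minimal sketch of how that tokenizer/model pair could be exercised for prompt rewriting, assuming a chat-template call and generation settings that are not shown in the diff; the explicit torch_dtype is also an assumption (the diff omits it, while flash_attention_2 normally expects float16/bfloat16), and the system prompt is taken from the expand_prompt hunk below:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "ford442/Phi-3.5-mini-instruct-bf16"
tok = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=False)
# torch_dtype=torch.bfloat16 is an assumption; the diff passes no dtype,
# but flash_attention_2 generally requires a half-precision dtype.
lm = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).to('cuda')

# Hypothetical rewrite request; the real prompting lives in expand_prompt().
messages = [
    {"role": "system", "content": "You are an AI assistant that rewrites image prompts to be more descriptive and detailed."},
    {"role": "user", "content": "a cat sitting on a chair"},
]
input_ids = tok.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to('cuda')
output_ids = lm.generate(input_ids, max_new_tokens=96)
print(tok.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))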
@@ -242,14 +243,14 @@ def captioning(img):
     output_prompt=[]
     # Initial caption generation without a prompt:
     inputsa = processor5(images=img, return_tensors="pt").to('cuda')
-    generated_ids = model5.generate(**inputsa, min_length=
+    generated_ids = model5.generate(**inputsa, min_length=32, max_length=64)
     generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
     output_prompt.append(generated_text)
     print(generated_text)
     # Loop through prompts array:
     for prompt in prompts_array:
         inputs = processor5(images=img, text=prompt, return_tensors="pt").to('cuda')
-        generated_ids = model5.generate(**inputs
+        generated_ids = model5.generate(**inputs) # Adjust max_length if needed
         generated_text = processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
         response_text = generated_text.replace(prompt, "").strip() #Or could try .split(prompt, 1)[-1].strip()
         output_prompt.append(response_text)
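The first change here completes a generate() call by bounding caption length explicitly; setting min_length above the model's default max_length without also raising max_length would presumably conflict, which is likely why both are set. A self-contained sketch of the same BLIP-2 call, assuming a placeholder test image (the URL is not from the Space):

import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor5 = Blip2Processor.from_pretrained("ford442/blip2-image-to-text-bf16")
model5 = Blip2ForConditionalGeneration.from_pretrained("ford442/blip2-image-to-text-bf16").to('cuda')

# Placeholder image; any RGB PIL image works here.
img = Image.open(requests.get("https://example.com/test.jpg", stream=True).raw).convert("RGB")

inputs = processor5(images=img, return_tensors="pt").to('cuda')
# min_length=32 pushes BLIP-2 past its usual one-line caption;
# max_length=64 keeps decoding cost bounded.
generated_ids = model5.generate(**inputs, min_length=32, max_length=64)
print(processor5.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())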
@@ -263,7 +264,7 @@ def captioning(img):
     #output_prompt.append(response_text)
     print(output_prompt)
     return output_prompt
-
+
 def expand_prompt(prompt):
     system_prompt_rewrite = (
         "You are an AI assistant that rewrites image prompts to be more descriptive and detailed."
@@ -316,7 +317,7 @@ def expand_prompt(prompt):
     print(enhanced_prompt_2)
     enh_prompt=[enhanced_prompt,enhanced_prompt_2]
     return enh_prompt
-
+
 @spaces.GPU(duration=40)
 def generate_30(
     prompt: str = "",
@@ -398,10 +399,9 @@ def generate_30(
     print(caption_2)
     print("-- generating further caption --")
 
-
-
-
-
+    expand_prompt(prompt)
+    expand_prompt(caption)
+    expand_prompt(caption_2)
 
     print('-- generating image --')
     sd_image = ip_model.generate(
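Note that the three expand_prompt() calls added here discard their return values; per the hunk above, expand_prompt returns the [enhanced_prompt, enhanced_prompt_2] pair, so as written these calls appear to contribute only their internal prints. If the expanded prompts are meant to feed ip_model.generate, they would need to be captured; the variable names below are assumptions, not the Space's code:

# Hypothetical capture of the results; the diff as written ignores them.
enh_prompt = expand_prompt(prompt)
enh_caption = expand_prompt(caption)
enh_caption_2 = expand_prompt(caption_2)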