Improved user interface
- app.py +40 -13
- bounded_attention.py +39 -3
app.py
CHANGED
@@ -20,6 +20,7 @@ WHITE = 255
 COLORS = ["red", "blue", "green", "orange", "purple", "turquoise", "olive"]
 
 PROMPT1 = "a ginger kitten and a gray puppy in a yard"
+SUBJECT_SUB_PROMPTS1 = "ginger kitten;gray puppy"
 SUBJECT_TOKEN_INDICES1 = "2,3;6,7"
 FILTER_TOKEN_INDICES1 = "1,4,5,8,9"
 NUM_TOKENS1 = "10"
@@ -158,6 +159,7 @@ FOOTNOTE = """
 def inference(
     boxes,
     prompts,
+    subject_sub_prompts,
     subject_token_indices,
     filter_token_indices,
     num_tokens,
@@ -190,9 +192,10 @@ def inference(
     editor = BoundedAttention(
         boxes,
         prompts,
-        subject_token_indices,
         list(range(70, 82)),
         list(range(70, 82)),
+        subject_sub_prompts=subject_sub_prompts,
+        subject_token_indices=subject_token_indices,
         filter_token_indices=filter_token_indices,
         eos_token_index=eos_token_index,
         cross_loss_coef=cross_loss_scale,
@@ -214,6 +217,7 @@ def inference(
 @spaces.GPU(duration=340)
 def generate(
     prompt,
+    subject_sub_prompts,
     subject_token_indices,
     filter_token_indices,
     num_tokens,
@@ -231,27 +235,45 @@ def generate(
     seed,
     boxes,
 ):
-
+    num_subjects = 0
+    subject_sub_prompts = convert_sub_prompts(subject_sub_prompts)
     subject_token_indices = convert_token_indices(subject_token_indices, nested=True)
-    if len(boxes) != len(subject_token_indices):
+    if subject_sub_prompts is not None:
+        num_subjects = len(subject_sub_prompts)
+    if subject_token_indices is not None:
+        num_subjects = len(subject_token_indices)
+
+    if len(boxes) != num_subjects:
         raise gr.Error("""
             The number of boxes should be equal to the number of subjects.
             Number of boxes drawn: {}, number of subjects: {}.
-        """.format(len(boxes), len(subject_token_indices)))
+        """.format(len(boxes), num_subjects))
 
     filter_token_indices = convert_token_indices(filter_token_indices) if len(filter_token_indices.strip()) > 0 else None
     num_tokens = int(num_tokens) if len(num_tokens.strip()) > 0 else None
     prompts = [prompt.strip(".").strip(",").strip()] * batch_size
 
     images = inference(
-        boxes, prompts, subject_token_indices, filter_token_indices, num_tokens, init_step_size,
+        boxes, prompts, subject_sub_prompts, subject_token_indices, filter_token_indices, num_tokens, init_step_size,
         final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
         classifier_free_guidance_scale, num_iterations, loss_threshold, num_guidance_steps, seed)
 
     return images
 
 
+def convert_sub_prompts(sub_prompts):
+    sub_prompts = sub_prompts.strip()
+    if len(sub_prompts) == 0:
+        return None
+
+    return [sub_prompt.strip() for sub_prompt in sub_prompts.split(";")]
+
+
 def convert_token_indices(token_indices, nested=False):
+    token_indices = token_indices.strip()
+    if len(token_indices) == 0:
+        return None
+
     if nested:
         return [convert_token_indices(indices, nested=False) for indices in token_indices.split(";")]
 
@@ -331,8 +353,13 @@ def main():
             placeholder=PROMPT1,
         )
 
+        subject_sub_prompts = gr.Textbox(
+            label="Sub-prompts for each subject (separate with semicolons)",
+            placeholder=SUBJECT_SUB_PROMPTS1,
+        )
+
         subject_token_indices = gr.Textbox(
-            label="The token indices of each subject (separate indices for the same subject with commas, and for different subjects with semicolons)",
+            label="Optional: The token indices of each subject (separate indices for the same subject with commas, and for different subjects with semicolons)",
             placeholder=SUBJECT_TOKEN_INDICES1,
         )
 
@@ -393,7 +420,7 @@ def main():
        generate_image_button.click(
            fn=generate,
            inputs=[
-                prompt, subject_token_indices, filter_token_indices, num_tokens,
+                prompt, subject_sub_prompts, subject_token_indices, filter_token_indices, num_tokens,
                init_step_size, final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
                classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
                seed,
@@ -407,31 +434,31 @@ def main():
        gr.Examples(
            examples=[
                [
-                    PROMPT1, SUBJECT_TOKEN_INDICES1, FILTER_TOKEN_INDICES1, NUM_TOKENS1,
+                    PROMPT1, SUBJECT_SUB_PROMPTS1, SUBJECT_TOKEN_INDICES1, FILTER_TOKEN_INDICES1, NUM_TOKENS1,
                    15, 10, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    12,
                ],
                [
-                    PROMPT2, "7,8,17;11,12,17;15,16,17", "5,6,9,10,13,14,18,19", "21",
+                    PROMPT2, "cute unicorn;pink hedgehog;nerdy owl", "7,8,17;11,12,17;15,16,17", "5,6,9,10,13,14,18,19", "21",
                    25, 18, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    286,
                ],
                [
-                    PROMPT3, "7;10;13,14;17", "5,6,8,9,11,12,15,16", "17",
+                    PROMPT3, "astronaut;robot;green alien;spaceship", "7;10;13,14;17", "5,6,8,9,11,12,15,16", "17",
                    18, 12, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    216,
                ],
                [
-                    PROMPT4, "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17",
+                    PROMPT4, "semi trailer;concrete mixer;helicopter", "9,10;13,14;17", "1,4,5,7,8,11,12,15,16", "17",
                    25, 18, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    82,
                ],
                [
-                    PROMPT5, "2,3;6,7;10,11;14,15;18,19", "1,4,5,8,9,12,13,16,17,20,21", "22",
+                    PROMPT5, "golden retriever;german shepherd;boston terrier;english bulldog;border collie", "2,3;6,7;10,11;14,15;18,19", "1,4,5,8,9,12,13,16,17,20,21", "22",
                    18, 12, 15, 3, 1, 1,
                    7.5, 1, 5, 0.2, 8,
                    152,
@@ -439,7 +466,7 @@ def main():
            ],
            fn=build_example_layout,
            inputs=[
-                prompt, subject_token_indices, filter_token_indices, num_tokens,
+                prompt, subject_sub_prompts, subject_token_indices, filter_token_indices, num_tokens,
                init_step_size, final_step_size, first_refinement_step, num_clusters_per_subject, cross_loss_scale, self_loss_scale,
                classifier_free_guidance_scale, batch_size, num_iterations, loss_threshold, num_guidance_steps,
                seed,
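Taken together, the app.py changes make the subject token indices optional: users can instead type semicolon-separated sub-prompts, and the box-count check now validates against whichever input was given. Below is a minimal sanity-check sketch of the two parsing helpers on the first example's inputs; the non-nested branch of convert_token_indices is not visible in this diff, so its comma-splitting body here is an assumption:

def convert_sub_prompts(sub_prompts):
    sub_prompts = sub_prompts.strip()
    if len(sub_prompts) == 0:
        return None  # sub-prompts are optional; None falls back to explicit indices

    return [sub_prompt.strip() for sub_prompt in sub_prompts.split(";")]

def convert_token_indices(token_indices, nested=False):
    token_indices = token_indices.strip()
    if len(token_indices) == 0:
        return None  # likewise optional

    if nested:
        return [convert_token_indices(indices, nested=False) for indices in token_indices.split(";")]

    # Assumed continuation (not shown in the diff): parse a flat list like "2,3".
    return [int(index) for index in token_indices.split(",")]

print(convert_sub_prompts("ginger kitten;gray puppy"))  # ['ginger kitten', 'gray puppy']
print(convert_token_indices("2,3;6,7", nested=True))    # [[2, 3], [6, 7]]
print(convert_sub_prompts(""))                          # None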
bounded_attention.py
CHANGED
@@ -21,9 +21,10 @@ class BoundedAttention(injection_utils.AttentionBase):
         self,
         boxes,
         prompts,
-        subject_token_indices,
         cross_loss_layers,
         self_loss_layers,
+        subject_sub_prompts=None,
+        subject_token_indices=None,
         cross_mask_layers=None,
         self_mask_layers=None,
         eos_token_index=None,
@@ -56,6 +57,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         super().__init__()
         self.boxes = boxes
         self.prompts = prompts
+        self.subject_sub_prompts = subject_sub_prompts
         self.subject_token_indices = subject_token_indices
         self.cross_loss_layers = set(cross_loss_layers)
         self.self_loss_layers = set(self_loss_layers)
@@ -186,8 +188,9 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.optimized = False
         return latents
 
-    def _tokenize(self):
-        ids = self.model.tokenizer.encode(self.prompts[0])
+    def _tokenize(self, prompt=None):
+        prompt = self.prompts[0] if prompt is None else prompt
+        ids = self.model.tokenizer.encode(prompt)
         tokens = self.model.tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
         return [token[:-4] for token in tokens]  # remove ending </w>
 
@@ -195,6 +198,38 @@ class BoundedAttention(injection_utils.AttentionBase):
         tagged_tokens = nltk.pos_tag(self._tokenize())
         return [type(self).TAG_RULES.get(token, tag) for token, tag in tagged_tokens]
 
+    def _determine_subject_tokens(self):
+        if self.subject_token_indices is not None:
+            return
+
+        if self.subject_sub_prompts is None:
+            raise ValueError('Missing subject sub-prompts.')
+
+        tokens = self._tokenize()
+
+        matches = []
+        self.subject_token_indices = []
+        for sub_prompt in self.subject_sub_prompts:
+            token_indices = self._determine_specific_subject_tokens(tokens, sub_prompt, matches)
+            matches.append(token_indices[0])
+            self.subject_token_indices.append(token_indices)
+
+    def _determine_specific_subject_tokens(self, tokens, sub_prompt, previous_matches):
+        sub_tokens = self._tokenize(sub_prompt)
+        sub_len = len(sub_tokens)
+
+        matches = []
+        for i in range(len(tokens)):
+            if tokens[i] == sub_tokens[0] and tokens[i:i + sub_len] == sub_tokens:
+                matches.append(i + 1)
+
+        if len(matches) == 0:
+            raise ValueError(f'Couldn\'t locate sub-prompt: {sub_prompt}.')
+
+        new_matches = [i for i in matches if i not in previous_matches]
+        last_match = new_matches[0] if len(new_matches) > 0 else matches[-1]
+        return list(range(last_match, last_match + sub_len))
+
     def _determine_eos_token(self):
         tokens = self._tokenize()
         eos_token_index = len(tokens) + 1
@@ -224,6 +259,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.leading_token_indices = leading_token_indices
 
     def _determine_tokens(self):
+        self._determine_subject_tokens()
         self._determine_eos_token()
         self._determine_filter_tokens()
         self._determine_leading_tokens()
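The new subject-token resolution in bounded_attention.py locates each sub-prompt's token span inside the tokenized prompt, preferring occurrences not already claimed by an earlier subject. Here is a self-contained sketch of that matching logic, using whitespace tokenization in place of the model tokenizer (an assumption; the real _tokenize encodes with the pipeline's tokenizer and strips the trailing </w> markers):

def find_subject_tokens(tokens, sub_tokens, previous_matches):
    # Mirrors _determine_specific_subject_tokens: indices are 1-based because
    # position 0 in the full token sequence is the BOS token.
    sub_len = len(sub_tokens)
    matches = [i + 1 for i in range(len(tokens)) if tokens[i:i + sub_len] == sub_tokens]
    if len(matches) == 0:
        raise ValueError(f"Couldn't locate sub-prompt: {' '.join(sub_tokens)}.")

    # Prefer the first occurrence no earlier subject has claimed;
    # otherwise fall back to the last occurrence.
    new_matches = [i for i in matches if i not in previous_matches]
    start = new_matches[0] if len(new_matches) > 0 else matches[-1]
    return list(range(start, start + sub_len))

tokens = "a ginger kitten and a gray puppy in a yard".split()
matches = []
for sub_prompt in "ginger kitten;gray puppy".split(";"):
    indices = find_subject_tokens(tokens, sub_prompt.split(), matches)
    matches.append(indices[0])
    print(sub_prompt, "->", indices)  # [2, 3] then [6, 7], matching "2,3;6,7"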