Himanshu-AT committed
Commit 4352a3f · Parent(s): c6696f9

Add gemini.py for generative AI styling prompt generation

Files changed (3)
  1. app.py +126 -52
  2. gemini.py +67 -0
  3. segment-anything +1 -0
app.py CHANGED
@@ -1,25 +1,22 @@
 import gradio as gr
 import numpy as np
-
 import spaces
 import torch
-import spaces
 import random
-
-from diffusers import FluxFillPipeline
 from PIL import Image
+import cv2
 
+# ------------------ Inpainting Pipeline Setup ------------------ #
+from diffusers import FluxFillPipeline
 
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
 
-pipe = FluxFillPipeline.from_pretrained("black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16).to("cuda")
+pipe = FluxFillPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16
+).to("cuda")
 pipe.load_lora_weights("alvdansen/flux-koda")
-# pipe.enable_sequential_cpu_offload()
-# pipe.enable_fp16()
 pipe.enable_lora()
-# pipe.vae.enable_slicing()
-# pipe.vae.enable_tiling()
 
 def calculate_optimal_dimensions(image: Image.Image):
     # Extract the original dimensions
@@ -52,22 +49,99 @@ def calculate_optimal_dimensions(image: Image.Image):
     elif calculated_aspect_ratio < MIN_ASPECT_RATIO:
         height = (width / MIN_ASPECT_RATIO // 8) * 8
 
-    # Ensure width and height remain above the minimum dimensions
+    # Ensure minimum dimensions are met
    width = max(width, 576) if width == FIXED_DIMENSION else width
     height = max(height, 576) if height == FIXED_DIMENSION else height
 
     return width, height
 
-@spaces.GPU(durations=300)
-def infer(edit_images, prompt, seed=42, randomize_seed=False, width=1024, height=1024, guidance_scale=3.5, num_inference_steps=28, progress=gr.Progress(track_tqdm=True)):
-    # pipe.enable_xformers_memory_efficient_attention()
-
+# ------------------ SAM (Transformers) Imports and Initialization ------------------ #
+from transformers import SamModel, SamProcessor
+
+# Load the model and processor from Hugging Face.
+sam_model = SamModel.from_pretrained("facebook/sam-vit-base")
+sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+# (The model runs on CPU by default; if you have a CUDA device, send it to "cuda".)
+sam_model.to("cuda" if torch.cuda.is_available() else "cpu")
+
+def generate_mask_with_sam(image: Image.Image, mask_prompt: str):
+    """
+    Generate a segmentation mask using SAM (via Hugging Face Transformers).
+
+    The mask_prompt is expected to be a comma-separated string of two integers,
+    e.g. "450,600", representing an (x, y) coordinate in the image.
+
+    The function converts the coordinate into the proper input format for SAM and returns a binary mask.
+    """
+    if mask_prompt.strip() == "":
+        raise ValueError("No mask prompt provided.")
+
+    try:
+        # Parse the mask_prompt into a coordinate.
+        coords = [int(x.strip()) for x in mask_prompt.split(",")]
+        if len(coords) != 2:
+            raise ValueError("Expected two comma-separated integers (x,y).")
+    except Exception as e:
+        raise ValueError("Invalid mask prompt. Please provide coordinates as 'x,y'. Error: " + str(e))
+
+    # The SAM processor expects a list of input points per image.
+    # Format the point as a list of lists; here we assume one point per image.
+    # (The Transformers SAM expects the points in [x, y] order.)
+    input_points = [coords]  # e.g. [[450, 600]]
+    # Optionally, you can supply input_labels (1 for foreground, 0 for background).
+    input_labels = [1]
+
+    # Prepare the inputs for the SAM processor.
+    inputs = sam_processor(images=image,
+                           input_points=[input_points],
+                           input_labels=[input_labels],
+                           return_tensors="pt")
+
+    # Move tensors to the same device as the model.
+    device = next(sam_model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Forward pass through SAM.
+    with torch.no_grad():
+        outputs = sam_model(**inputs)
+
+    # outputs.pred_masks has shape (batch_size, point_batch_size, num_masks, H, W);
+    # take the first predicted mask for the first point.
+    pred_masks = outputs.pred_masks
+    mask = pred_masks[0, 0, 0].detach().cpu().numpy()
+
+    # Convert the mask to binary (0 or 255) using a threshold.
+    mask_bin = (mask > 0.5).astype(np.uint8) * 255
+    mask_pil = Image.fromarray(mask_bin)
+    return mask_pil
+
+# ------------------ Inference Function ------------------ #
+@spaces.GPU(duration=300)
+def infer(edit_images, prompt, mask_prompt,
+          seed=42, randomize_seed=False, width=1024, height=1024,
+          guidance_scale=3.5, num_inference_steps=28, progress=gr.Progress(track_tqdm=True)):
+    # Get the base image from the "background" layer.
     image = edit_images["background"]
     width, height = calculate_optimal_dimensions(image)
-    mask = edit_images["layers"][0]
+
+    # If a mask prompt is provided, use the SAM-based mask generator.
+    if mask_prompt and mask_prompt.strip() != "":
+        try:
+            mask = generate_mask_with_sam(image, mask_prompt)
+        except Exception as e:
+            raise ValueError("Error generating mask from prompt: " + str(e))
+    else:
+        # Fall back to using a manually drawn mask (from the first layer).
+        try:
+            mask = edit_images["layers"][0]
+        except (TypeError, IndexError):
+            raise ValueError("No mask provided. Please either draw a mask or supply a mask prompt.")
+
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
-    image = pipe(
+
+    # Run the inpainting diffusion pipeline with the provided prompt and mask.
+    image_out = pipe(
         prompt=prompt,
         image=image,
         mask_image=mask,
@@ -76,23 +150,14 @@ def infer(...):
         guidance_scale=guidance_scale,
         num_inference_steps=num_inference_steps,
         generator=torch.Generator(device='cuda').manual_seed(seed),
-        # lora_scale=0.75 // not supported in this version
     ).images[0]
 
-    output_image_jpg = image.convert("RGB")
+    output_image_jpg = image_out.convert("RGB")
     output_image_jpg.save("output.jpg", "JPEG")
-
     return output_image_jpg, seed
-    # return image, seed
 
-examples = [
-    "photography of a young woman, accent lighting, (front view:1.4), "
-    # "a tiny astronaut hatching from an egg on the moon",
-    # "a cat holding a sign that says hello world",
-    # "an anime illustration of a wiener schnitzel",
-]
-
-css="""
+# ------------------ Gradio UI ------------------ #
+css = """
 #col-container {
     margin: 0 auto;
     max-width: 1000px;
@@ -100,34 +165,51 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-
     with gr.Column(elem_id="col-container"):
-        gr.Markdown(f"""# FLUX.1 [dev]
-        """)
+        gr.Markdown("# FLUX.1 [dev] with SAM (Transformers) Mask Generation")
         with gr.Row():
             with gr.Column():
+                # The image editor now allows you to optionally draw a mask.
                 edit_image = gr.ImageEditor(
-                    label='Upload and draw mask for inpainting',
+                    label='Upload Image (and optionally draw a mask)',
                     type='pil',
                     sources=["upload", "webcam"],
                     image_mode='RGB',
-                    layers=False,
+                    layers=False,  # We will generate a mask automatically if needed.
                     brush=gr.Brush(colors=["#FFFFFF"]),
-                    # height=600
                 )
                 prompt = gr.Text(
-                    label="Prompt",
+                    label="Inpainting Prompt",
                     show_label=False,
                     max_lines=2,
-                    placeholder="Enter your prompt",
+                    placeholder="Enter your inpainting prompt",
                     container=False,
                 )
+                mask_prompt = gr.Text(
+                    label="Mask Prompt (enter a coordinate as 'x,y')",
+                    show_label=True,
+                    placeholder="E.g. 450,600",
+                    container=True,
+                )
+                generate_mask_btn = gr.Button("Generate Mask")
+                mask_preview = gr.Image(label="Mask Preview", show_label=True)
                 run_button = gr.Button("Run")
-
             result = gr.Image(label="Result", show_label=False)
+
+            # Button to preview the generated mask.
+            def on_generate_mask(image, mask_prompt):
+                if image is None or mask_prompt.strip() == "":
+                    return None
+                mask = generate_mask_with_sam(image, mask_prompt)
+                return mask
+
+            generate_mask_btn.click(
+                fn=on_generate_mask,
+                inputs=[edit_image, mask_prompt],
+                outputs=[mask_preview]
+            )
 
         with gr.Accordion("Advanced Settings", open=False):
-
             seed = gr.Slider(
                 label="Seed",
                 minimum=0,
@@ -135,11 +217,8 @@ with gr.Blocks(css=css) as demo:
                 step=1,
                 value=0,
             )
-
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
            with gr.Row():
-
                width = gr.Slider(
                    label="Width",
                    minimum=256,
@@ -148,7 +227,6 @@ with gr.Blocks(css=css) as demo:
                    value=1024,
                    visible=False
                )
-
                height = gr.Slider(
                    label="Height",
                    minimum=256,
@@ -157,19 +235,16 @@ with gr.Blocks(css=css) as demo:
                    value=1024,
                    visible=False
                )
-
            with gr.Row():
-
                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=1,
                    maximum=30,
                    step=0.5,
-                    value=50,
+                    value=3.5,
                )
-
                num_inference_steps = gr.Slider(
-                    label="Number of inference steps",
+                    label="Number of Inference Steps",
                    minimum=1,
                    maximum=50,
                    step=1,
@@ -178,9 +253,9 @@ with gr.Blocks(css=css) as demo:
 
    gr.on(
        triggers=[run_button.click, prompt.submit],
-        fn = infer,
-        inputs = [edit_image, prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
-        outputs = [result, seed]
+        fn=infer,
+        inputs=[edit_image, prompt, mask_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
+        outputs=[result, seed]
    )
 
-demo.launch()
+demo.launch()
 
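For reference, a minimal sketch of how the commit's SAM-based mask path can be exercised outside the Gradio UI. It assumes generate_mask_with_sam is imported from app.py (importing app.py also loads the FLUX pipeline, so a GPU is needed); the file name photo.jpg and the sample coordinate are placeholders, not part of the commit:

    # Hypothetical standalone usage of the commit's mask generator.
    from PIL import Image
    from app import generate_mask_with_sam

    image = Image.open("photo.jpg").convert("RGB")   # any RGB input image
    mask = generate_mask_with_sam(image, "450,600")  # (x, y) point on the subject
    mask.save("mask.png")                            # binary mask; white = region to inpaint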
gemini.py ADDED
@@ -0,0 +1,65 @@
+import google.generativeai as genai
+import os
+
+# Read the API key from the environment rather than hardcoding it.
+api_key = os.getenv("GEMINI_API_KEY")
+if not api_key:
+    raise ValueError("API key not found")
+
+genai.configure(api_key=api_key)
+
+model = genai.GenerativeModel("gemini-1.5-flash")
+
+prompt1 = """A young Indian office lady, standing in front of a pot with exotic plants inside an office"""
+
+system_prompt = f"""\
+<SYSTEM_PROMPT>
+Act as a professional stylist and generate highly detailed styling prompt suggestions based on a given look or style preference.
+The suggestions should be highly detailed, similar to how a professional stylist would describe a look. Use EXAMPLE_INPUT and EXAMPLE_OUTPUT \
+as a reference to understand exactly how you have to describe faces.
+
+Input: A general prompt describing the look or style preference.
+
+Output: A well-structured, detailed styling guide based on face shape, eyes, lips, hair, skin tone, and body type.
+
+Step 1: Extract Core Elements
+- Face Shape: Oval, Round, Square, Heart, Diamond, etc.
+- Eye Shape & Color: Almond, Hooded, Monolid, Deep-set; Brown, Blue, Green, etc.
+- Lips: Thin, Full, Defined Cupid’s bow, etc.
+- Hair Type & Length: Straight, Wavy, Curly, Coily; Short, Medium, Long.
+- Skin Tone: Fair, Medium, Olive, Dark, etc.
+- Body Type: Petite, Tall, Athletic, Curvy, etc.
+- Occasion: Casual, Formal, Streetwear, Vintage, Business, Party, etc.
+
+Step 2: Generate a Detailed Styling Prompt
+Given the extracted details, the generator will create a tailored styling suggestion.
+</SYSTEM_PROMPT>
+
+<EXAMPLE_INPUT>
+A young Indian girl standing in front of a rock wall with visible large rocks
+</EXAMPLE_INPUT>
+
+✨ Generated Styling Prompt:
+<EXAMPLE_OUTPUT>
+A young Indian girl with warm brown skin and expressive almond-shaped eyes stands
+gracefully in front of a textured rock wall with large, visible stones. Her long,
+wavy black hair cascades over her shoulders, catching the soft sunlight. She
+wears a flowing, earth-toned bohemian dress that complements the rugged background,
+with delicate golden jewelry adding a subtle elegance. Her full lips curve into a
+serene smile as she gazes into the distance, embodying a harmonious blend of strength
+and grace.
+</EXAMPLE_OUTPUT>
+
+<RULES>
+- You should only return the prompt and nothing else.
+- You should not return the system prompt.
+- You should not return any other details from the prompt.
+</RULES>
+
+<NORMAL_PROMPT>
+{prompt1}
+</NORMAL_PROMPT>
+"""
+
+response = model.generate_content(system_prompt)
+print(response.text)
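As committed, gemini.py runs as a one-shot script against the fixed prompt1. A small sketch of how the same template could be reused for arbitrary inputs if appended to gemini.py; the helper name generate_styling_prompt is hypothetical, not part of the commit:

    # Hypothetical wrapper: reuse gemini.py's template for any user prompt.
    def generate_styling_prompt(user_prompt: str) -> str:
        # system_prompt is already an evaluated f-string, so swap the
        # baked-in prompt1 text for the caller's prompt.
        filled = system_prompt.replace(prompt1, user_prompt)
        return model.generate_content(filled).text

    print(generate_styling_prompt("A man in a navy suit outside a glass office tower"))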
segment-anything ADDED
@@ -0,0 +1 @@
+Subproject commit dca509fe793f601edb92606367a655c15ac00fdf
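Note that segment-anything is tracked as a git submodule pinned to the commit above, so a plain clone leaves the directory empty; after cloning, fetch it with:

    git submodule update --init segment-anything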