import torch
import gradio as gr
import random
import os
import time
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler

# Load the InstructPix2Pix pipeline from the Diffusers library
model_id = "timbrooks/instruct-pix2pix"
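# fp16 weights halve the memory footprint; the safety checker is disabled to skip its extra inference pass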
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, revision="fp16", safety_checker=None)
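# The official InstructPix2Pix example pairs the pipeline with the Euler Ancestral
# scheduler; set it here so the import above is actually used
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)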
pipe.to("cuda")
pipe.enable_attention_slicing()


help_text = """ Note: I will try to add the functionality to revert your changes to previous/original image in future versions of space. For now only forward editing is available.

Some notes from the official [instruct-pix2pix](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) Space by the authors  
and from the official [Diffusers docs](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/pix2pix) -

If you're not getting what you want, there may be a few reasons:
1. Is the image not changing enough? Your guidance_scale may be too low. It should be >1. A higher guidance scale encourages the model to generate 
images that are closely linked to the text `prompt`, usually at the expense of lower image quality. This value dictates how strongly the output 
should follow your instruction; the pipeline requires a value of at least `1`. It's also possible your edit requires larger changes from the original image. 
                
2. Alternatively, you can adjust image_guidance_scale, which pushes the generated image towards the initial image. Image guidance 
takes effect when `image_guidance_scale > 1`. A higher image guidance scale encourages the model to generate images that are closely 
linked to the source `image`, usually at the expense of lower image quality.  

3. I have observed that rephrasing the instruction sometimes improves results (e.g., "turn him into a dog" vs. "make him a dog" vs. "as a dog").

4. Increasing the number of steps sometimes improves results.

5. Do faces look weird? The Stable Diffusion autoencoder has a hard time with faces that are small in the image. Try:
    * Cropping the image so the face takes up a larger portion of the frame.
"""

def chat(image_in, in_steps, in_guidance_scale, in_img_guidance_scale, image_hid, img_name, counter_out, prompt, history, progress=gr.Progress(track_tqdm=True)):
    progress(0, desc="Starting...")
    #if message == "revert": --to add revert functionality later
    print(f"counter:{counter_out}, prompt:{prompt}, img_name:{img_name}")
    if counter_out > 0:
      # Subsequent passes: edit the previously edited image (image_hid), not the original upload
      edited_image = pipe(prompt, image=image_hid, num_inference_steps=int(in_steps), guidance_scale=float(in_guidance_scale), image_guidance_scale=float(in_img_guidance_scale)).images[0]
      if os.path.exists(img_name):
        print("***Image exists and will be deleted***")
        os.remove(img_name)
      temp_img_name = img_name[:-4] + str(int(time.time())) + '.png'  # timestamp keeps each pass's file unique
      print(f"temp_img_name is :{temp_img_name}")
      # Save the edited image under the timestamped name
      edited_image.save(temp_img_name)
      saved_image_name = temp_img_name
      print(f"SAVED IMAGE NAME : {saved_image_name}")
      counter_out += 1
    else:
      print("FIRST PASS")
      seed = random.randint(0, 1000000)
      img_name = f"./edited_image_{seed}.png"
      edited_image = pipe(prompt, image=image_in, num_inference_steps=int(in_steps), guidance_scale=float(in_guidance_scale), image_guidance_scale=float(in_img_guidance_scale)).images[0]
      if os.path.exists(img_name):
        print("***First PASS:Image exists and will be deleted***")
        os.remove(img_name)
      edited_image.save(img_name)
      saved_image_name2 = img_name
      print(f"SAVED IMAGE NAME : {saved_image_name2}")
    history = history or []
    # Canned responses to accompany the edited image in the chat
    add_text_list = ["There you go", "Enjoy your image!", "Nice work! Wonder what you're gonna do next!", "Way to go!", "Does this work for you?", "Something like this?"]
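    # Gradio serves local files referenced as "/file=<path>" in HTML, so the saved
    # image can be embedded directly in the chat response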
    if counter_out > 0:
        print(f"You are on pass {counter_out}")
        response = random.choice(add_text_list) + '<img src="/file=' + saved_image_name + '">'
        history.append((prompt, response))
        return history, history, edited_image, temp_img_name, counter_out
    else:
        print(f"You are on pass {counter_out} (FIRST pass)")
        response = random.choice(add_text_list) + '<img src="/file=' + saved_image_name2 + '">'
        history.append((prompt, response))
        counter_out += 1
        return history, history, edited_image, img_name, counter_out
        

with gr.Blocks() as demo:
    gr.Markdown("""<h1><center> Chat Interface with InstructPix2Pix: Give Image Editing Instructions [Apologies for inconvenience, this Space is still very much a work in progress...] </h1></center>
    <p>For faster inference without waiting in the queue, you may duplicate the space and upgrade to GPU in settings.<br/>
    <a href="https://huggingface.co/spaces/ysharma/InstructPix2Pix_Chatbot?duplicate=true">
    <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
    <p/>""")
    with gr.Row():
      with gr.Column():
        image_in = gr.Image(type='pil', label="Original Image")
        text_in = gr.Textbox()
        state_in = gr.State()
        b1 = gr.Button('Edit the image!')
        with gr.Accordion("Advanced settings for inference", open=False):
          gr.Markdown("Advanced settings for the number of inference steps, the guidance scale, and the image guidance scale.")
          in_steps = gr.Number(label="Enter the number of inference steps", value=20)
          in_guidance_scale = gr.Slider(1,10, step=0.5, label="Set Guidance scale", value=7.5)
          in_img_guidance_scale = gr.Slider(1,10, step=0.5, label="Set Image Guidance scale", value=1.5)
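          # The components below round-trip state between chat() calls: the latest
          # edited image, its filename, and a pass counter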
          image_hid = gr.Image(type='pil')
          img_name_temp_out = gr.Textbox(visible=False)
          counter_out = gr.Number(visible=False, value=0, precision=0)
      chatbot = gr.Chatbot() 
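    # Wire the button: chat() takes the UI inputs plus the carried state, and returns
    # the updated chat history, the edited image, its filename, and the pass counter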
    b1.click(chat, [image_in, in_steps, in_guidance_scale, in_img_guidance_scale, image_hid, img_name_temp_out, counter_out, text_in, state_in], [chatbot, state_in, image_hid, img_name_temp_out, counter_out])
    gr.Markdown(help_text)
    
demo.queue(concurrency_count=10)
demo.launch(debug=True, width="80%", height=2000)