import gradio as gr
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer

from src.priors.prior_transformer import (
    PriorTransformer,
)  # original huggingface prior transformer without time conditioning
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline
from diffusers import DiffusionPipeline

__DEVICE__ = "cpu"
if torch.cuda.is_available():
    __DEVICE__ = "cuda"


class Ours:
    def __init__(self, device):
        # CLIP ViT-bigG/14 text encoder and tokenizer used by the ECLIPSE prior.
        text_encoder = (
            CLIPTextModelWithProjection.from_pretrained(
                "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
                projection_dim=1280,
                torch_dtype=torch.float32,
            )
            .eval()
            .requires_grad_(False)
        )
        tokenizer = CLIPTokenizer.from_pretrained(
            "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
        )

        # ECLIPSE prior: predicts CLIP image embeddings from the text prompt.
        prior = PriorTransformer.from_pretrained(
            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
            torch_dtype=torch.float32,
        )
        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-prior",
            prior=prior,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            torch_dtype=torch.float32,
        ).to(device)

        # Kandinsky v2.2 decoder turns the predicted image embeddings into pixels.
        self.pipe = DiffusionPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float32
        ).to(device)

    def inference(self, text, negative_text, steps, guidance_scale):
        image_emb, negative_image_emb = self.pipe_prior(
            text, negative_prompt=negative_text
        ).to_tuple()
        images = self.pipe(
            image_embeds=image_emb,
            negative_image_embeds=negative_image_emb,
            num_inference_steps=steps,
            guidance_scale=guidance_scale,
        ).images
        return images


selected_model = Ours(device=__DEVICE__)


def get_images(text, negative_text, steps, guidance_scale):
    images = selected_model.inference(text, negative_text, steps, guidance_scale)
    return images[0]


with gr.Blocks() as demo:
    gr.Markdown(
        """
This demo is currently hosted on CPU; GPU support will be added soon.
Please follow the instructions here to run it locally: GitHub Inference Code
    """
    )

    with gr.Group():
        with gr.Row():
            with gr.Column():
                text = gr.Textbox(
                    label="Enter your prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your prompt",
                    elem_id="prompt-text-input",
                )
        with gr.Row():
            with gr.Column():
                negative_text = gr.Textbox(
                    label="Enter your negative prompt",
                    show_label=False,
                    max_lines=1,
                    placeholder="Enter your negative prompt",
                    elem_id="negative-prompt-text-input",
                )
        with gr.Row():
            steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=1)
            guidance_scale = gr.Slider(
                label="Guidance Scale", minimum=0, maximum=10, value=7.5, step=0.1
            )
        with gr.Row():
            btn = gr.Button(value="Generate Image")

    gallery = gr.Image(
        height=512,
        width=512,
        label="Generated image",
        show_label=True,
        elem_id="gallery",
    )

    # One handler serves the button click and both textbox submits.
    inputs = [text, negative_text, steps, guidance_scale]
    btn.click(get_images, inputs=inputs, outputs=gallery)
    text.submit(get_images, inputs=inputs, outputs=gallery)
    negative_text.submit(get_images, inputs=inputs, outputs=gallery)

    with gr.Accordion(label="Ethics & Privacy", open=False):
        gr.HTML(
            """