import gradio as gr
import numpy as np
import torch
from torch import nn
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor
model_name = "openai/clip-vit-base-patch16"
clip_model = CLIPModel.from_pretrained(model_name)
clip_processor = CLIPProcessor.from_pretrained(model_name)
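# Note: for this checkpoint, CLIP's projected text-feature dimension is 512,
# which is the input size the generator below expects.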

def generate_image_from_text(text_input):
    # Tokenize the input text for the CLIP text encoder
    inputs = clip_processor(text=text_input, return_tensors="pt", padding=True)
    # Encode the text into a (1, 512) feature vector; no gradients are needed here
    with torch.no_grad():
        text_features = clip_model.get_text_features(**inputs)

    # Define a simple GAN-style generator; it is freshly initialized (untrained)
    # on every call, so the output is structured noise rather than a real photo
    class SimpleGenerator(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(512, 256 * 256 * 3)  # map 512-dim features to a 256x256 RGB image
            self.relu = nn.ReLU()

        def forward(self, z):
            x = self.relu(self.fc(z))
            return x.view(-1, 3, 256, 256)  # reshape to (batch, channels, height, width)

    # Initialize the generator
    generator = SimpleGenerator()

    # Generate an image tensor from the (1, 512) text features
    generated_image_tensor = generator(text_features)

    # Convert the generated image tensor to a PIL image
    generated_image = generated_image_tensor.squeeze().permute(1, 2, 0).detach().numpy()
    generated_image = np.clip(generated_image, 0, 1)  # clamp pixel values to [0, 1]
    generated_image = (generated_image * 255).astype(np.uint8)
    generated_image = Image.fromarray(generated_image)
    return generated_image
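
# Optional sanity check outside the UI (hypothetical prompt string):
# generate_image_from_text("a photo of a cat").save("sample.png")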

# Gradio interface; live=True re-runs the function whenever the input changes
iface = gr.Interface(fn=generate_image_from_text, inputs="text", outputs="image", live=True)
iface.launch()
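# When running outside Spaces, a temporary public link can be requested with:
# iface.launch(share=True)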