Spaces:
Runtime error
Runtime error
| from PIL import Image | |
| from transformers import CLIPProcessor, CLIPModel | |
| import gradio as gr | |
| import torchvision.transforms as transforms | |
# Load the pretrained CLIP weights and the matching pre-processor once, at
# import time, so every request handled by the app reuses the same objects.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
def image_similarity(image: Image.Image, positive_prompt: str, negative_prompts: str):
    """Check whether the positive prompt is CLIP's best match for an image.

    The positive prompt and every negative prompt are scored against the
    image, the scores are soft-maxed across prompts, and the positive prompt
    "wins" only if its probability is strictly higher than that of every
    negative prompt.

    Args:
        image: The picture to classify, as a PIL image.
        positive_prompt: Text describing the action to detect.
        negative_prompts: Semicolon-separated texts describing what the
            action should be contrasted against. Empty entries (for example
            from a trailing semicolon) are ignored.

    Returns:
        A ``(is_positive_highest, message)`` tuple: a bool telling whether the
        positive prompt scored highest, and a string reporting the positive
        prompt's probability to four decimals.

    Raises:
        gr.Error: If no image is supplied or no non-empty negative prompt
            remains after splitting.
    """
    # Function-scope import: the file has no top-level ``torch`` import, and
    # the PyTorch CLIP model used here already requires torch to be installed.
    import torch

    if image is None:
        raise gr.Error("Please provide an image.")

    # Split on ';' and drop blank entries so stray separators do not turn
    # into an empty-string "class" that competes with the real prompts.
    stripped = (part.strip() for part in (negative_prompts or "").split(";"))
    negatives = [part for part in stripped if part]
    if not negatives:
        raise gr.Error("Please provide at least one negative prompt.")

    prompts = [(positive_prompt or "").strip(), *negatives]

    # Hand the PIL image straight to the processor. It performs CLIP's own
    # resize / crop / rescale / normalisation; pre-normalising the image by
    # hand (as was done before) makes the processor normalise already
    # normalised data. Converting to RGB guards against RGBA / grayscale
    # uploads and is a no-op for images that are already RGB.
    inputs = processor(
        text=prompts,
        images=image.convert("RGB"),
        return_tensors="pt",
        padding=True,
        truncation=True,  # keep over-long prompts within the text encoder's limit
    )

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # One image in the batch -> take row 0; index 0 is the positive prompt.
    probs = outputs.logits_per_image.softmax(dim=1)[0]
    positive_prob = probs[0].item()
    is_positive_highest = positive_prob > probs[1:].max().item()

    return is_positive_highest, f"Probability for Positive Prompt: {positive_prob:.4f}"
# Build the Gradio UI: one image plus a positive prompt and a
# semicolon-separated list of negative prompts in, two text fields out
# (the boolean verdict and the positive prompt's probability).
interface = gr.Interface(
    fn=image_similarity,
    inputs=[
        gr.components.Image(type="pil"),
        gr.components.Text(label="Enter positive prompt e.g. 'a person drinking a beverage'"),
        gr.components.Textbox(label="Enter negative prompts, separated by semicolon e.g. 'an empty scene; person without beverage'", placeholder="negative prompt 1; negative prompt 2; ..."),
    ],
    outputs=[
        gr.components.Textbox(label="Result"),
        gr.components.Textbox(label="Probability for Positive Prompt"),
    ],
    title="Engagify's Image Action Detection",
    # The version named here must match the checkpoint this file loads
    # ("openai/clip-vit-base-patch32"); it previously said BASE-PATCH-16.
    description="[Author: Ibrahim Hasani] This Method uses CLIP-VIT [Version: BASE-PATCH-32] to determine if an action is being performed in an image or not. (Binary Classifier). It contrasts an Action against multiple negative labels. Ensure the prompts accurately describe the desired detection.",
    live=False,
    theme=gr.themes.Monochrome(),
)
interface.launch()