"""Gradio demo that runs an mmE5 image/text similarity example and greets the user."""

from functools import lru_cache
from pathlib import Path

import gradio as gr
import requests
from sentence_transformers import SentenceTransformer

MODEL_NAME = "intfloat/mmE5-mllama-11b-instruct"
EXAMPLE_IMAGE_URL = (
    "https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true"
)
EXAMPLE_IMAGE_PATH = Path("cat_dog_example.jpg")
DOWNLOAD_TIMEOUT_SECONDS = 30


@lru_cache(maxsize=1)
def _load_model() -> SentenceTransformer:
    """Load the embedding model once and reuse it for later calls."""
    return SentenceTransformer(MODEL_NAME, trust_remote_code=True)


@lru_cache(maxsize=1)
def _fetch_example_image() -> bytes:
    """Download the example cat-and-dog image once and return its bytes."""
    response = requests.get(EXAMPLE_IMAGE_URL, timeout=DOWNLOAD_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of saving an error page as a JPEG.
    response.raise_for_status()
    return response.content


def greet(name: str) -> str:
    """Run the image/text similarity example, print the scores, and greet *name*.

    The similarity tensors are only printed to stdout; the returned value is
    the greeting string.

    Args:
        name: Text to embed in the greeting.

    Returns:
        The string ``"Hello <name>!!"``.

    Raises:
        requests.RequestException: If the example image cannot be downloaded
            (connection error, timeout, or non-success HTTP status).
    """
    model = _load_model()

    # The first encode call below reads the image from this path.
    dog_cat_image_bytes = _fetch_example_image()
    EXAMPLE_IMAGE_PATH.write_bytes(dog_cat_image_bytes)

    # Image + Text -> Text
    image_embeddings = model.encode([{
        "image": str(EXAMPLE_IMAGE_PATH),
        "text": "Represent the given image with the following question: What is in the image",
    }])
    text_embeddings = model.encode([
        {"text": "A cat and a dog"},
        {"text": "A cat and a tiger"},
    ])
    similarity = model.similarity(image_embeddings, text_embeddings)
    print(similarity)
    # Example output noted by the original author: tensor([[0.3967, 0.3090]])
    # i.e. the first text is most similar to the image.

    # Text -> Image
    # NOTE(review): the original passes the raw image bytes here rather than
    # the file path used above -- confirm the model accepts bytes as "image".
    image_embeddings = model.encode([
        {"image": dog_cat_image_bytes, "text": "Represent the given image."},
    ])
    text_embeddings = model.encode([
        {"text": "Find me an everyday image that matches the given caption: A cat and a dog."},
        {"text": "Find me an everyday image that matches the given caption: A cat and a tiger."},
    ])
    similarity = model.similarity(image_embeddings, text_embeddings)
    print(similarity)

    return f"Hello {name}!!"


demo = gr.Interface(fn=greet, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()