File size: 1,612 Bytes
5491ced
a50a7f8
 
5491ced
 
a50a7f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5491ced
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import gradio as gr
from sentence_transformers import SentenceTransformer
import requests

# Model repository and sample picture used by the similarity demo below.
_MODEL_NAME = "intfloat/mmE5-mllama-11b-instruct"
_EXAMPLE_IMAGE_URL = "https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true"
_EXAMPLE_IMAGE_PATH = "cat_dog_example.jpg"
_DOWNLOAD_TIMEOUT_SECONDS = 30

# Filled in lazily on first use so that later requests reuse them instead of
# reloading the model / re-downloading the picture on every call.
_model = None
_example_image_bytes = None


def _get_model():
    """Return the shared SentenceTransformer, constructing it on first use."""
    global _model
    if _model is None:
        # NOTE(review): trust_remote_code=True executes code shipped in the
        # model repository; keep only while that repository is trusted.
        _model = SentenceTransformer(_MODEL_NAME, trust_remote_code=True)
    return _model


def _get_example_image_bytes():
    """Return the example image bytes, downloading them on first use.

    Raises:
        requests.RequestException: if the download times out or the server
            answers with an error status.
    """
    global _example_image_bytes
    if _example_image_bytes is None:
        response = requests.get(_EXAMPLE_IMAGE_URL, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
        # Fail loudly rather than storing an HTML error page as the image.
        response.raise_for_status()
        _example_image_bytes = response.content
    return _example_image_bytes


def greet(name):
    """Run the image/text similarity demo, then return a greeting for *name*.

    The two similarity matrices are only printed; they do not influence the
    returned string.

    Args:
        name: Text to embed in the greeting.

    Returns:
        The string ``"Hello <name>!!"``.

    Raises:
        requests.RequestException: if the example image cannot be downloaded.
    """
    model = _get_model()

    # The first query below refers to the image by file path, so keep a copy
    # of the downloaded bytes on disk.
    dog_cat_image_bytes = _get_example_image_bytes()
    with open(_EXAMPLE_IMAGE_PATH, "wb") as f:
        f.write(dog_cat_image_bytes)

    # Image + Text -> Text
    image_embeddings = model.encode([{
        "image": _EXAMPLE_IMAGE_PATH,
        "text": "Represent the given image with the following question: What is in the image",
    }])
    text_embeddings = model.encode([
        {"text": "A cat and a dog"},
        {"text": "A cat and a tiger"},
    ])

    similarity = model.similarity(image_embeddings, text_embeddings)
    print(similarity)
    # Expected (per the original example): tensor([[0.3967, 0.3090]])
    # i.e. the first text is the one most similar to the image.

    # Text -> Image (the image is passed as raw bytes this time)
    image_embeddings = model.encode([
        {"image": dog_cat_image_bytes, "text": "Represent the given image."},
    ])
    text_embeddings = model.encode([
        {"text": "Find me an everyday image that matches the given caption: A cat and a dog."},
        {"text": "Find me an everyday image that matches the given caption: A cat and a tiger."},
    ])

    similarity = model.similarity(image_embeddings, text_embeddings)
    print(similarity)
    return f"Hello {name}!!"

# Build the UI: one text input routed through `greet`, one text output.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)

# Launch the interface.
demo.launch()