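"""Gradio demo: zero-shot image classification with ImageBind.

The app embeds an input image and a "|"-separated list of candidate text
labels into ImageBind's joint embedding space, then ranks the labels by
softmax over the image-text similarity scores.
"""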
# `data` and the `models` package ship with the facebookresearch/ImageBind
# repository; this script expects that code to be on the import path.
import data
import torch
import gradio as gr
from models import imagebind_model
from models.imagebind_model import ModalityType


# Load the pretrained ImageBind (huge) checkpoint once at startup and put it
# in inference mode, on GPU when one is available.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)


def image_text_zeroshot(image, text_list):
    """Score an image against "|"-separated candidate text labels."""
    image_paths = [image]
    labels = [label.strip() for label in text_list.strip().split("|")]
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(labels, device),
        ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

    # Dot products between the image embedding and every text embedding,
    # normalized by softmax into a probability distribution over the labels.
    scores = torch.softmax(
        embeddings[ModalityType.VISION] @ embeddings[ModalityType.TEXT].T,
        dim=-1,
    ).squeeze(0).tolist()

    score_dict = {label: score for label, score in zip(labels, scores)}

    return score_dict
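
# Example call (paths as in the demo examples below; the returned dict maps
# each label to its softmax probability, shown here schematically):
#   image_text_zeroshot("assets/dog_image.jpg", "A dog|A car|A bird")
#   -> {"A dog": ..., "A car": ..., "A bird": ...}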


inputs = [
    gr.Image(type="filepath", label="Input image"),
    gr.Textbox(lines=1, label="Candidate texts"),
]

iface = gr.Interface(
    fn=image_text_zeroshot,
    inputs=inputs,
    outputs="label",
    examples=[
        ["assets/dog_image.jpg", "A dog|A car|A bird"],
        ["assets/car_image.jpg", "A dog|A car|A bird"],
        ["assets/bird_image.jpg", "A dog|A car|A bird"],
    ],
    description="""<p>This is a simple demo of ImageBind for zero-shot image classification. Please refer to the original <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> and <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repo</a> for more details.<br>
        To test your own cases, you can upload an image and provide the candidate texts separated by "|".<br>
        You can duplicate this space and run it privately: <a href='https://huggingface.co/spaces/OFA-Sys/chinese-clip-zero-shot-image-classification?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>""",
    title="ImageBind: Zero-shot Image Classification",
)

iface.launch()
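
# A minimal note on running this (assuming the file serves as the Space's
# app.py): executing it directly starts a local Gradio server, and on
# Hugging Face Spaces the same launch() call serves the hosted demo.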