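"""Streamlit demo for IRRA text-to-image person retrieval.

Ranks uploaded person images against a free-text description using
cosine similarity between IRRA text and image embeddings.
"""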
import streamlit as st
from st_pages import Page, show_pages, add_page_title, Section 
from lib.utils.model import get_model, get_similarities 
from lib.utils.timer import timer 

add_page_title()

show_pages(
    [
        Page('app.py', 'IRRA Text-To-Image Retrieval'),
        Section('Implementation Details'),
        Page('pages/losses.py', 'Loss functions'),
    ]
)

st.markdown('''
            A text-to-image retrieval model based on [arXiv: Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval](https://arxiv.org/abs/2303.12501).
            Uploaded images should be `384x128` and contain a single person.
            ''')

st.header('Inputs')
caption = st.text_input('Description Input')

images = st.file_uploader('Upload images', accept_multiple_files=True)
# With accept_multiple_files=True, file_uploader yields a (possibly empty)
# list; only render once something has actually been uploaded.
if images:
    st.image(images)  # type: ignore

st.header('Options')
st.subheader('Ranks', help='How many of the top-ranked matches to display')

ranks = st.slider('slider_ranks', min_value=1, max_value=10, value=5, label_visibility='collapsed')

button = st.button('Match most similar', disabled=not images or not caption)

if button:
    st.header('Results')
    with st.spinner('Loading model'):
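        # get_model() (from lib.utils.model) is assumed to load the pretrained
        # IRRA checkpoint and cache it, so this only blocks on the first run.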
        model = get_model()

    st.text(f'IRRA model loaded with {sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters')
    
    with st.spinner('Computing and ranking similarities'):
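        # timer() (from lib.utils.timer) is assumed to be a context manager
        # whose __enter__ returns a callable reporting elapsed seconds.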
        with timer() as t:
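            # get_similarities is assumed to return a 1xN tensor of text-image
            # cosine similarities; squeeze(0) drops the singleton text dim.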
            similarities = get_similarities(caption, images, model).squeeze(0)
    elapsed = t()

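    # Rank image indices by descending similarity and keep the top `ranks`.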
    indices = similarities.argsort(descending=True).cpu().tolist()[:ranks]
    
    c1, c2, c3 = st.columns(3)
    with c1:
        st.subheader('Rank')
    with c2:
        st.subheader('Image')
    with c3:
        st.subheader('Cosine Similarity', help='The SDM loss aligns text and image embeddings, so a higher cosine similarity indicates a closer match')
    
    for i, idx in enumerate(indices):
        c1, c2, c3 = st.columns(3)
        with c1:
            st.text(f'{i + 1}')
        with c2:
            st.image(images[idx])
        with c3:
            st.text(f'{similarities[idx].cpu():.2f}')

    st.success(f'Done in {elapsed:.2f}s')

with st.sidebar:
    st.title('IRRA Text-To-Image Retrieval')

    st.subheader('Useful Links')
    st.markdown('[arXiv: Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval](https://arxiv.org/abs/2303.12501)')
    st.markdown('[IRRA implementation (Pytorch Lightning + Transformers)](https://github.com/grostaco/modern-IRRA)')
    st.markdown('[IRRA implementation (PyTorch)](https://github.com/anosorae/IRRA/tree/main)')