Update app.py
Browse files
app.py
CHANGED
@@ -23,10 +23,8 @@ def load_v2t_samples(data_root):
|
|
23 |
def load_t2v_samples(data_root):
|
24 |
sample_text = ['cut the sausage', 'stir vegetables into salmon', 'rinse cutting board']
|
25 |
idx2sid = {0: 2119, 1: 1730, 2: 1276}
|
26 |
-
|
27 |
return sample_text, idx2sid
|
28 |
|
29 |
-
|
30 |
def format_pred(pred, gt):
|
31 |
tp = '[color=green]{}[/color]'
|
32 |
fp = '[color=red]{}[/color]'
|
@@ -57,10 +55,10 @@ def main():
|
|
57 |
|
58 |
def predict_t2v(idx):
|
59 |
sid = idx2sid_t2v[idx]
|
60 |
-
zeroshot_video, gt_video = lavila.predict_t2v(idx, sid)
|
61 |
egovpa_video, gt_video = egovpa.predict_t2v(idx, sid)
|
|
|
62 |
|
63 |
-
return
|
64 |
|
65 |
with gr.Blocks() as demo:
|
66 |
with gr.Tab("Video-to-text retrieval"):
|
@@ -97,12 +95,12 @@ def main():
|
|
97 |
text = gr.Text(label="text query")
|
98 |
with gr.Column():
|
99 |
idx = gr.Number(label="Idx", visible=False)
|
100 |
-
zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
|
101 |
#zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
|
102 |
-
ours = gr.Textbox(label="Ego-VPA prediction")
|
103 |
-
|
104 |
btn = gr.Button("Predict", variant="primary")
|
105 |
-
btn.click(predict_t2v, inputs=[idx], outputs=[
|
106 |
gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])
|
107 |
|
108 |
|
|
|
23 |
def load_t2v_samples(data_root):
    """Return the built-in text-to-video demo queries and their id mapping.

    Args:
        data_root: Kept so the signature stays unchanged for callers; it is
            not read here because the samples are hard-coded.

    Returns:
        A ``(sample_text, idx2sid)`` tuple. ``sample_text`` is the list of
        example text queries; ``idx2sid`` maps each query's position in that
        list to an integer id (presumably the dataset sample id -- confirm
        against the model's ``predict_t2v``).
    """
    sample_text = [
        'cut the sausage',
        'stir vegetables into salmon',
        'rinse cutting board',
    ]
    idx2sid = {0: 2119, 1: 1730, 2: 1276}
    return sample_text, idx2sid
|
27 |
|
|
|
28 |
def format_pred(pred, gt):
|
29 |
tp = '[color=green]{}[/color]'
|
30 |
fp = '[color=red]{}[/color]'
|
|
|
55 |
|
56 |
def predict_t2v(idx):
    """Return GIF paths for the videos Ego-VPA retrieves for a sample query.

    Args:
        idx: Index of the selected example; used as the key into
            ``idx2sid_t2v`` and forwarded to the model.

    Returns:
        A list of ``{data_root}/video/gif/<id>.gif`` path strings, one per
        item in the first value returned by ``egovpa.predict_t2v``.

    Raises:
        KeyError: If ``idx`` is not a key of ``idx2sid_t2v``.
    """
    sid = idx2sid_t2v[idx]
    # The second returned value (named gt_video in the original, presumably
    # the ground-truth video) is not displayed, so it is discarded.
    retrieved, _gt_video = egovpa.predict_t2v(idx, sid)
    # The original comprehension iterated over the undefined name
    # `ego_video`, raising NameError on every call; iterate over the
    # retrieved list instead.
    return [f'{data_root}/video/gif/{x}.gif' for x in retrieved]
|
62 |
|
63 |
with gr.Blocks() as demo:
|
64 |
with gr.Tab("Video-to-text retrieval"):
|
|
|
95 |
text = gr.Text(label="text query")
|
96 |
with gr.Column():
|
97 |
idx = gr.Number(label="Idx", visible=False)
|
98 |
+
#zeroshot = gr.Textbox(label="LaViLa (zero-shot) prediction")
|
99 |
#zeroshot = gr.Gallery(label="LaViLa (zero-shot) prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
|
100 |
+
#ours = gr.Textbox(label="Ego-VPA prediction")
|
101 |
+
ours = gr.Gallery(label="Ego-VPA prediction", columns=[3], rows=[1], object_fit="contain", height="auto")
|
102 |
btn = gr.Button("Predict", variant="primary")
|
103 |
+
btn.click(predict_t2v, inputs=[idx], outputs=[ours])
|
104 |
gr.Examples(examples=[[i, x] for i, x in enumerate(t2v_samples)], inputs=[idx, text])
|
105 |
|
106 |
|