import warnings
warnings.filterwarnings("ignore")
import gradio as gr
from src.video_model import describe_video  # Your video processing function
from src.text_processor import process_description  # Your text processing function
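
# The two `src` imports are project-specific. Expected contracts, inferred from
# how they are called below (assumptions, not guaranteed signatures):
#   describe_video(video, query)     -> str   # free-text video description
#   process_description(description) -> dict  # JSON-serializable annotations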

# --- Global variable to store the prediction ---
prediction = None
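
# NOTE: module-level state is shared across all users of a running app. A
# per-session alternative (a sketch, not the original design) would be a
# gr.State component created inside the Blocks and threaded through both
# handlers instead of `global prediction`.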

# --- Function to handle video processing ---
def process_video(video, sitting, hands, location, screen):
    global prediction  # Access the global prediction variable

    query = "Describe this video in detail and answer the questions."
    additional_info = []
    if sitting:
        additional_info.append("Is the subject in the video standing or sitting?")
    if hands:
        additional_info.append("Is the subject holding any object in their hands, if so the hands are not free else they are free?")
    if location:
        additional_info.append("Is the subject present indoors or outdoors?")
    if screen:
        additional_info.append("Is the subject interacting with a screen in the background by facing the screen?")

    final_query = query + " " + " ".join(additional_info)
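    # e.g. with every box checked, final_query reads:
    # "Describe this video in detail and answer the questions. Is the subject
    # in the video standing or sitting? ... by facing the screen?"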
    prediction = describe_video(video, final_query)
    # Reveal the "Process Text" button and return the description
    return gr.update(visible=True), prediction

# --- Function to trigger text processing ---
def process_and_display_text():
    global prediction
    if prediction is None:  # Guard: the button is hidden until a video is processed
        return {"error": "Process a video first."}
    json_response = process_description(prediction)
    return json_response


# Examples for the interface
examples = [
    ["videos/2016-01-01_0100_US_KNBC_Channel_4_News_1867.16-1871.38_now.mp4",],
    ["videos/2016-01-01_0200_US_KNBC_Channel_4_News_1329.12-1333.29_tonight.mp4",],
    ["videos/2016-01-01_0830_US_KNBC_Tonight_Show_with_Jimmy_Fallon_725.45-729.76_tonight.mp4",],
    ["videos/2016-01-01_0200_US_KOCE_The_PBS_Newshour_577.03-581.31_tonight.mp4"],
    ["videos/2016-01-01_1400_US_KTTV-FOX_Morning_News_at_6AM_1842.36-1846.68_this_year.mp4"],
    ["videos/2016-01-02_0735_US_KCBS_Late_Show_with_Stephen_Colbert_285.94-290.67_this_year.mp4"],
    ["videos/2016-01-13_2200_US_KTTV-FOX_The_Doctor_Oz_Show_1709.79-1714.17_this_month.mp4"],
    ["videos/2016-01-01_1400_US_KTTV-FOX_Morning_News_at_6AM_1842.36-1846.68_this_year.mp4"],
    ["videos/2016-01-01_1300_US_KNBC_Today_in_LA_at_5am_12.46-16.95_this_morning.mp4"],
    ["videos/2016-01-05_0200_US_KNBC_Channel_4_News_1561.29-1565.95_next_week.mp4"],
    ["videos/2016-01-28_0700_US_KNBC_Channel_4_News_at_11PM_629.56-633.99_in_the_future.mp4"]
]

# Title, description, and article for the interface
title = "GSoC Super Raid Annotator"
description = "Annotate Videos"
article = "<p style='text-align: center'><a href='https://github.com/OpenBMB/MiniCPM-V' target='_blank'>Model GitHub Repo</a> | <a href='https://huggingface.co/openbmb/MiniCPM-V-2_6' target='_blank'>Model Page</a></p>"

custom_theme = gr.themes.Soft(
    primary_hue="red", 
    secondary_hue="red"
)


with gr.Blocks(theme=custom_theme) as demo:  # Use 'with' for gr.Blocks
    video = gr.Video(label="Video")
    sitting = gr.Checkbox(label="Sitting/Standing")
    hands = gr.Checkbox(label="Hands Free/Not Free")
    location = gr.Checkbox(label="Indoors/Outdoors")
    screen = gr.Checkbox(label="Screen Interaction")

    # Output components
    video_description = gr.Textbox(label="Video Description")
    json_output = gr.JSON(label="JSON Output")
    process_button = gr.Button("Process Text", visible=False) 

    # --- Connect inputs and outputs within gr.Blocks ---
    interface = gr.Interface(
        fn=process_video,
        inputs=[video, sitting, hands, location, screen],
        outputs=[process_button, video_description],
        examples=examples,
        title=title,
        description=description,
        article=article,
        allow_flagging="never",
    )

    # --- Click event for the "Process Text" button ---
    process_button.click(fn=process_and_display_text, outputs=json_output)


# Launch the Blocks app once; it wraps the Interface and the button handler
demo.launch(debug=False)