ManishThota commited on
Commit
406951a
·
verified ·
1 Parent(s): f8a0355

Create both_app.py

Browse files
Files changed (1) hide show
  1. both_app.py +185 -0
both_app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gc
import os
import warnings
from typing import Dict, List

# Silence warnings before importing the heavy third-party stacks below.
warnings.filterwarnings("ignore")

import gradio as gr
import pandas as pd
import torch

from src.video_model import describe_video
from src.utils import parse_string, parse_annotations

# --- Function to process single video ---
def process_video_and_questions(video, standing, hands, location, screen):
    """Annotate one video: build a prompt from the ticked questions, run the
    model, and blank out annotations for questions that were not asked."""
    video_name = os.path.basename(video)

    # Only the questions whose checkbox is ticked go into the prompt.
    question_bank = [
        (standing, "Is the subject in the video standing or sitting?\n"),
        (hands, "Is the subject holding any object in their hands?\n"),
        (location, "Is the subject present indoors?\n"),
        (screen, "Is the subject interacting with a screen in the background by facing the screen?\n"),
    ]
    selected = [text for flag, text in question_bank if flag]

    header = "Answer the questions from the video\n"
    footer = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Follow the below examples\n:
    <annotation>indoors: 0</annotation>
    <annotation>standing: 1</annotation>
    <annotation>hands.free: 0</annotation>
    <annotation>screen.interaction_yes: 0</annotation>
    """

    prompt = header + " " + " ".join(selected) + " " + footer
    model_output = describe_video(video, prompt)
    tagged = f"<video_name>{video_name}</video_name>" + " \n" + model_output

    # For every question the user did NOT ask, rewrite a positive answer
    # to 'None' so the output reflects that no information was requested.
    masks = (
        (standing, 'standing: 1', 'standing: None'),
        (hands, 'hands.free: 1', 'hands.free: None'),
        (location, 'indoors: 1', 'indoors: None'),
        (screen, 'screen.interaction_yes: 1', 'screen.interaction_yes: None'),
    )
    for asked, positive, blanked in masks:
        if not asked:
            tagged = tagged.replace(positive, blanked)

    return tagged
# Function to save data to a CSV file using pandas
def save_to_csv(observations: List[Dict], output_dir: str = "outputs") -> str:
    """Persist per-video observation dicts as a single CSV file.

    Args:
        observations: One dict per video, e.g. {'video_name': ..., 'indoors': ...}.
        output_dir: Directory the CSV is written into; created if missing.

    Returns:
        Path of the written CSV file ("<output_dir>/video_observations.csv").
    """
    # exist_ok=True avoids the race between an existence check and makedirs.
    os.makedirs(output_dir, exist_ok=True)

    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(observations)

    csv_file = os.path.join(output_dir, "video_observations.csv")
    df.to_csv(csv_file, index=False)

    return csv_file
# Function to process a single video and return the observation data
def process_single_video(video_path, standing, hands, location, screen) -> Dict:
    """Run the model on one video and return its parsed observation record
    as {'video_name': ..., <annotation fields>...}."""
    video_name = os.path.basename(video_path)  # Extract video name from the path

    # Questions are appended only for the checkboxes that are ticked.
    question_bank = [
        (standing, "Is the subject in the video standing or sitting?\n"),
        (hands, "Is the subject holding any object in their hands?\n"),
        (location, "Is the subject present indoors?\n"),
        (screen, "Is the subject interacting with a screen in the background by facing the screen?\n"),
    ]
    selected = [text for flag, text in question_bank if flag]

    header = "Describe this video in detail and answer the questions"
    footer = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Follow the below examples:
    <annotation>indoors: 0</annotation>
    <annotation>standing: 1</annotation>
    <annotation>hands.free: 0</annotation>
    <annotation>screen.interaction_yes: 0</annotation>
    """

    prompt = header + " " + " ".join(selected) + " " + footer

    # Assuming your describe_video function handles the video processing
    model_output = describe_video(video_path, prompt)
    tagged = f"<video_name>{video_name}</video_name>" + " \n" + model_output

    # Rewrite positive answers to 'None' for questions that were not asked.
    masks = (
        (standing, 'standing: 1', 'standing: None'),
        (hands, 'hands.free: 1', 'hands.free: None'),
        (location, 'indoors: 1', 'indoors: None'),
        (screen, 'screen.interaction_yes: 1', 'screen.interaction_yes: None'),
    )
    for asked, positive, blanked in masks:
        if not asked:
            tagged = tagged.replace(positive, blanked)

    # Pull the <video_name> and <annotation> tags back out of the model text.
    parsed_content = parse_string(tagged, ["video_name", "annotation"])
    video_name = parsed_content['video_name'][0] if parsed_content['video_name'] else None
    annotations_dict = parse_annotations(parsed_content['annotation']) if parsed_content['annotation'] else {}

    # Return the observation as a dictionary
    return {'video_name': video_name, **annotations_dict}
# Function to process all videos in a folder
def process_multiple_videos(video_files: List[str], standing, hands, location, screen):
    """Process each video in turn, collect valid observations, and write one CSV.

    Args:
        video_files: Paths of the videos to process.
        standing/hands/location/screen: Checkbox flags forwarded to
            process_single_video.

    Returns:
        Tuple of (status message, CSV file path) for the Gradio outputs.
    """
    all_observations = []

    for video_path in video_files:
        observation = process_single_video(video_path, standing, hands, location, screen)
        if observation['video_name']:  # Only add valid observations
            all_observations.append(observation)
        else:
            print("Error processing video:", video_path)  # Log any errors

        # Free model memory between videos. empty_cache() is CUDA-specific,
        # so guard it to keep CPU-only installs working.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # Save all observations to a CSV file and return the file path
    csv_file = save_to_csv(all_observations)
    return "Processing completed. Download the CSV file.", csv_file
# Gradio interface
def gradio_interface_single(video, standing, hands, location, screen):
    """Gradio callback for the single-video tab; returns the annotated text."""
    flags = (standing, hands, location, screen)
    return process_video_and_questions(video, *flags)
def gradio_interface_multiple(video_files, standing, hands, location, screen):
    """Gradio callback for the batch tab; maps uploaded files to disk paths."""
    # gr.File uploads arrive as file wrappers; .name holds the path on disk.
    paths = [uploaded.name for uploaded in video_files]
    return process_multiple_videos(paths, standing, hands, location, screen)
# --- Gradio UI: one tab for single-video annotation, one for batch CSV export.
with gr.Blocks() as demo:
    with gr.Tab("Single Video Processing"):
        with gr.Row():
            with gr.Column():
                video = gr.Video(label="Video")
                standing = gr.Checkbox(label="Standing")
                hands = gr.Checkbox(label="Hands Free")
                location = gr.Checkbox(label="Indoors")
                screen = gr.Checkbox(label="Screen Interaction")
                submit_btn = gr.Button("Generate Annotations")
                generate_csv_btn = gr.Button("Generate CSV")

            with gr.Column():
                response = gr.Textbox(label="Video Description", show_label=True, show_copy_button=True)
                csv_output = gr.File(label="Download CSV", interactive=False)

        def _csv_from_response(response_text):
            """Convert the single-video response text into a one-row CSV.

            FIX: the original wired save_to_csv directly to the Textbox,
            passing a raw string where a list of dicts is expected, which
            fails at runtime. Parse the annotations first.
            """
            parsed = parse_string(response_text, ["video_name", "annotation"])
            name = parsed['video_name'][0] if parsed['video_name'] else None
            annotations = parse_annotations(parsed['annotation']) if parsed['annotation'] else {}
            return save_to_csv([{'video_name': name, **annotations}])

        submit_btn.click(
            fn=gradio_interface_single,
            inputs=[video, standing, hands, location, screen],
            outputs=response
        )

        generate_csv_btn.click(
            fn=_csv_from_response,
            inputs=response,
            outputs=csv_output
        )

    with gr.Tab("Batch Video Processing"):
        with gr.Row():
            video_files = gr.File(file_count="multiple", file_types=["video"], label="Upload multiple videos")
            standing = gr.Checkbox(label="Standing")
            hands = gr.Checkbox(label="Hands Free")
            location = gr.Checkbox(label="Indoors")
            screen = gr.Checkbox(label="Screen Interaction")
            submit_btn = gr.Button("Process Videos")
            download_link = gr.File(label="Download CSV")

        # NOTE(review): `response` belongs to the single-video tab, so the
        # batch status message renders there. Likely a status box was meant
        # for this tab — confirm intent before changing.
        submit_btn.click(
            fn=gradio_interface_multiple,
            inputs=[video_files, standing, hands, location, screen],
            outputs=[response, download_link]
        )

demo.launch(debug=False)