fixed ffmpeg issue
- .gitignore +2 -1
- app.py +2 -2
- owl_core.py +11 -4
.gitignore CHANGED
@@ -1,2 +1,3 @@
 *.pyc
-*/__pycache__/**
+*/__pycache__/**
+*.mp4
app.py CHANGED
@@ -29,7 +29,7 @@ def run_owl(input_vid,
                     text_prompt,
                     confidence_threshold,
                     fps_processed=fps_processed,
-                    scaling_factor=scaling_factor)
+                    scaling_factor=1/scaling_factor)
 
     global CSV_PATH
     CSV_PATH = csv_path
@@ -54,7 +54,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             input = gr.Video(label="Input Video", interactive=True)
-            text_prompt = gr.Textbox(label="What do you want to detect? (Multiple species should be separated by commas")
+            text_prompt = gr.Textbox(label="What do you want to detect? (Multiple species should be separated by commas)")
             with gr.Accordion("Advanced Options", open=False):
                 conf_threshold = gr.Slider(
                     label="Confidence Threshold",
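
Note on the first hunk: run_owl now passes the reciprocal of the UI value. This reads as the slider expressing a downscale divisor (e.g. 4 meaning "quarter-size frames") while the downstream code multiplies frame dimensions by scaling_factor, so the value must be inverted before the call. A minimal sketch of that assumed semantics (rescale_frame is a hypothetical helper, not part of this repo):

import PIL.Image

def rescale_frame(img: PIL.Image.Image, scaling_factor: float) -> PIL.Image.Image:
    # Multiply both dimensions by scaling_factor; values < 1 shrink the frame.
    w, h = img.size
    return img.resize((int(w * scaling_factor), int(h * scaling_factor)))

# UI slider value of 4 -> pass 1/4, producing 320x180 from a 1280x720 frame.
frame = PIL.Image.new("RGB", (1280, 720))
small = rescale_frame(frame, 1 / 4)
assert small.size == (320, 180)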
owl_core.py CHANGED
@@ -11,11 +11,12 @@ from PIL import Image
 from utils import plot_predictions, mp4_to_png, vid_stitcher
 from transformers import Owlv2Processor, Owlv2ForObjectDetection
 
+
 def preprocess_text(text_prompt: str, num_prompts: int = 1):
     """
     Takes a string of text prompts and returns a list of lists of text prompts for each image.
     i.e. text_prompt = "a, b, c" -> [["a", "b", "c"], ["a", "b", "c"]]
-    """
+    """
     text_prompt = [s.strip() for s in text_prompt.split(",")]
     text_queries = [text_prompt] * num_prompts
     # print("text_queries:", text_queries)
@@ -33,7 +34,7 @@ def owl_batch_prediction(
     with torch.no_grad():
         outputs = model(**inputs)
 
-
+    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
     target_sizes = torch.Tensor([img.size[::-1] for img in images]).to(device)
     # Convert outputs (bounding boxes and class logits) to COCO API, resizes to original image size and filter by threshold
     results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=threshold)
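
For context, the post-processing in owl_batch_prediction follows the standard transformers OWLv2 flow. A self-contained sketch of the same pattern (the checkpoint name is an assumption; any Owlv2 checkpoint behaves the same way):

import torch
from PIL import Image
from transformers import Owlv2Processor, Owlv2ForObjectDetection

device = "cuda" if torch.cuda.is_available() else "cpu"
ckpt = "google/owlv2-base-patch16-ensemble"  # assumed checkpoint
processor = Owlv2Processor.from_pretrained(ckpt)
model = Owlv2ForObjectDetection.from_pretrained(ckpt).to(device)

images = [Image.new("RGB", (1280, 720))]  # stand-in for a batch of video frames
text_queries = [["baboon"]]               # one list of prompts per image
inputs = processor(text=text_queries, images=images, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)

# Target sizes are (height, width) so boxes are rescaled to the original frames
target_sizes = torch.Tensor([img.size[::-1] for img in images]).to(device)
results = processor.post_process_object_detection(
    outputs=outputs, target_sizes=target_sizes, threshold=0.3)
print(results[0]["boxes"], results[0]["scores"], results[0]["labels"])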
|
59 |
# create new dirs and paths for results
|
60 |
filename = os.path.splitext(os.path.basename(vid_path))[0]
|
61 |
results_dir = f'../temp/{filename}_{datetime.now().strftime("%H%M%S")}'
|
62 |
+
# results_dir = f'temp/{filename}_{datetime.now().strftime("%H%M%S")}'
|
63 |
frames_dir = os.path.join(results_dir, "frames")
|
64 |
|
65 |
# if the frames directory does not exist, create it and get the frames from the video
|
|
|
88 |
for i in tqdm(range(0, len(frame_paths), batch_size), desc="Running batches"):
|
89 |
frame_nums = [i*fps_processed for i in range(batch_size)]
|
90 |
batch_paths = frame_paths[i:i+batch_size] # paths for this batch
|
91 |
+
images = [Image.open(image_path) for image_path in batch_paths]
|
92 |
|
93 |
# run owl on this batch of frames
|
94 |
text_queries = preprocess_text(text_prompt, len(batch_paths))
|
|
|
130 |
# stitch the frames into a video
|
131 |
save_path = vid_stitcher(frames_dir, output_path=os.path.join(results_dir, "output.mp4"))
|
132 |
|
133 |
+
return csv_path, save_path
|
134 |
+
|
135 |
+
|
136 |
+
# # DEBUGGING
|
137 |
+
# if __name__ == "__main__":
|
138 |
+
# owl_full_video('baboon_15s.mp4', 'baboon', 0.3, fps_processed=1, scaling_factor=4)
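
The docstring restored in the first hunk fully specifies preprocess_text. A quick sanity check of that behavior, assuming owl_core is importable and the function returns text_queries:

from owl_core import preprocess_text

# "a, b" with num_prompts=2 -> the same prompt list repeated once per image
assert preprocess_text("baboon, vervet monkey", num_prompts=2) == \
    [["baboon", "vervet monkey"], ["baboon", "vervet monkey"]]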
|