Intel
/

tvp-base

@@ -33,7 +33,7 @@ import cv2
 import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
-from transformers import AutoProcessor, AutoModel
 def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
@@ -85,7 +85,7 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
     Returns:
         frames (tensor): decoded frames from the video.
     """
-    assert clip_idx >= -2, "Not valied clip_idx {}".format(clip_idx)
     frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
     clip_size = sampling_rate * num_frames / target_fps * fps
     index = torch.linspace(0, clip_size - 1, num_frames)
@@ -96,10 +96,33 @@ def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps
     return frames
-file = hf_hub_download(repo_id="Intel/tvp_demo", filename="0A8ZT.mp4", repo_type="dataset")
-model = AutoModel.from_pretrained("Intel/tvp-base")
 decoder_kwargs = dict(
     container=av.open(file, metadata_errors="ignore"),
@@ -112,11 +135,15 @@ decoder_kwargs = dict(
 raw_sampled_frms = decode(**decoder_kwargs)
 raw_sampled_frms = raw_sampled_frms.permute(0, 3, 1, 2)
 processor = AutoProcessor.from_pretrained("Intel/tvp-base")
 data = processor(
-    text=["person turn a light on."], videos=list(raw_sampled_frms.numpy()), return_tensors="pt", max_text_length=100
 )
 output = model(**data)
 print(f"The model's output is {output}")
@@ -125,7 +152,7 @@ def get_video_duration(filename):
     cap = cv2.VideoCapture(filename)
     if cap.isOpened():
         rate = cap.get(5)
-        frame_num =cap.get(7)
         duration = frame_num/rate
         return duration
     return -1
@@ -133,7 +160,7 @@ def get_video_duration(filename):
 duration = get_video_duration(file)
 timestamp = output['logits'].tolist()
 start, end = round(timestamp[0][0]*duration, 1), round(timestamp[0][1]*duration, 1)
-print(f"The time slot of the video corresponding to the text is from {start}s to {end}s")
 ```
 ### Limitations and bias

 import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
+from transformers import AutoProcessor, TvpForVideoGrounding
 def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
     Returns:
         frames (tensor): decoded frames from the video.
     """
+    assert clip_idx >= -2, "Not a valied clip_idx {}".format(clip_idx)
     frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
     clip_size = sampling_rate * num_frames / target_fps * fps
     index = torch.linspace(0, clip_size - 1, num_frames)
     return frames
+def get_resize_size(image, max_size):
+    """
+    Compute a resize target whose longer side equals `max_size`, scaling the shorter side by
+    the original aspect ratio.
+
+    Args:
+        image: object whose `.shape` ends with (height, width); only the last two dimensions
+            are read (originally documented as np.ndarray)
+        max_size: The max size of height and width
+    Returns:
+        dict with integer "height" and "width" entries; fractional values are truncated by `int()`
+
+    Note the height/width order difference:
+        >>> pil_img = Image.open("raw_img_tensor.jpg")
+        >>> pil_img.size
+        (640, 480)  # (width, height)
+        >>> np_img = np.array(pil_img)
+        >>> np_img.shape
+        (480, 640, 3)  # (height, width, 3)
+
+    NOTE(review): the channel-last shape in the example above would not match the `shape[-2:]`
+    read below, which takes height and width from the trailing dimensions - confirm the expected
+    layout against the caller.
+    """
+    # Height and width are taken from the last two dimensions of the shape.
+    height, width = image.shape[-2:]
+    if height >= width:
+        # Height is the longer (or equal) side: pin it to max_size and scale the width.
+        ratio = width * 1.0 / height
+        new_height = max_size
+        new_width = new_height * ratio
+    else:
+        # Width is the longer side: pin it to max_size and scale the height.
+        ratio = height * 1.0 / width
+        new_width = max_size
+        new_height = new_width * ratio
+    size = {"height": int(new_height), "width": int(new_width)}
+    return size
+file = hf_hub_download(repo_id="Intel/tvp_demo", filename="3MSZA.mp4", repo_type="dataset")
+model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base")
 decoder_kwargs = dict(
     container=av.open(file, metadata_errors="ignore"),
 raw_sampled_frms = decode(**decoder_kwargs)
 raw_sampled_frms = raw_sampled_frms.permute(0, 3, 1, 2)
+text = "person turn a light on."
 processor = AutoProcessor.from_pretrained("Intel/tvp-base")
+size = get_resize_size(raw_sampled_frms, model.config.max_img_size)
 data = processor(
+    text=[text], videos=list(raw_sampled_frms.numpy()), return_tensors="pt", max_text_length=100, size=size
 )
+data["pixel_values"] = data["pixel_values"].to(model.dtype)
+data["labels"] = torch.tensor([30.96, 24.3, 30.4])
 output = model(**data)
 print(f"The model's output is {output}")
     cap = cv2.VideoCapture(filename)
     if cap.isOpened():
         rate = cap.get(5)
+        frame_num = cap.get(7)
         duration = frame_num/rate
         return duration
     return -1
 duration = get_video_duration(file)
 timestamp = output['logits'].tolist()
 start, end = round(timestamp[0][0]*duration, 1), round(timestamp[0][1]*duration, 1)
+print(f"The time slot of the video corresponding to the text \"{text}\" is from {start}s to {end}s")
 ```
 ### Limitations and bias