import os
import uuid

import gradio as gr
import numpy as np
import supervision as sv
import torch
from tqdm import tqdm
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# Load the OWLv2 processor and detection model once at startup.
processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to("cuda")

BOUNDING_BOX_ANNOTATOR = sv.BoundingBoxAnnotator()
MASK_ANNOTATOR = sv.MaskAnnotator()
LABEL_ANNOTATOR = sv.LabelAnnotator()


def calculate_end_frame_index(source_video_path):
    # Cap processing at the first two seconds of the video.
    video_info = sv.VideoInfo.from_video_path(source_video_path)
    return min(
        video_info.total_frames,
        video_info.fps * 2
    )


def annotate_image(
    input_image,
    detections,
    labels
) -> np.ndarray:
    output_image = MASK_ANNOTATOR.annotate(input_image, detections)
    output_image = BOUNDING_BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections, labels=labels)
    return output_image


def process_video(
    input_video,
    labels,
    progress=gr.Progress(track_tqdm=True)
):
    labels = [label.strip() for label in labels.split(",")]
    video_info = sv.VideoInfo.from_video_path(input_video)
    total = calculate_end_frame_index(input_video)
    frame_generator = sv.get_video_frames_generator(
        source_path=input_video,
        end=total
    )

    os.makedirs("./outputs", exist_ok=True)
    result_file_name = f"{uuid.uuid4()}.mp4"
    result_file_path = os.path.join("./outputs", result_file_name)

    with sv.VideoSink(result_file_path, video_info=video_info) as sink:
        for _ in tqdm(range(total), desc="Processing video..."):
            frame = next(frame_generator)
            # `query` returns a list with one dict per image, holding
            # "boxes", "scores" and "labels" tensors.
            results = query(frame, labels)
            detections = sv.Detections.from_transformers(results[0])
            final_labels = [labels[int(label_id)] for label_id in results[0]["labels"]]
            frame = annotate_image(
                input_image=frame,
                detections=detections,
                labels=final_labels,
            )
            sink.write_frame(frame)
    return result_file_path


def query(image, texts):
    inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
    # Rescale predicted boxes back to the original (height, width) of the frame.
    target_sizes = torch.Tensor([image.shape[:-1]])
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=0.3, target_sizes=target_sizes
    )
    return results


with gr.Blocks() as demo:
    gr.Markdown("## Zero-shot Object Tracking with OWLv2 🦉")
    gr.Markdown("This is a demo for zero-shot object tracking using the [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) model by Google.")
    gr.Markdown("Simply upload a video and enter the candidate labels, or try the example below. 👇")
    with gr.Tab(label="Video"):
        with gr.Row():
            input_video = gr.Video(
                label="Input Video"
            )
            output_video = gr.Video(
                label="Output Video"
            )
        with gr.Row():
            candidate_labels = gr.Textbox(
                label="Labels",
                placeholder="Labels separated by a comma",
            )
            submit = gr.Button()
        gr.Examples(
            fn=process_video,
            examples=[["./cats.mp4", "dog,cat"]],
            inputs=[
                input_video,
                candidate_labels,
            ],
            outputs=output_video
        )

    submit.click(
        fn=process_video,
        inputs=[input_video, candidate_labels],
        outputs=output_video
    )

demo.launch(debug=False, show_error=True)
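
# A minimal sketch of exercising the detection pipeline on a single frame
# outside the Gradio UI, assuming the bundled "./cats.mp4" example is present.
# Left commented out so it never runs in the deployed app; uncomment (and skip
# demo.launch()) when debugging the model in isolation.
#
# frame = next(sv.get_video_frames_generator(source_path="./cats.mp4"))
# results = query(frame, ["cat", "dog"])
# print(results[0]["boxes"], results[0]["scores"], results[0]["labels"])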