VideoScore

Runtime error

App Files Files Community

hexuan21 committed on Jun 4, 2024

Commit

df7a5b3

1 Parent(s): e7742fc

update for app_regression.py

Browse files

Files changed (1) hide show

app_regression.py +27 -15

app_regression.py CHANGED Viewed

@@ -14,10 +14,11 @@ from models.conversation import conv_templates
 from typing import List
-processor = AutoProcessor.from_pretrained("/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final")
-model = Idefics2ForSequenceClassification.from_pretrained("/home/dongfu/WorkSpace/Mantis/checkpoints/idefics2-8b/mantis-8b-idefics2-video-eval-debug_4096_regression/checkpoint-final", torch_dtype=torch.bfloat16)
 model.eval()
-MAX_NUM_FRAMES = 24
 conv_template = conv_templates["idefics_2"]
 with open("./examples/all_subsets.json", 'r') as f:
@@ -46,22 +47,34 @@ then give scores from 7 different dimensions:
 (4) motion smoothness, the smoothness of motion or movements
 (5) text-to-video alignment, the alignment between the text prompt and the video content
 (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
-(7) overall score, the overall quality of the video
-for each dimension, output a number from [1,2,3,4],
-in which '1' is 'Bad', '2' is 'Average', '3' is 'Good', '4' is 'Perfect'
 Here is an output example:
-visual quality: 3
-object consistency: 4
-dynamic degree: 4
-motion smoothness: 1
-text-to-video alignment: 1
-factual consistency: 2
-overall score: 1
 For this video, the text prompt is "{text_prompt}",
 all the frames of video are as follows:
 """
 @spaces.GPU(duration=60)
 def score(prompt:str, images:List[Image.Image]):
     if not prompt:
@@ -85,7 +98,7 @@ def score(prompt:str, images:List[Image.Image]):
     logits = outputs.logits
     num_aspects = logits.shape[-1]
-    aspects = [f"aspect_{i}" for i in range(num_aspects)]
     aspect_scores = {}
     for i, aspect in enumerate(aspects):
@@ -130,7 +143,6 @@ def eval_video(prompt, video:str):
     eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
     num_image_token = eval_prompt.count("<image>")
     if num_image_token < len(frames):
         eval_prompt += "<image> " * (len(frames) - num_image_token)

 from typing import List
+processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
+model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16)
 model.eval()
+MAX_NUM_FRAMES = 8
 conv_template = conv_templates["idefics_2"]
 with open("./examples/all_subsets.json", 'r') as f:
 (4) motion smoothness, the smoothness of motion or movements
 (5) text-to-video alignment, the alignment between the text prompt and the video content
 (6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
+for each dimension, output a float number from 1.0 to 4.0,
+the higher the number is, the better the video performs in that sub-score,
+the lowest 1.0 means Bad, the highest 4.0 means Perfect/Real (the video is like a real video)
 Here is an output example:
+visual quality: 3.2
+object consistency: 2.7
+dynamic degree: 4.0
+motion smoothness: 1.6
+text-to-video alignment: 2.3
+factual consistency: 1.8
 For this video, the text prompt is "{text_prompt}",
 all the frames of video are as follows:
 """
+aspect_mapping={
+    1:"visual quality",
+    2:"object consistency",
+    3:"dynamic degree",
+    4:"motion smoothness",
+    5:'text-to-video alignment',
+    6:'factual consistency',
+}
 @spaces.GPU(duration=60)
 def score(prompt:str, images:List[Image.Image]):
     if not prompt:
     logits = outputs.logits
     num_aspects = logits.shape[-1]
+    aspects = [aspect_mapping[i+1] for i in range(num_aspects)]
     aspect_scores = {}
     for i, aspect in enumerate(aspects):
     eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
     num_image_token = eval_prompt.count("<image>")
     if num_image_token < len(frames):
         eval_prompt += "<image> " * (len(frames) - num_image_token)