Spaces:
Runtime error
Runtime error
hexuan21
committed on
Commit
·
df7a5b3
1
Parent(s):
e7742fc
update for app_regression.py
Browse files - app_regression.py +27 -15
app_regression.py
CHANGED
|
@@ -14,10 +14,11 @@ from models.conversation import conv_templates
|
|
| 14 |
from typing import List
|
| 15 |
|
| 16 |
|
| 17 |
-
processor = AutoProcessor.from_pretrained("
|
| 18 |
-
model = Idefics2ForSequenceClassification.from_pretrained("
|
| 19 |
model.eval()
|
| 20 |
-
|
|
|
|
| 21 |
conv_template = conv_templates["idefics_2"]
|
| 22 |
|
| 23 |
with open("./examples/all_subsets.json", 'r') as f:
|
|
@@ -46,22 +47,34 @@ then give scores from 7 different dimensions:
|
|
| 46 |
(4) motion smoothness, the smoothness of motion or movements
|
| 47 |
(5) text-to-video alignment, the alignment between the text prompt and the video content
|
| 48 |
(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
|
| 49 |
-
|
| 50 |
-
for each dimension, output a number from
|
| 51 |
-
|
|
|
|
| 52 |
Here is an output example:
|
| 53 |
-
visual quality: 3
|
| 54 |
-
object consistency:
|
| 55 |
-
dynamic degree: 4
|
| 56 |
-
motion smoothness: 1
|
| 57 |
-
text-to-video alignment:
|
| 58 |
-
factual consistency:
|
| 59 |
-
overall score: 1
|
| 60 |
|
| 61 |
For this video, the text prompt is "{text_prompt}",
|
| 62 |
all the frames of video are as follows:
|
| 63 |
|
| 64 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
@spaces.GPU(duration=60)
|
| 66 |
def score(prompt:str, images:List[Image.Image]):
|
| 67 |
if not prompt:
|
|
@@ -85,7 +98,7 @@ def score(prompt:str, images:List[Image.Image]):
|
|
| 85 |
|
| 86 |
logits = outputs.logits
|
| 87 |
num_aspects = logits.shape[-1]
|
| 88 |
-
aspects = [
|
| 89 |
|
| 90 |
aspect_scores = {}
|
| 91 |
for i, aspect in enumerate(aspects):
|
|
@@ -130,7 +143,6 @@ def eval_video(prompt, video:str):
|
|
| 130 |
|
| 131 |
eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
|
| 132 |
|
| 133 |
-
|
| 134 |
num_image_token = eval_prompt.count("<image>")
|
| 135 |
if num_image_token < len(frames):
|
| 136 |
eval_prompt += "<image> " * (len(frames) - num_image_token)
|
|
|
|
| 14 |
from typing import List
|
| 15 |
|
| 16 |
|
| 17 |
+
processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
|
| 18 |
+
model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16)
|
| 19 |
model.eval()
|
| 20 |
+
|
| 21 |
+
MAX_NUM_FRAMES = 8
|
| 22 |
conv_template = conv_templates["idefics_2"]
|
| 23 |
|
| 24 |
with open("./examples/all_subsets.json", 'r') as f:
|
|
|
|
| 47 |
(4) motion smoothness, the smoothness of motion or movements
|
| 48 |
(5) text-to-video alignment, the alignment between the text prompt and the video content
|
| 49 |
(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
|
| 50 |
+
|
| 51 |
+
for each dimension, output a float number from 1.0 to 4.0,
|
| 52 |
+
the higher the number is, the better the video performs in that sub-score,
|
| 53 |
+
the lowest 1.0 means Bad, the highest 4.0 means Perfect/Real (the video is like a real video)
|
| 54 |
Here is an output example:
|
| 55 |
+
visual quality: 3.2
|
| 56 |
+
object consistency: 2.7
|
| 57 |
+
dynamic degree: 4.0
|
| 58 |
+
motion smoothness: 1.6
|
| 59 |
+
text-to-video alignment: 2.3
|
| 60 |
+
factual consistency: 1.8
|
|
|
|
| 61 |
|
| 62 |
For this video, the text prompt is "{text_prompt}",
|
| 63 |
all the frames of video are as follows:
|
| 64 |
|
| 65 |
"""
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
aspect_mapping={
|
| 69 |
+
1:"visual quality",
|
| 70 |
+
2:"object consistency",
|
| 71 |
+
3:"dynamic degree",
|
| 72 |
+
4:"motion smoothness",
|
| 73 |
+
5:'text-to-video alignment',
|
| 74 |
+
6:'factual consistency',
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
@spaces.GPU(duration=60)
|
| 79 |
def score(prompt:str, images:List[Image.Image]):
|
| 80 |
if not prompt:
|
|
|
|
| 98 |
|
| 99 |
logits = outputs.logits
|
| 100 |
num_aspects = logits.shape[-1]
|
| 101 |
+
aspects = [aspect_mapping[i+1] for i in range(num_aspects)]
|
| 102 |
|
| 103 |
aspect_scores = {}
|
| 104 |
for i, aspect in enumerate(aspects):
|
|
|
|
| 143 |
|
| 144 |
eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
|
| 145 |
|
|
|
|
| 146 |
num_image_token = eval_prompt.count("<image>")
|
| 147 |
if num_image_token < len(frames):
|
| 148 |
eval_prompt += "<image> " * (len(frames) - num_image_token)
|