Spaces:
Runtime error
Runtime error
hexuan21
committed on
Commit
·
df7a5b3
1
Parent(s):
e7742fc
update for app_regression.py
Browse files
- app_regression.py +27 -15
app_regression.py
CHANGED
@@ -14,10 +14,11 @@ from models.conversation import conv_templates
|
|
14 |
from typing import List
|
15 |
|
16 |
|
17 |
-
processor = AutoProcessor.from_pretrained("
|
18 |
-
model = Idefics2ForSequenceClassification.from_pretrained("
|
19 |
model.eval()
|
20 |
-
|
|
|
21 |
conv_template = conv_templates["idefics_2"]
|
22 |
|
23 |
with open("./examples/all_subsets.json", 'r') as f:
|
@@ -46,22 +47,34 @@ then give scores from 7 different dimensions:
|
|
46 |
(4) motion smoothness, the smoothness of motion or movements
|
47 |
(5) text-to-video alignment, the alignment between the text prompt and the video content
|
48 |
(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
|
49 |
-
|
50 |
-
for each dimension, output a number from
|
51 |
-
|
|
|
52 |
Here is an output example:
|
53 |
-
visual quality: 3
|
54 |
-
object consistency:
|
55 |
-
dynamic degree: 4
|
56 |
-
motion smoothness: 1
|
57 |
-
text-to-video alignment:
|
58 |
-
factual consistency:
|
59 |
-
overall score: 1
|
60 |
|
61 |
For this video, the text prompt is "{text_prompt}",
|
62 |
all the frames of video are as follows:
|
63 |
|
64 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
@spaces.GPU(duration=60)
|
66 |
def score(prompt:str, images:List[Image.Image]):
|
67 |
if not prompt:
|
@@ -85,7 +98,7 @@ def score(prompt:str, images:List[Image.Image]):
|
|
85 |
|
86 |
logits = outputs.logits
|
87 |
num_aspects = logits.shape[-1]
|
88 |
-
aspects = [
|
89 |
|
90 |
aspect_scores = {}
|
91 |
for i, aspect in enumerate(aspects):
|
@@ -130,7 +143,6 @@ def eval_video(prompt, video:str):
|
|
130 |
|
131 |
eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
|
132 |
|
133 |
-
|
134 |
num_image_token = eval_prompt.count("<image>")
|
135 |
if num_image_token < len(frames):
|
136 |
eval_prompt += "<image> " * (len(frames) - num_image_token)
|
|
|
14 |
from typing import List
|
15 |
|
16 |
|
17 |
+
processor = AutoProcessor.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression")
|
18 |
+
model = Idefics2ForSequenceClassification.from_pretrained("Mantis-VL/mantis-8b-idefics2-video-eval-20k-mantis-2epoch_4096_regression", torch_dtype=torch.bfloat16)
|
19 |
model.eval()
|
20 |
+
|
21 |
+
MAX_NUM_FRAMES = 8
|
22 |
conv_template = conv_templates["idefics_2"]
|
23 |
|
24 |
with open("./examples/all_subsets.json", 'r') as f:
|
|
|
47 |
(4) motion smoothness, the smoothness of motion or movements
|
48 |
(5) text-to-video alignment, the alignment between the text prompt and the video content
|
49 |
(6) factual consistency, the consistency of the video content with the common-sense and factual knowledge
|
50 |
+
|
51 |
+
for each dimension, output a float number from 1.0 to 4.0,
|
52 |
+
the higher the number is, the better the video performs in that sub-score,
|
53 |
+
the lowest 1.0 means Bad, the highest 4.0 means Perfect/Real (the video is like a real video)
|
54 |
Here is an output example:
|
55 |
+
visual quality: 3.2
|
56 |
+
object consistency: 2.7
|
57 |
+
dynamic degree: 4.0
|
58 |
+
motion smoothness: 1.6
|
59 |
+
text-to-video alignment: 2.3
|
60 |
+
factual consistency: 1.8
|
|
|
61 |
|
62 |
For this video, the text prompt is "{text_prompt}",
|
63 |
all the frames of video are as follows:
|
64 |
|
65 |
"""
|
66 |
+
|
67 |
+
|
68 |
+
aspect_mapping={
|
69 |
+
1:"visual quality",
|
70 |
+
2:"object consistency",
|
71 |
+
3:"dynamic degree",
|
72 |
+
4:"motion smoothness",
|
73 |
+
5:'text-to-video alignment',
|
74 |
+
6:'factual consistency',
|
75 |
+
}
|
76 |
+
|
77 |
+
|
78 |
@spaces.GPU(duration=60)
|
79 |
def score(prompt:str, images:List[Image.Image]):
|
80 |
if not prompt:
|
|
|
98 |
|
99 |
logits = outputs.logits
|
100 |
num_aspects = logits.shape[-1]
|
101 |
+
aspects = [aspect_mapping[i+1] for i in range(num_aspects)]
|
102 |
|
103 |
aspect_scores = {}
|
104 |
for i, aspect in enumerate(aspects):
|
|
|
143 |
|
144 |
eval_prompt = VIDEO_EVAL_PROMPT.format(text_prompt=prompt)
|
145 |
|
|
|
146 |
num_image_token = eval_prompt.count("<image>")
|
147 |
if num_image_token < len(frames):
|
148 |
eval_prompt += "<image> " * (len(frames) - num_image_token)
|