File size: 1,282 Bytes
d3cd5c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import editdistance
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from ..hf import detect_device

MODEL_ID = "vikhyatk/moondream2"
DEVICE, DTYPE = detect_device()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
moondream = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=DTYPE,
    device_map={"": DEVICE},
)
moondream.eval()


def get_anls(s1, s2):
    s1 = s1.lower().strip()
    s2 = s2.lower().strip()
    iou = 1 - editdistance.eval(s1, s2) / max(len(s1), len(s2))
    anls = iou if iou >= 0.5 else 0.0
    return anls


docvqa_val = load_dataset("vikhyatk/docvqa", split="validation")

scores = []
for row in tqdm(docvqa_val):
    image = row["image"]
    enc_image = moondream.encode_image(image)
    for qa in row["qa"]:
        question = qa["question"]
        answers = qa["answers"]
        prompt = f"{question}\nAnswer briefly with a single word or phrase."

        model_answer = moondream.answer_question(enc_image, prompt, tokenizer)
        anls = max(get_anls(model_answer, gt) for gt in answers)
        scores.append(anls)

print("ANLS:", sum(scores) / len(scores))