import numpy as np
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import CrossEncoder

from leaderboard.src.backend.util import generate_prompt

def load_evaluation_model(model_path):
    # Load the cross-encoder used for hallucination scoring and cache a local copy.
    model = CrossEncoder(model_path)
    model.save_pretrained(f'.checkpoints/{model_path}')
    return model

class SummaryGenerator:
    def __init__(self, model_id, revision):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        self.model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
        self.summaries_df = pd.DataFrame()
        self.revision = revision
        self.avg_length = None
        self.answer_rate = None

    def generate_summaries(self, df):
        source, summary, dataset = [], [], []

        for index, row in df.iterrows():
            _source = row['text']
            _dataset = row['dataset']

            prompt = generate_prompt(_source)
            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024, truncation=True)
            try:
                # Greedy decoding (do_sample=False) keeps the generated summaries deterministic.
                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False)
                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                print(f"Error at index {index}: {e}")
                response = ""
                
            summary.append(response)
            source.append(_source)
            dataset.append(_dataset)

        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)), columns=["source", "summary", "dataset"])
        self._compute_avg_length()
        self._compute_answer_rate()

        return self.summaries_df

    def _compute_avg_length(self):
        total_words = 0
        count = 0

        for summary in self.summaries_df['summary']:
            if summary != "":
                words = summary.split()
                total_words += len(words)
                count += 1

        self.avg_length = 0 if count == 0 else total_words / count

    def _compute_answer_rate(self):
        non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary != "")
        total_rows = len(self.summaries_df)

        self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows

class EvaluationModel:
    def __init__(self, model_path):
        self.model = load_evaluation_model(model_path)
        self.scores = []
        self.accuracy = None
        self.hallucination_rate = None

    def evaluate_hallucination(self, summaries_df):
        # CrossEncoder.predict expects a list of (source, summary) pairs,
        # so zip each source document with its generated summary.
        source_docs = np.array(summaries_df['source'])
        generated_summaries = np.array(summaries_df['summary'])
        sentence_pairs = list(zip(source_docs, generated_summaries))

        scores = self.model.predict(sentence_pairs)
        self.scores = scores
        return self.scores

    def compute_accuracy(self):
        # self.scores may be a list or a NumPy array, so test emptiness via len().
        if len(self.scores) == 0:
            raise ValueError("Scores not calculated. Call evaluate_hallucination() first.")

        # Count summaries scored at or above the 0.5 consistency threshold.
        num_above_threshold = sum(score >= 0.5 for score in self.scores)
        num_total = len(self.scores)

        self.accuracy = (num_above_threshold / num_total) * 100
        self.hallucination_rate = 100 - self.accuracy

        return self.accuracy
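
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The model ids, the CSV path, and
# its 'text'/'dataset' columns are assumptions, not part of this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Hypothetical inputs: any causal LM on the Hugging Face Hub and any
    # checkpoint accepted by CrossEncoder().
    df = pd.read_csv("leaderboard_dataset.csv")  # expects 'text' and 'dataset' columns

    generator = SummaryGenerator(model_id="gpt2", revision="main")
    summaries_df = generator.generate_summaries(df)
    print(f"avg length: {generator.avg_length:.1f}, answer rate: {generator.answer_rate:.2%}")

    evaluator = EvaluationModel(model_path="cross-encoder/nli-deberta-v3-base")
    evaluator.evaluate_hallucination(summaries_df)
    accuracy = evaluator.compute_accuracy()
    print(f"accuracy: {accuracy:.1f}%, hallucination rate: {evaluator.hallucination_rate:.1f}%")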