# Forrest Sheng Bao
# 2025-05-25
# [email protected]

from typing import List, Literal
from IPython.display import display, Markdown
from transformers import AutoModelForSequenceClassification

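# Load HHEM-2.1-Open once at import time; trust_remote_code=True is needed
# because the model ships its own model class and predict() method.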
hhem = AutoModelForSequenceClassification.from_pretrained('vectara/hallucination_evaluation_model', trust_remote_code=True)

def HHEM(
    LLM_Prompt: str = "The sky is blue.",
    LLM_Response: str = "The ocean is blue."
    ) -> Markdown:
    """# GUI demo for Vectara's HHEM-2.1-Open

    Vectara's Hughes Hallucination Evaluation Model (HHEM) evaluates how well an LLM's output (called the "response" or the "hypothesis") is grounded in or supported by the input given to it (called the "prompt" or the "premise"). HHEM has two versions: [HHEM-Open](https://huggingface.co/vectara/hallucination_evaluation_model) and [HHEM Commercial](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model).

    To use the demo, fill in the "LLM_Prompt" and "LLM_Response" fields and click the run button. A placeholder example is prefilled for you. Feel free to replace it with your own examples and evaluate them.

    Args:
        LLM_Prompt (str): a.k.a. the "premise".
        LLM_Response (str): a.k.a. the "hypothesis".

    """

    if len(LLM_Prompt) + len(LLM_Response) > 500:
        return Markdown("""Your input is too long for this demo. Please shorten the prompt and the response so that their combined length is under 500 characters.""")

    pairs = [(LLM_Prompt, LLM_Response)]

    score = hhem.predict(pairs)[0]

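    # HHEM scores lie in [0, 1]; higher means more consistent with the prompt,
    # and 0.5 is the conventional cut-off for calling a response consistent.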
    verdict = "consistent" if score > 0.5 else "hallucinated"

    output_string = f"""
Given the **prompt**: 

> {LLM_Prompt}

and 

the **response**: 
> {LLM_Response}

HHEM's **judgement** is: <u>{verdict}</u> **with the score**: <u>{score:0.3f}</u>. 

Wanna know why? Check out [Vectara's Hallucination Corrector (VHC)](https://hcm.demo.vectara.dev/).

Note that in the industry, there are generally two definitions of hallucinations:
* faithfulness: the LLM's response contains information not supported by the prompt given to it.
* factuality: the LLM's response is not true per world knowledge.

In HHEM, we take the faithfulness definition. 

See also: 
* [HHEM Leaderboard](https://huggingface.co/spaces/vectara/leaderboard)
* [Source code of this app](https://huggingface.co/spaces/vectara/hhem-2.1-open-demo/tree/main)

    """
    
    return Markdown(output_string)
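

# A minimal sketch of invoking HHEM() directly, outside the GUI, assuming this
# file is run as a script. The prompt/response pair below is illustrative only;
# note that importing this module already downloads the HHEM weights from Hugging Face.
if __name__ == "__main__":
    result = HHEM(
        LLM_Prompt="The capital of France is Paris.",
        LLM_Response="Paris is the capital of France.",
    )
    # HHEM() returns an IPython Markdown object; print its underlying markdown
    # text via .data (fall back to the object itself otherwise).
    print(getattr(result, "data", result))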