# ai-news-analyzer/external/FinGPT/fingpt/FinGPT_MultiAgentsRAG/Evaluation_methods/HaluEval/evaluate.py
import random
import openai
import time
import json
import argparse
import tiktoken

# START: COPIED FROM <https://github.com/RUCAIBox/HaluEval.git>
openai.api_key = 'sk-'
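
# NOTE: 'sk-' above is the placeholder key from the original script. A common pattern
# (an assumption, not part of the original code) is to load the key from an
# environment variable instead, e.g.:
#
#   import os
#   openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-")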


def get_qa_response(model, question, answer, instruction):
    """Ask the judge model whether `answer` contains hallucination for `question`."""
    message = [
        {"role": "system", "content": "You are a hallucination detector. You MUST determine if the provided answer contains hallucination or not for the question based on the world knowledge. The answer you provided MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                    "\n\n#Question#: " + question +
                                    "\n#Answer#: " + answer +
                                    "\n#Your Judgement#: "}
    ]
    prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)
    return response
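
# Example call (hypothetical inputs; requires a valid API key and network access):
#   judgement = get_qa_response("gpt-3.5-turbo",
#                               question="Who wrote Hamlet?",
#                               answer="Hamlet was written by Charles Dickens.",
#                               instruction=instruction_text)
#   # `judgement` is expected to be "Yes" (hallucinated) or "No" (faithful).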


def get_dialogue_response(model, dialog, response, instruction):
    """Ask the judge model whether `response` hallucinates given the dialogue history."""
    message = [
        {"role": "system", "content": "You are a response judge. You MUST determine if the provided response contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                    "\n\n#Dialogue History#: " + dialog +
                                    "\n#Response#: " + response +
                                    "\n#Your Judgement#: "}
    ]
    prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)
    return response


def num_tokens_from_message(message, model="davinci"):
    encoding = tiktoken.encoding_for_model(model)
    num_tokens = len(encoding.encode(message))
    return num_tokens


def truncate_message(prompt1, prompt2, model="davinci"):
    if num_tokens_from_message(prompt1 + prompt2, model) > 2033:
        truncation_length = 2033 - num_tokens_from_message(prompt2)
        while num_tokens_from_message(prompt1) > truncation_length:
            prompt1 = " ".join(prompt1.split()[:-1])
    prompt = prompt1 + prompt2
    return prompt
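
# The 2033-token budget above comes from the original script; presumably it leaves a
# small amount of headroom in davinci's roughly 2,049-token context window for the
# model's "Yes"/"No" judgement (this rationale is an assumption, not stated in the source).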


def get_summarization_response(model, document, summary, instruction):
    """Ask the judge model whether `summary` hallucinates with respect to `document`."""
    message = [
        {"role": "system", "content": "You are a summary judge. You MUST determine if the provided summary contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                    "\n\n#Document#: " + document +
                                    "\n#Summary#: " + summary +
                                    "\n#Your Judgement#: "}
    ]
    prompt1 = instruction + "\n\n#Document#: " + document
    prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
    if model == "davinci":
        prompt = truncate_message(prompt1, prompt2)
    else:
        prompt = prompt1 + prompt2
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)
    return response


def evaluation_qa_dataset(model, file, instruction, output_path):
    """Judge randomly chosen right/hallucinated QA answers and report accuracy."""
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

        correct = 0
        incorrect = 0
        for i in range(len(data)):
            knowledge = data[i]["knowledge"]
            question = data[i]["question"]
            hallucinated_answer = data[i]["hallucinated_answer"]
            right_answer = data[i]["right_answer"]

            if random.random() > 0.5:
                answer = hallucinated_answer
                ground_truth = "Yes"
            else:
                answer = right_answer
                ground_truth = "No"

            ans = get_qa_response(model, question, answer, instruction)
            ans = ans.replace(".", "")

            if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
                gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": "failed!"}
                dump_jsonl(gen, output_path, append=True)
                incorrect += 1
                print('sample {} fails......'.format(i))
                continue
            elif "Yes" in ans:
                if ans != "Yes":
                    ans = "Yes"
                gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
            elif "No" in ans:
                if ans != "No":
                    ans = "No"
                gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
            else:
                gen = None
                incorrect += 1
            assert (gen is not None)

            if ground_truth == ans:
                correct += 1
            else:
                incorrect += 1

            print('sample {} success......'.format(i))
            dump_jsonl(gen, output_path, append=True)

        print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))
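
# In each evaluation function, a hallucinated or correct candidate is sampled with
# probability 0.5; judgements containing both or neither of "Yes"/"No" are logged as
# "failed!" and counted as incorrect, so accuracy is computed over all sampled items.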


def evaluation_dialogue_dataset(model, file, instruction, output_path):
    """Judge randomly chosen right/hallucinated dialogue responses and report accuracy."""
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

        correct = 0
        incorrect = 0
        for i in range(len(data)):
            knowledge = data[i]["knowledge"]
            dialog = data[i]["dialogue_history"]
            hallucinated_response = data[i]["hallucinated_response"]
            right_response = data[i]["right_response"]

            if random.random() > 0.5:
                response = hallucinated_response
                ground_truth = "Yes"
            else:
                response = right_response
                ground_truth = "No"

            ans = get_dialogue_response(model, dialog, response, instruction)
            ans = ans.replace(".", "")

            if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
                gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": "failed!"}
                dump_jsonl(gen, output_path, append=True)
                incorrect += 1
                print('sample {} fails......'.format(i))
                continue
            elif "Yes" in ans:
                if ans != "Yes":
                    ans = "Yes"
                gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
            elif "No" in ans:
                if ans != "No":
                    ans = "No"
                gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
            else:
                gen = None
            assert (gen is not None)

            if ground_truth == ans:
                correct += 1
            else:
                incorrect += 1

            print('sample {} success......'.format(i))
            dump_jsonl(gen, output_path, append=True)

        print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


def evaluation_summarization_dataset(model, file, instruction, output_path):
    """Judge randomly chosen right/hallucinated summaries and report accuracy."""
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

        correct = 0
        incorrect = 0
        for i in range(len(data)):
            document = data[i]["document"]
            hallucinated_summary = data[i]["hallucinated_summary"]
            right_summary = data[i]["right_summary"]

            if random.random() > 0.5:
                summary = hallucinated_summary
                ground_truth = "Yes"
            else:
                summary = right_summary
                ground_truth = "No"

            ans = get_summarization_response(model, document, summary, instruction)
            ans = ans.replace(".", "")

            if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
                gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": "failed!"}
                dump_jsonl(gen, output_path, append=True)
                incorrect += 1
                print('sample {} fails......'.format(i))
                continue
            elif "Yes" in ans:
                if ans != "Yes":
                    ans = "Yes"
                gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
            elif "No" in ans:
                if ans != "No":
                    ans = "No"
                gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
            else:
                gen = None
            assert (gen is not None)

            if ground_truth == ans:
                correct += 1
            else:
                incorrect += 1

            print('sample {} success......'.format(i))
            dump_jsonl(gen, output_path, append=True)

        print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


def dump_jsonl(data, output_path, append=False):
    """
    Write a JSON object as one line of a JSON Lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        json_record = json.dumps(data, ensure_ascii=False)
        f.write(json_record + '\n')


# END: COPIED FROM <https://github.com/RUCAIBox/HaluEval.git>

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Hallucination Evaluation")
    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
    parser.add_argument("--model", default="davinci", help="model name")
    args = parser.parse_args()

    instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
    with open(instruction_file, 'r', encoding="utf-8") as f:
        instruction = f.read()

    model = args.model
    output_path = "{}/{}_{}_results.json".format(args.task, args.task, args.model)
    data = "../data/{}_data.json".format(args.task)

    if args.task == "qa":
        evaluation_qa_dataset(model, data, instruction, output_path)
    elif args.task == "dialogue":
        evaluation_dialogue_dataset(model, data, instruction, output_path)
    elif args.task == "summarization":
        evaluation_summarization_dataset(model, data, instruction, output_path)
    else:
        raise ValueError("The task must be qa, dialogue, or summarization!")
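
# Example invocation (assumes the per-task instruction file, e.g. qa/qa_evaluation_instruction.txt,
# and the dataset at ../data/qa_data.json are present, and that a valid API key is set above):
#   python evaluate.py --task qa --model gpt-3.5-turbo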