import os
import pickle
from typing import List

import pandas as pd
from tqdm import tqdm

from prompt_injection.evaluators.base import PromptEvaluator


def init_evaluator_result_object(output_path, evaluator_list) -> dict:
    """Create the column-oriented result table, resuming from a checkpoint.

    The table is a dict of parallel lists: ``'idx'`` and ``'Prompt'`` plus one
    list per evaluator, keyed by ``evaluator.get_name()``.

    Args:
        output_path: Path of the pickle checkpoint. If it exists, its content
            is loaded and used as the starting table.
        evaluator_list: Evaluators whose result columns must be present.

    Returns:
        The loaded (or fresh) table, with a column for every evaluator.
    """
    result = {'idx': [], 'Prompt': []}
    if os.path.exists(output_path):
        # SECURITY: unpickling can execute arbitrary code. Only point
        # output_path at checkpoint files this tool wrote itself.
        with open(output_path, 'rb') as f:
            result = pickle.load(f)
    for evaluator in evaluator_list:
        # Keep columns already present in the checkpoint; add empty ones for
        # evaluators that were not part of it.
        # NOTE(review): a column added here for a non-empty checkpoint starts
        # shorter than 'idx'; building the DataFrame from unequal-length
        # columns is expected to fail -- confirm whether backfilling is wanted.
        result.setdefault(evaluator.get_name(), [])
    return result


def evaluate_all(
    prompts, evaluator_list: List[PromptEvaluator], output_path
) -> pd.DataFrame:
    """Run every evaluator on every prompt, checkpointing after each prompt.

    Prompts whose positional index is already recorded in the checkpoint at
    ``output_path`` are skipped, so an interrupted run can be resumed.

    Args:
        prompts: Sized collection of prompts, accessed as ``prompts[i]`` for
            ``i`` in ``range(len(prompts))``.
        evaluator_list: Evaluators; each provides ``get_name()`` and
            ``eval_sample(prompt)``.
        output_path: Path of the pickle checkpoint that is read on start and
            rewritten after every newly evaluated prompt.

    Returns:
        A DataFrame with columns ``idx``, ``Prompt`` and one per evaluator.
    """
    result = init_evaluator_result_object(output_path, evaluator_list)
    # Set lookup keeps the "already done?" test O(1) instead of scanning the
    # ever-growing 'idx' list for every prompt.
    processed = set(result['idx'])
    for i in tqdm(range(len(prompts))):
        if i in processed:
            continue
        prompt = prompts[i]
        # Evaluate first and append afterwards, so an exception raised by an
        # evaluator cannot leave the table with columns of unequal length.
        scores = [
            (evaluator.get_name(), evaluator.eval_sample(prompt))
            for evaluator in evaluator_list
        ]
        result['idx'].append(i)
        result['Prompt'].append(prompt)
        for name, score in scores:
            result[name].append(score)
        # Persist progress so that a crash loses at most the current prompt.
        with open(output_path, 'wb') as f:
            pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
    return pd.DataFrame.from_dict(result)