from sklearn.metrics import roc_auc_score, root_mean_squared_error
from generator.generate_metrics import generate_metrics, retrieve_and_generate_response
import logging


def compute_rmse_auc_roc_metrics(gen_llm, val_llm, dataset, vector_store, num_question):
    """Compare generated metrics against ground-truth annotations.

    Runs retrieval and generation for up to `num_question` queries, then reports
    RMSE for the continuous relevance/utilization scores and AUC-ROC for the
    binary adherence score.
    """
    # Lists to accumulate ground truths and predictions for RMSE and AUC-ROC computation
    all_ground_truth_relevance = []
    all_predicted_relevance = []
    all_ground_truth_utilization = []
    all_predicted_utilization = []
    all_ground_truth_adherence = []
    all_predicted_adherence = []

    # For each question in the dataset, compute the metrics
    for i, document in enumerate(dataset):
        # Extract ground-truth metrics from the dataset
        ground_truth_relevance = document['relevance_score']
        ground_truth_utilization = document['utilization_score']
        ground_truth_adherence = 1 if document['adherence_score'] else 0

        query = document['question']
        logging.info(f'Query number: {i + 1}')

        # Generate a response for the query, then score it with the validation LLM
        response, source_docs = retrieve_and_generate_response(gen_llm, vector_store, query)
        attributes, metrics = generate_metrics(val_llm, response, source_docs, query, 25)

        # Extract predicted metrics (ensure these are continuous if possible)
        predicted_relevance = metrics.get('Context Relevance', 0) if metrics else 0
        predicted_utilization = metrics.get('Context Utilization', 0) if metrics else 0
        predicted_adherence = 1 if metrics and metrics.get('Adherence', False) else 0

        # === Accumulate continuous scores for RMSE and binary labels for AUC-ROC ===
        all_ground_truth_relevance.append(ground_truth_relevance)
        all_predicted_relevance.append(predicted_relevance)
        all_ground_truth_utilization.append(ground_truth_utilization)
        all_predicted_utilization.append(predicted_utilization)
        all_ground_truth_adherence.append(ground_truth_adherence)
        all_predicted_adherence.append(predicted_adherence)

        if i == num_question:
            break

    # === Compute RMSE & AUC-ROC for the entire dataset ===
    try:
        logging.info(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
        logging.info(f"All Predicted Relevance: {all_predicted_relevance}")
        relevance_rmse = root_mean_squared_error(all_ground_truth_relevance, all_predicted_relevance)
    except ValueError:
        relevance_rmse = None

    try:
        logging.info(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
        logging.info(f"All Predicted Utilization: {all_predicted_utilization}")
        utilization_rmse = root_mean_squared_error(all_ground_truth_utilization, all_predicted_utilization)
    except ValueError:
        utilization_rmse = None

    try:
        logging.info(f"All Ground Truth Adherence: {all_ground_truth_adherence}")
        logging.info(f"All Predicted Adherence: {all_predicted_adherence}")
        adherence_auc = roc_auc_score(all_ground_truth_adherence, all_predicted_adherence)
    except ValueError:
        adherence_auc = None

    logging.info(f"Relevance RMSE score: {relevance_rmse}")
    logging.info(f"Utilization RMSE score: {utilization_rmse}")
    logging.info(f"Overall Adherence AUC-ROC: {adherence_auc}")

    return relevance_rmse, utilization_rmse, adherence_auc
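

# --- Illustrative sketch (not part of the original module) ---
# Minimal, self-contained example of the aggregation step above: RMSE over
# continuous relevance scores and AUC-ROC over binary adherence labels.
# The sample lists below are made-up values for demonstration only; the real
# entry point that builds the LLMs, dataset, and vector store lives elsewhere
# in the project.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_ground_truth_relevance = [0.8, 0.6, 0.9, 0.4]
    sample_predicted_relevance = [0.75, 0.5, 0.85, 0.5]
    sample_ground_truth_adherence = [1, 0, 1, 1]
    sample_predicted_adherence = [1, 0, 0, 1]

    # RMSE penalizes large deviations between predicted and annotated scores.
    logging.info(
        f"Sample relevance RMSE: "
        f"{root_mean_squared_error(sample_ground_truth_relevance, sample_predicted_relevance)}"
    )
    # AUC-ROC measures how well the binary adherence predictions rank the
    # adherent examples above the non-adherent ones.
    logging.info(
        f"Sample adherence AUC-ROC: "
        f"{roc_auc_score(sample_ground_truth_adherence, sample_predicted_adherence)}"
    )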