# Gourisankar Padihary
# corrected rmse and auroc calculation
# cfb3435
# raw
# history blame
# 1.44 kB
import logging
from data.load_dataset import load_data
from generator.compute_rmse_auc_roc_metrics import compute_rmse_auc_roc_metrics
from retriever.chunk_documents import chunk_documents
from retriever.embed_documents import embed_documents
from generator.generate_metrics import generate_metrics
from generator.initialize_llm import initialize_llm
# Set up root logging once at import time: INFO and above, with each record
# rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def main(data_set_name: str = 'covidqa', row_num: int = 10) -> None:
    """Run the RAG pipeline end to end for one dataset.

    Loads the dataset, chunks and embeds its documents, initializes the
    generation LLM, generates metrics for one sample question, and finally
    calls ``compute_rmse_auc_roc_metrics`` on the dataset.

    Args:
        data_set_name: Name handed to ``load_data``. Defaults to
            ``'covidqa'``, the value that used to be hard-coded (which made
            the ``'cuad'`` chunk-size branch unreachable).
        row_num: Index of the dataset row whose ``'question'`` field is used
            as the sample question. Defaults to 10, the previous fixed value.
    """
    logging.info("Starting the RAG pipeline")

    # Load the dataset
    dataset = load_data(data_set_name)
    logging.info("Dataset loaded")

    # Chunk the dataset: 'cuad' uses a chunk size of 3000, every other
    # dataset uses the default of 1000.
    chunk_size = 3000 if data_set_name == 'cuad' else 1000
    documents = chunk_documents(dataset, chunk_size)
    logging.info("Documents chunked")

    # Embed the documents
    vector_store = embed_documents(documents)
    logging.info("Documents embedded")

    # Initialize the Generation LLM
    llm = initialize_llm()

    # Generate metrics for a single sample question taken from the dataset.
    sample_question = dataset[row_num]['question']
    generate_metrics(llm, vector_store, sample_question)

    # Compute RMSE and AUC-ROC over the dataset.
    # NOTE(review): the meaning of the trailing 10 is defined by
    # compute_rmse_auc_roc_metrics, which is not visible here -- confirm
    # before turning it into a parameter.
    compute_rmse_auc_roc_metrics(llm, dataset, vector_store, 10)

    logging.info("Finished!!!")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()