# Test the agentic RAG system against the neural-bridge/rag-dataset-12000 dataset.
import csv
import gc
import json
import os

import dotenv
import torch  # For clearing the CUDA cache if a GPU is available
from datasets import load_dataset
from langchain.memory import ConversationBufferMemory
from langchain_community.tools.tavily_search import TavilySearchResults
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

from development_scripts.preprocessing import (
    agentic_rag,
    build_faiss_index,
    create_embeddings,
    model_selection,
    retrieve_similar_chunks,
)

# Configuration parameters
SAMPLE_SIZE = 80  # Number of documents to test
BATCH_SIZE = 1    # Save results after every BATCH_SIZE iterations
OUTPUT_FILE = 'rag_test_output.json'

# Load environment variables (e.g. the Tavily API key) before creating the tool
dotenv.load_dotenv()
tools = [TavilySearchResults(max_results=5)]

ds = load_dataset("neural-bridge/rag-dataset-12000")


def chunk_text(text, max_length=250):
    """Split text into fixed-size character chunks, each tagged with a chunk_id."""
    chunks = []
    for i in range(0, len(text), max_length):
        chunk = text[i:i + max_length]
        chunks.append({"text": chunk, "metadata": {"chunk_id": i // max_length}})
    return chunks


def clear_memory():
    """Run the garbage collector and, if a GPU is available, clear the CUDA cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# Initialize or load output data so an interrupted run can resume
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r') as f:
        try:
            output_data = json.load(f)
            start_idx = len(output_data)  # Resume from where we left off
            print(f"Resuming from index {start_idx}")
        except json.JSONDecodeError:
            output_data = []  # Start fresh if the file is corrupted
            start_idx = 0
else:
    output_data = []  # Start fresh if the file doesn't exist
    start_idx = 0

# Load the LLM and the embedding model once, rather than once per document
llm = model_selection("meta-llama/llama-4-scout-17b-16e-instruct")
model = SentenceTransformer('BAAI/bge-large-en-v1.5')

# Process documents in range
end_idx = min(start_idx + SAMPLE_SIZE, len(ds['train']))
i = start_idx  # Defined up front so the except block can report the index
try:
    for i in range(start_idx, end_idx):
        print(f"Processing document {i}/{end_idx}")

        # Chunk the current document's context and build a per-document FAISS index
        current_context_text = ds['train'][i]['context']
        chunks = chunk_text(current_context_text, max_length=100)
        embeddings, chunks = create_embeddings(chunks, model)
        index = build_faiss_index(embeddings)

        # Retrieve the chunks most similar to the question
        query = ds['train'][i]['question']
        similar_chunks = retrieve_similar_chunks(query, index, chunks, model, k=5)
        agent_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        # Run the RAG system
        print(f"Query: {query}")
        response = agentic_rag(llm, tools, query=query, context_chunks=similar_chunks,
                               memory=agent_memory, Use_Tavily=False)
        print("Assistant:", response["output"])
        print("Ground Truth:", ds['train'][i]['answer'])
        print("===" * 50)

        # Store the results
        output_data.append({
            "query": query,
            "assistant_response": response["output"],
            "ground_truth": ds['train'][i]['answer'],
            "context": current_context_text
        })

        # Save results periodically so progress survives a crash
        if (i + 1) % BATCH_SIZE == 0 or i == end_idx - 1:
            with open(OUTPUT_FILE, 'w') as f:
                json.dump(output_data, f, indent=4)
            print(f"\nSaved results for {len(output_data)} documents to {OUTPUT_FILE}")

        # Release per-document objects before the next iteration
        del current_context_text, chunks, embeddings, index, similar_chunks, response
        clear_memory()
except Exception as e:
    print(f"Error occurred at document index {i}: {str(e)}")
    # Save whatever results we have so far
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(output_data, f, indent=4)
    print(f"\nSaved partial results for {len(output_data)} documents to {OUTPUT_FILE}")

print(f"\nCompleted processing {len(output_data)} documents. Results saved to {OUTPUT_FILE}")
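# ---------------------------------------------------------------------------
# Reference sketch (assumption): the helpers imported from
# development_scripts.preprocessing are not defined in this file. The
# hypothetical stand-ins below (note the _sketch suffix, so they do not shadow
# the real imports) only illustrate the interfaces this script relies on:
# create_embeddings returns (embeddings, chunks), build_faiss_index returns a
# searchable index, and retrieve_similar_chunks returns the k nearest chunks.
# The real helpers may differ, e.g. by normalizing embeddings and using an
# inner-product index, as is commonly recommended for BGE models.
# ---------------------------------------------------------------------------
import faiss
import numpy as np


def create_embeddings_sketch(chunks, model):
    # Encode each chunk's text; keep chunks aligned with their embedding rows
    texts = [c["text"] for c in chunks]
    embeddings = model.encode(texts, convert_to_numpy=True).astype(np.float32)
    return embeddings, chunks


def build_faiss_index_sketch(embeddings):
    # Flat (exact) L2 index; fine for the small per-document chunk counts here
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def retrieve_similar_chunks_sketch(query, index, chunks, model, k=5):
    # Embed the query and return the k closest chunks by L2 distance
    query_vec = model.encode([query], convert_to_numpy=True).astype(np.float32)
    _, idxs = index.search(query_vec, k)
    return [chunks[j] for j in idxs[0] if j != -1]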
# --- Scoring: compare assistant responses against the ground-truth answers ---

# Load the embedding model and the ROUGE scorer
model = SentenceTransformer('BAAI/bge-large-en-v1.5')
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# File paths and the pass/fail threshold
input_file = 'rag_test_output.json'
output_file = 'rag_scores.csv'
semantic_threshold = 0.75

# Read the JSON array produced by the test run above
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

results = []

# Score each item
for item in data:
    query = item.get("query", "")
    assistant_response = item.get("assistant_response", "")
    ground_truth = item.get("ground_truth", "")

    # Semantic similarity: cosine similarity between the two embeddings
    emb_response = model.encode(assistant_response, convert_to_tensor=True)
    emb_truth = model.encode(ground_truth, convert_to_tensor=True)
    similarity = util.cos_sim(emb_response, emb_truth).item()

    # ROUGE-L F1; the scorer's signature is score(target, prediction)
    rouge_f1 = rouge.score(ground_truth, assistant_response)['rougeL'].fmeasure

    # Pass/fail is decided on semantic similarity alone
    status = "PASS" if similarity >= semantic_threshold else "FAIL"

    results.append({
        "query": query,
        "semantic_similarity": round(similarity, 4),
        "rougeL_f1": round(rouge_f1, 4),
        "status": status
    })

# Write per-item scores to CSV
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["query", "semantic_similarity", "rougeL_f1", "status"])
    writer.writeheader()
    writer.writerows(results)

print(f"Scores saved to '{output_file}'")
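# A small follow-up sketch: aggregate statistics over the per-item scores
# computed above. It uses only the keys this script writes into `results`;
# the pass/fail threshold and field names match the scoring loop.
if results:
    pass_count = sum(1 for r in results if r["status"] == "PASS")
    mean_sim = sum(r["semantic_similarity"] for r in results) / len(results)
    mean_rouge = sum(r["rougeL_f1"] for r in results) / len(results)
    print(f"Pass rate: {pass_count}/{len(results)} ({pass_count / len(results):.1%})")
    print(f"Mean semantic similarity: {mean_sim:.4f} | Mean ROUGE-L F1: {mean_rouge:.4f}")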