from datasets import load_dataset

ds = load_dataset("neural-bridge/rag-dataset-12000")
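# Each example in the "train" split provides "context", "question", and "answer" fields,
# used below as the retrieval corpus, the query, and the ground truth respectively.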

# Test the RAG system on the neural-bridge RAG dataset
import csv
import gc
import json
import os

import dotenv
import torch  # For clearing the CUDA cache when a GPU is available
from langchain.memory import ConversationBufferMemory
from langchain_community.tools.tavily_search import TavilySearchResults
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util

from development_scripts.preprocessing import (
    model_selection,
    create_embeddings,
    build_faiss_index,
    retrieve_similar_chunks,
    agentic_rag,
)

# Configuration parameters
SAMPLE_SIZE = 80  # Number of documents to process per run
BATCH_SIZE = 1    # Save results after every BATCH_SIZE iterations
OUTPUT_FILE = 'rag_test_output.json'

# Load API keys from .env before constructing tools that read them (e.g. TAVILY_API_KEY)
dotenv.load_dotenv()
tools = [TavilySearchResults(max_results=5)]

# Simple character-based chunking function
def chunk_text(text, max_length=250):
    # Split the text into chunks of at most max_length characters, each tagged with a chunk_id
    chunks = []
    for i in range(0, len(text), max_length):
        chunk = text[i:i + max_length]
        chunks.append({"text": chunk, "metadata": {"chunk_id": i // max_length}})
    return chunks
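# Note: chunking is by character count rather than tokens, so chunk boundaries may split words.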

# Function to clear memory
def clear_memory():
    gc.collect()  # Run garbage collector
    if torch.cuda.is_available():  # If using GPU
        torch.cuda.empty_cache()  # Clear CUDA cache

# Initialize or load output data
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r') as f:
        try:
            output_data = json.load(f)
            start_idx = len(output_data)  # Resume from where we left off
            print(f"Resuming from index {start_idx}")
        except json.JSONDecodeError:
            output_data = []  # Start fresh if file is corrupted
            start_idx = 0
else:
    output_data = []  # Start fresh if file doesn't exist
    start_idx = 0

# Process up to SAMPLE_SIZE documents, starting from the resume index
try:
    for i in range(start_idx, min(start_idx + SAMPLE_SIZE, len(ds['train']))):
        print(f"Processing document {i}/{min(start_idx + SAMPLE_SIZE, len(ds['train']))}")
        
        # Load the LLM, the current document's context, and the embedding model
        # (all are deleted again at the end of the iteration to keep memory bounded)
        llm = model_selection("meta-llama/llama-4-scout-17b-16e-instruct")
        current_context_text = ds['train'][i]['context']
        model = SentenceTransformer('BAAI/bge-large-en-v1.5')
        
        # Process text and create embeddings
        chunks = chunk_text(current_context_text, max_length=100)
        embeddings, chunks = create_embeddings(chunks, model)
        index = build_faiss_index(embeddings)
        query = ds['train'][i]['question']
        
        # Retrieve similar chunks
        similar_chunks = retrieve_similar_chunks(query, index, chunks, model, k=5)
        agent_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        # Run RAG system
        print(f"Query: {query}")
        response = agentic_rag(llm, tools, query=query, context_chunks=similar_chunks, memory=agent_memory, Use_Tavily=False)
        
        print("Assistant:", response["output"])
        print("Ground Truth:", ds['train'][i]['answer'])
        print("==="*50)
        
        # Store the results
        output_data.append({
            "query": query,
            "assistant_response": response["output"],
            "ground_truth": ds['train'][i]['answer'],
            "context": current_context_text
        })
        
        # Save results periodically so progress is not lost if a later document fails
        if (i + 1) % BATCH_SIZE == 0 or i == min(start_idx + SAMPLE_SIZE, len(ds['train'])) - 1:
            with open(OUTPUT_FILE, 'w') as f:
                json.dump(output_data, f, indent=4)
            print(f"\nSaved results for {len(output_data)} documents to {OUTPUT_FILE}")
        
        # Clear memory
        del llm, current_context_text, model, chunks, embeddings, index, similar_chunks, response
        clear_memory()
        
except Exception as e:
    print(f"Error occurred at document index {i}: {str(e)}")
    # Save whatever results we have so far
    with open(OUTPUT_FILE, 'w') as f:
        json.dump(output_data, f, indent=4)
    print(f"\nSaved partial results for {len(output_data)} documents to {OUTPUT_FILE}")
    
print(f"\nCompleted processing {len(output_data)} documents. Results saved to {OUTPUT_FILE}")


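# Evaluation: score each assistant response against its ground-truth answer
# using embedding cosine similarity and ROUGE-L F1.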
# Load the embedding model and ROUGE scorer
model = SentenceTransformer('BAAI/bge-large-en-v1.5')
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# File paths and pass/fail threshold
input_file = 'rag_test_output.json'
output_file = 'rag_scores.csv'
semantic_threshold = 0.75  # Minimum cosine similarity counted as a PASS

# Read JSON array
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

results = []

# Score each item
for item in data:
    query = item.get("query", "")
    assistant_response = item.get("assistant_response", "")
    ground_truth = item.get("ground_truth", "")
    context = item.get("context", "")  # Not scored; kept for reference only

    # Compute semantic similarity
    emb_response = model.encode(assistant_response, convert_to_tensor=True)
    emb_truth = model.encode(ground_truth, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(emb_response, emb_truth).item()

    # Compute ROUGE-L F1 (RougeScorer.score expects the reference first, then the prediction)
    rouge_score = rouge.score(ground_truth, assistant_response)['rougeL'].fmeasure

    # Final status
    status = "PASS" if similarity >= semantic_threshold else "FAIL"

    results.append({
        "query": query,
        "semantic_similarity": round(similarity, 4),
        "rougeL_f1": round(rouge_score, 4),
        "status": status
    })

# Write results to CSV
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["query", "semantic_similarity", "rougeL_f1", "status"])
    writer.writeheader()
    writer.writerows(results)

print(f"Scores saved to '{output_file}'")