Gourisankar Padihary committed
Commit: b58a992
Parent(s): 11b4c9f

Fix for other datasets

Files changed:
- data/load_dataset.py +2 -2
- generator/compute_metrics.py +1 -2
- generator/document_utils.py +8 -2
- generator/initialize_llm.py +7 -2
- main.py +8 -5
- retriever/chunk_documents.py +2 -2
data/load_dataset.py
CHANGED
@@ -1,9 +1,9 @@
 import logging
 from datasets import load_dataset
 
-def load_data():
+def load_data(data_set_name):
     logging.info("Loading dataset")
-    dataset = load_dataset("rungalileo/ragbench",
+    dataset = load_dataset("rungalileo/ragbench", data_set_name, split="test")
     logging.info("Dataset loaded successfully")
     logging.info(f"Number of documents found: {dataset.num_rows}")
     return dataset
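A minimal usage sketch of the new load_data signature (the 'techqa' config name and the 'question' field come from the main.py change below; the import path is an assumption based on the file layout):

# Hedged sketch: load one ragbench config by name, as main.py now does.
from data.load_dataset import load_data  # assumed module path (data/load_dataset.py)

dataset = load_data('techqa')        # test split of the named ragbench config
print(dataset.num_rows)              # row count, as logged inside load_data
print(dataset[0]['question'])        # rows expose a 'question' field (used in main.py)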
generator/compute_metrics.py
CHANGED
@@ -32,6 +32,7 @@ def compute_metrics(attributes, total_sentences):
 
 def get_metrics(attributes, total_sentences):
     if attributes.content:
+        #print(attributes)
         result_content = attributes.content  # Access the content attribute
         # Extract the JSON part from the result_content
         json_start = result_content.find("{")
@@ -40,8 +41,6 @@ def get_metrics(attributes, total_sentences):
 
         try:
             result_json = json.loads(json_str)
-            #print(json.dumps(result_json, indent=2))
-
             # Compute metrics using the extracted attributes
             metrics = compute_metrics(result_json, total_sentences)
             print(metrics)
generator/document_utils.py
CHANGED
@@ -7,7 +7,7 @@ class Document:
 
 def apply_sentence_keys_documents(relevant_docs: List[Document]):
     result = []
-    for i, doc in enumerate(relevant_docs):
+    '''for i, doc in enumerate(relevant_docs):
         doc_id = str(i)
         title_passage = doc.page_content.split('\nPassage: ')
         title = title_passage[0]
@@ -19,7 +19,13 @@ def apply_sentence_keys_documents(relevant_docs: List[Document]):
         for j, passage in enumerate(passages):
             doc_result.append([f"{doc_id}{chr(98 + j)}", passage])
 
-    result.append(doc_result)
+    result.append(doc_result)'''
+
+    for relevant_doc_index, relevant_doc in enumerate(relevant_docs):
+        sentences = []
+        for sentence_index, sentence in enumerate(relevant_doc.page_content.split(".")):
+            sentences.append([str(relevant_doc_index)+chr(97 + sentence_index), sentence])
+        result.append(sentences)
 
     return result
 
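For reference, the rewritten apply_sentence_keys_documents keys each document's sentences as 0a, 0b, 0c, ... by splitting page_content on periods. A self-contained sketch (with a hypothetical stand-in for the Document class) of what it produces:

class Doc:
    # stand-in for Document, for illustration only
    def __init__(self, page_content):
        self.page_content = page_content

relevant_docs = [Doc("First sentence. Second sentence."), Doc("Another doc.")]

result = []
for i, doc in enumerate(relevant_docs):
    sentences = []
    for j, sentence in enumerate(doc.page_content.split(".")):
        sentences.append([str(i) + chr(97 + j), sentence])
    result.append(sentences)

print(result)
# [[['0a', 'First sentence'], ['0b', ' Second sentence'], ['0c', '']],
#  [['1a', 'Another doc'], ['1b', '']]]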
generator/initialize_llm.py
CHANGED
@@ -1,12 +1,17 @@
+import logging
 import os
 from langchain_groq import ChatGroq
 
 def initialize_llm():
     os.environ["GROQ_API_KEY"] = "your_groq_api_key"
-
+    model_name = "llama3-8b-8192"
+    llm = ChatGroq(model=model_name, temperature=0.7)
+    logging.info(f'Generation LLM {model_name} initialized')
     return llm
 
 def initialize_validation_llm():
     os.environ["GROQ_API_KEY"] = "your_groq_api_key"
-
+    model_name = "llama-3.1-8b-instant"
+    llm = ChatGroq(model=model_name, temperature=0.7)
+    logging.info(f'Validation LLM {model_name} initialized')
     return llm
main.py
CHANGED
@@ -11,13 +11,17 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(
 
 def main():
     logging.info("Starting the RAG pipeline")
+    data_set_name = 'techqa'
 
     # Load the dataset
-    dataset = load_data()
+    dataset = load_data(data_set_name)
     logging.info("Dataset loaded")
 
     # Chunk the dataset
-
+    chunk_size = 1000 # default value
+    if data_set_name == 'cuad':
+        chunk_size = 3000
+    documents = chunk_documents(dataset, chunk_size)
     logging.info("Documents chunked")
 
     # Embed the documents
@@ -26,17 +30,16 @@ def main():
 
     # Initialize the Generation LLM
     llm = initialize_llm()
-    logging.info("LLM initialized")
 
     # Sample question
-    row_num =
+    row_num = 10
     sample_question = dataset[row_num]['question']
 
     # Call generate_metrics for above sample question
    generate_metrics(llm, vector_store, sample_question)
 
     #Compute RMSE and AUC-ROC for entire dataset
-    compute_rmse_auc_roc_metrics(llm, dataset, vector_store,
+    #compute_rmse_auc_roc_metrics(llm, dataset, vector_store, 10)
 
     logging.info("Finished!!!")
 
retriever/chunk_documents.py
CHANGED
@@ -1,7 +1,7 @@
-from langchain.text_splitter import
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
-    text_splitter =
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     documents = []
     for data in dataset:
         text_list = data['documents']
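A hedged sketch of the splitter that chunk_documents now builds, using the defaults from its signature (main.py overrides chunk_size to 3000 for the 'cuad' dataset); the sample text is an illustration only:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same configuration chunk_documents creates with its default arguments.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_text("some long document text " * 200)
print(len(chunks), max(len(c) for c in chunks))  # chunks of roughly <= 1000 characters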