Upload 4 files
- retriever/chunk_documents.py +24 -24
- retriever/embed_documents.py +98 -96
- retriever/load_selected_datasets.py +41 -0
- retriever/retrieve_documents.py +85 -85
retriever/chunk_documents.py
CHANGED (+24 −24; the before and after texts are identical, so the file is shown once)

from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib

def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    seen_hashes = set()  # Track hashes of chunks to avoid duplicates

    for data in dataset:
        text_list = data['documents']
        for text in text_list:
            chunks = text_splitter.split_text(text)
            for i, chunk in enumerate(chunks):
                # Generate a unique hash for the chunk
                chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

                # Skip if the chunk is a duplicate
                if chunk_hash in seen_hashes:
                    continue

                # Add the chunk to the documents list and track its hash
                documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
                seen_hashes.add(chunk_hash)

    return documents
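A minimal way to exercise the chunker (the input row below is hypothetical; chunk_documents only assumes each row carries 'question' and 'documents' keys):

# Hypothetical input row; real rows come from data.load_dataset.load_data
dataset = [{'question': 'What is the notice period?',
            'documents': ['First long contract text ...', 'Second document text ...']}]

chunks = chunk_documents(dataset, chunk_size=1000, chunk_overlap=200)
print(len(chunks), chunks[0]['source'])  # e.g. 2 "What is the notice period?_chunk_0"

Note that the source field reuses the row's question, so every chunk stays traceable to the record it came from.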
retriever/embed_documents.py
CHANGED (+98 −96; the default index and metadata paths move under /persistent/embeddings/, which is now created up front)

'''import os
import logging
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from config import ConfigConstants

def embed_documents(documents, embedding_path="embeddings.faiss"):
    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)

    if os.path.exists(embedding_path):
        logging.info("Loading embeddings from local file")
        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
    else:
        logging.info("Generating and saving embeddings")
        vector_store = FAISS.from_texts([doc['text'] for doc in documents], embedding_model)
        vector_store.save_local(embedding_path)

    return vector_store'''

import os
import json
import logging
import hashlib
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from config import ConfigConstants


def embed_documents(documents: List[Dict], embedding_path: str = "/persistent/embeddings/embeddings.faiss", metadata_path: str = "/persistent/embeddings/metadata.json") -> FAISS:
    logging.info(f"Total documents received: {len(documents)}")
    os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)

    if os.path.exists(embedding_path) and os.path.exists(metadata_path):
        logging.info("Loading embeddings and metadata from local files")
        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
        existing_metadata = _load_metadata(metadata_path)
    else:
        # FAISS.from_texts needs at least one text, so seed the new index with the
        # first document (or a dummy one) instead of creating it empty
        if documents:
            vector_store = FAISS.from_texts([documents[0]['text']], embedding_model)
            # Record the seed document's hash so the loop below doesn't embed it twice
            existing_metadata = {_generate_document_hash(documents[0]['text']): True}
        else:
            vector_store = FAISS.from_texts(["dummy document"], embedding_model)
            existing_metadata = {}

    # Identify new or modified documents by content hash
    new_documents = []
    for doc in documents:
        doc_hash = _generate_document_hash(doc['text'])
        if doc_hash not in existing_metadata:
            new_documents.append(doc)
            existing_metadata[doc_hash] = True  # Mark as processed

    if new_documents:
        logging.info(f"Generating embeddings for {len(new_documents)} new documents")
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(_embed_single_document, doc, embedding_model) for doc in new_documents]

            for future in tqdm(futures, desc="Generating embeddings", unit="doc"):
                vector_store.add_texts([future.result()])

        # Save the updated index and metadata
        vector_store.save_local(embedding_path)
        _save_metadata(metadata_path, existing_metadata)
    else:
        logging.info("No new documents to process. Using existing embeddings.")

    return vector_store

def _embed_single_document(doc: Dict, embedding_model: HuggingFaceEmbeddings) -> str:
    # Only hands the raw text back; the actual embedding happens inside
    # vector_store.add_texts, so the thread pool above adds no real parallelism
    return doc['text']

def _generate_document_hash(text: str) -> str:
    """Generate a unique hash for a document based on its text."""
    return hashlib.sha256(text.encode()).hexdigest()

def _load_metadata(metadata_path: str) -> Dict[str, bool]:
    """Load the hash metadata from a JSON file, or return an empty dict."""
    if os.path.exists(metadata_path):
        with open(metadata_path, "r") as f:
            return json.load(f)
    return {}

def _save_metadata(metadata_path: str, metadata: Dict[str, bool]):
    """Save the hash metadata to a JSON file."""
    with open(metadata_path, "w") as f:
        json.dump(metadata, f)
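A sketch of the intended call pattern, reusing the hypothetical dataset from the chunking example (paths and model name come from the defaults above):

docs = chunk_documents(dataset)
store = embed_documents(docs)  # first run: seeds the index, embeds every chunk, saves index and hashes
store = embed_documents(docs)  # later runs: loads the saved index, skips chunks whose SHA-256 is in metadata.json

Because metadata.json only maps content hashes to True, an edited chunk hashes to a new value and is re-embedded, but entries for deleted chunks are never pruned from the index.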
retriever/load_selected_datasets.py
ADDED
@@ -0,0 +1,41 @@

import logging
from data.load_dataset import load_data
from retriever.embed_documents import embed_documents
from retriever.chunk_documents import chunk_documents

loaded_datasets = set()  # Keep track of loaded datasets

def load_selected_datasets(selected_datasets, config) -> str:
    """Load, chunk, and embed the selected datasets."""
    global loaded_datasets

    if not selected_datasets:
        return "No dataset selected."

    all_chunked_documents = []
    datasets = {}

    for data_set_name in selected_datasets:
        logging.info(f"Loading dataset: {data_set_name}")
        datasets[data_set_name] = load_data(data_set_name)

        # Larger chunks for CUAD's long contracts, default size otherwise
        chunk_size = 4000 if data_set_name == 'cuad' else 1000

        # Chunk documents
        chunked_documents = chunk_documents(datasets[data_set_name], chunk_size=chunk_size, chunk_overlap=200)
        all_chunked_documents.extend(chunked_documents)
        # Log the running count
        logging.info(f"Total chunked documents: {len(all_chunked_documents)}")

        # Mark dataset as loaded
        loaded_datasets.add(data_set_name)

    # Embed documents
    config.vector_store = embed_documents(all_chunked_documents)
    logging.info("Document embedding completed.")

    # Refresh loaded datasets after loading
    config.loaded_datasets = config.detect_loaded_datasets()

    return loaded_datasets
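A minimal driver sketch; the Config class below is a hypothetical stand-in exposing only the attributes this module touches (the real object lives in config.py):

class Config:
    vector_store = None
    loaded_datasets = set()
    def detect_loaded_datasets(self):
        return self.loaded_datasets  # hypothetical body; the real detection logic is in config.py

config = Config()
names = load_selected_datasets(['cuad'], config)  # CUAD is chunked at 4000 characters, other datasets at 1000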
retriever/retrieve_documents.py
CHANGED (+85 −85; the before and after texts are identical, so the file is shown once)

import logging
import numpy as np
from transformers import pipeline

from config import ConfigConstants

def retrieve_top_k_documents(vector_store, query, top_k=5):
    documents = vector_store.similarity_search(query, k=top_k)
    logging.info(f"Top {top_k} documents retrieved for query")

    #documents = rerank_documents(query, documents)

    return documents

# Reranking: cross-encoder for refining top-k results
def rerank_documents(query, documents):
    """
    Re-rank documents using a cross-encoder model.

    Parameters:
        query (str): The user's query.
        documents (list): List of LangChain Document objects.

    Returns:
        list: Re-ranked list of Document objects with updated scores.
    """
    # Initialize the cross-encoder model
    reranker = pipeline("text-classification", model=ConfigConstants.RE_RANKER_MODEL_NAME, top_k=1)

    # Pair the query with each document's text
    rerank_inputs = [{"text": query, "text_pair": doc.page_content} for doc in documents]

    # Get a relevance score for each query-document pair
    scores = reranker(rerank_inputs)

    # Attach the new scores to the documents
    for doc, score in zip(documents, scores):
        doc.metadata["rerank_score"] = score[0]['score']  # top_k=1 yields a one-item list per input

    # Sort documents by rerank_score in descending order
    documents = sorted(documents, key=lambda x: x.metadata.get("rerank_score", 0), reverse=True)
    logging.info("Re-ranked documents using a cross-encoder model")

    return documents


# Query handling: retrieve top-k candidates by searching the FAISS index directly
# (not used in the main flow; kept for learning purposes)
def retrieve_top_k_documents_manual(vector_store, query, top_k=5):
    """
    Retrieve top-k documents from the FAISS index and rerank them.

    Parameters:
        vector_store (FAISS): The vector store containing the FAISS index and docstore.
        query (str): The user's query string.
        top_k (int): The number of top results to retrieve.

    Returns:
        list: Top-k retrieved and reranked documents.
    """
    # Encode the query into a dense vector
    embedding_model = vector_store.embedding_function
    query_vector = embedding_model.embed_query(query)
    query_vector = np.array([query_vector]).astype('float32')

    # Search the FAISS index for the top_k nearest vectors
    distances, indices = vector_store.index.search(query_vector, top_k)

    # Retrieve the matching documents from the docstore
    documents = []
    for idx in indices.flatten():
        if idx == -1:  # FAISS returns -1 for missing neighbours
            continue
        doc_id = vector_store.index_to_docstore_id[idx]

        # Access the internal dictionary of InMemoryDocstore
        internal_docstore = getattr(vector_store.docstore, "_dict", None)
        if internal_docstore and doc_id in internal_docstore:
            document = internal_docstore[doc_id]
            documents.append(document)

    # Rerank the documents
    documents = rerank_documents(query, documents)

    return documents
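Finally, a short query sketch against the store built earlier (the question text is illustrative):

query = "What is the notice period?"
docs = retrieve_top_k_documents(config.vector_store, query, top_k=5)
docs = rerank_documents(query, docs)  # optional cross-encoder pass
for doc in docs:
    print(doc.metadata.get("rerank_score"), doc.page_content[:80])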