Spaces:

anindya-hf-2002
/

Research-and-RAG-Assistant

Sleeping

File size: 8,491 Bytes

db17bc0

from src.data_processing.loader import MultiFormatDocumentLoader
from src.data_processing.chunker import SDPMChunker, BGEM3Embeddings

import pandas as pd
from typing import List, Dict, Any
from pinecone import Pinecone, ServerlessSpec
import time
from tqdm import tqdm
from dotenv import load_dotenv
import os


# Must run before the os.getenv() call below so that variables defined in a
# local .env file (if one exists) are visible in the process environment.
load_dotenv()

# API Keys
# None when PINECONE_API_KEY is not set; main() passes the value straight to
# the Pinecone client constructor.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Single shared embedding model, instantiated at import time and reused by
# process_chunks(), ingest_data() and get_retriever().
embedding_model = BGEM3Embeddings(model_name="BAAI/bge-m3")


def load_documents(file_paths: List[str], output_path='./data/output.md'):
    """
    Load documents from multiple sources and combine them into a single markdown file.

    Every document yielded by ``MultiFormatDocumentLoader.lazy_load()`` is written
    as a ``---``-delimited block of ``key: value`` metadata lines (YAML-style
    frontmatter; values are written as-is, without escaping) followed by the
    document's page content.

    Args:
        file_paths: Paths of the input documents handed to the loader.
        output_path: Destination markdown file. An existing file is overwritten;
            the parent directory is created when it does not exist yet.

    Returns:
        ``output_path``, so the result can be fed to the next pipeline step.
    """
    loader = MultiFormatDocumentLoader(
        file_paths=file_paths,
        enable_ocr=False,
        enable_tables=True
    )

    # open(..., 'w') does not create intermediate directories, so a fresh
    # checkout without ./data/ would otherwise fail with FileNotFoundError.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Mode 'w' truncates: each call produces a file holding only this run's documents.
    # NOTE: the platform default encoding is used on purpose, matching how
    # process_chunks() reads the file back; change both together if at all.
    with open(output_path, 'w') as f:
        for doc in loader.lazy_load():
            # Add metadata as YAML frontmatter
            f.write('---\n')
            f.writelines(f'{key}: {value}\n' for key, value in doc.metadata.items())
            f.write('---\n\n')
            f.write(doc.page_content)
            f.write('\n\n')

    return output_path

def process_chunks(markdown_path: str, chunk_size: int = 256, 
                  threshold: float = 0.7, skip_window: int = 2):
    """
    Split a markdown file into chunks and return one plain dict per chunk.

    Args:
        markdown_path: Path of the markdown file to read.
        chunk_size: Forwarded to ``SDPMChunker`` as ``chunk_size``.
        threshold: Forwarded to ``SDPMChunker`` as ``threshold``.
        skip_window: Forwarded to ``SDPMChunker`` as ``skip_window``.

    Returns:
        A list of dicts with the keys ``text``, ``token_count``, ``start_index``,
        ``end_index`` and ``num_sentences``. No embedding vectors are included;
        those are computed later, at ingestion time.
    """
    # The chunker uses the module-level embedding model.
    splitter = SDPMChunker(
        embedding_model=embedding_model,
        chunk_size=chunk_size,
        threshold=threshold,
        skip_window=skip_window
    )

    # Load the whole markdown document into memory.
    with open(markdown_path, 'r') as source:
        markdown_text = source.read()

    # Flatten each chunk object into a record that can be stored as a Parquet row.
    return [
        {
            'text': piece.text,
            'token_count': piece.token_count,
            'start_index': piece.start_index,
            'end_index': piece.end_index,
            'num_sentences': len(piece.sentences),
        }
        for piece in splitter.chunk(markdown_text)
    ]

def save_to_parquet(chunks: List[Dict[str, Any]], output_path='./data/chunks.parquet'):
    """
    Save processed chunks to a Parquet file.

    Args:
        chunks: One dict per chunk; the dict keys become the DataFrame columns.
        output_path: Destination Parquet file. The parent directory is created
            when it does not exist yet.

    Returns:
        ``output_path``, so the result can be fed to the ingestion step.
    """
    df = pd.DataFrame(chunks)

    # to_parquet() does not create intermediate directories; without this a
    # fresh checkout lacking ./data/ fails before anything is written.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"Saving to Parquet: {output_path}")
    df.to_parquet(output_path)
    print(f"Saved to Parquet: {output_path}")
    return output_path


class PineconeRetriever:
    """Embed a question and look up the most similar stored chunks in a Pinecone index."""

    def __init__(
        self,
        pinecone_client: Pinecone,
        index_name: str,
        namespace: str,
        embedding_generator: BGEM3Embeddings
    ):
        """Initialize the retriever with Pinecone client and embedding generator.
        
        Args:
            pinecone_client: Initialized Pinecone client
            index_name: Name of the Pinecone index
            namespace: Namespace in the index
            embedding_generator: BGEM3Embeddings instance
        """
        self.pinecone = pinecone_client
        # Resolve the index handle once; every invoke() call reuses it.
        self.index = self.pinecone.Index(index_name)
        self.namespace = namespace
        self.embedding_generator = embedding_generator
    
    def invoke(self, question: str, top_k: int = 5):
        """Retrieve similar documents for a question.
        
        Args:
            question: Query string
            top_k: Number of results to return
            
        Returns:
            List of dictionaries, one per match, with the keys ``page_content``
            (the match's ``"text"`` metadata, or ``""`` when the match carries
            no such metadata) and ``score``.
        """
        # Generate embedding for the question
        question_embedding = self.embedding_generator.embed(question)
        # Array-like results are converted to a plain list for the query;
        # a result that is already a list is passed through unchanged instead
        # of failing on a missing .tolist().
        if hasattr(question_embedding, "tolist"):
            question_embedding = question_embedding.tolist()

        # Query Pinecone
        results = self.index.query(
            namespace=self.namespace,
            vector=question_embedding,
            top_k=top_k,
            include_values=False,
            include_metadata=True
        )
        
        # Format results. Vectors stored without metadata (or without a
        # "text" entry) must not abort the whole retrieval.
        retrieved_docs = []
        for match in results.matches:
            metadata = match.metadata or {}
            retrieved_docs.append(
                {"page_content": metadata.get("text", ""), "score": match.score}
            )
        
        return retrieved_docs

def ingest_data(
    pc,
    parquet_path: str,
    text_column: str,
    pinecone_client: Pinecone,
    index_name= "vector-index",
    namespace= "rag",
    batch_size: int = 100
):
    """Ingest data from a Parquet file into Pinecone.

    The index is created (1024 dimensions, cosine metric, AWS ``us-east-1``
    serverless) when it does not exist, then the rows are embedded with the
    module-level ``embedding_model`` and upserted batch by batch.

    Vector IDs are the DataFrame index labels converted to strings, so
    ingesting another Parquet file with the same labels into the same
    namespace writes to the same IDs.
    
    Args:
        pc: Unused. Kept only so existing positional callers keep working;
            all Pinecone access goes through ``pinecone_client``.
        parquet_path: Path to the Parquet file
        text_column: Name of the column containing text data
        pinecone_client: Initialized Pinecone client
        index_name: Name of the Pinecone index
        namespace: Namespace in the index
        batch_size: Batch size for processing
    """
    # Read Parquet file
    print(f"Reading Parquet file: {parquet_path}")
    df = pd.read_parquet(parquet_path)
    print(f"Total records: {len(df)}")
    # Create or get index
    if not pinecone_client.has_index(index_name):
        pinecone_client.create_index(
            name=index_name,
            dimension=1024,  # BGE-M3 dimension
            metric="cosine",
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )
        
        # Wait for index to be ready (polls once per second, no upper bound)
        while not pinecone_client.describe_index(index_name).status['ready']:
            time.sleep(1)
    
    index = pinecone_client.Index(index_name)
    
    # Process in batches
    for i in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[i:i+batch_size]
        
        # Generate embeddings for batch
        texts = batch_df[text_column].tolist()
        embeddings = embedding_model.embed_batch(texts)
        print(f"embeddings for batch: {i}")

        # Prepare records for upsert. The texts and index labels are already
        # at hand, so there is no need to materialise a Series per row.
        records = []
        for idx, (row_id, text) in enumerate(zip(batch_df.index, texts)):
            vector = embeddings[idx]
            # Send plain lists, mirroring the tolist() conversion the
            # retriever applies to query embeddings.
            if hasattr(vector, "tolist"):
                vector = vector.tolist()
            records.append({
                "id": str(row_id),  # Using DataFrame index as ID
                "values": vector,
                "metadata": {"text": text}
            })
        
        # Upsert to Pinecone
        index.upsert(vectors=records, namespace=namespace)
        
        # Small delay to handle rate limits
        time.sleep(0.5)

def get_retriever(
    pinecone_client: Pinecone,
    index_name= "vector-index",
    namespace= "rag"
):
    """Build a PineconeRetriever wired to the module-level embedding model.
    
    Args:
        pinecone_client: Initialized Pinecone client
        index_name: Name of the Pinecone index to query
        namespace: Namespace inside that index
        
    Returns:
        A PineconeRetriever for the given index and namespace
    """
    # Gather the constructor arguments first, then build the retriever in one go.
    settings = {
        "pinecone_client": pinecone_client,
        "index_name": index_name,
        "namespace": namespace,
        "embedding_generator": embedding_model,
    }
    retriever = PineconeRetriever(**settings)
    return retriever
    
def main():
    """Script entry point: run a retrieval smoke test against the Pinecone index.

    The ingestion steps (load -> chunk -> Parquet -> Pinecone) are currently
    disabled and kept below as comments; only the retrieval test runs. Any
    exception raised inside the pipeline section is reported on stdout and
    not re-raised.
    """
    # Initialize Pinecone client
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Input files for the (disabled) ingestion steps
    file_paths = [
        # './data/2404.19756v1.pdf',
        # './data/OD429347375590223100.pdf',
        # './data/Project Report Format.docx',
        './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
    ]

    # Process pipeline
    try:
        # Steps 1-4 are disabled; uncomment to rebuild the index from file_paths.
        #
        # Step 1: Load and combine documents
        # print("Loading documents...")
        # markdown_path = load_documents(file_paths)
        #
        # Step 2: Process into chunks with embeddings
        # print("Processing chunks...")
        # chunks = process_chunks(markdown_path)
        #
        # Step 3: Save to Parquet
        # print("Saving to Parquet...")
        # parquet_path = save_to_parquet(chunks)
        #
        # Step 4: Ingest into Pinecone
        # print("Ingesting into Pinecone...")
        # ingest_data(
        #     pc,
        #     parquet_path=parquet_path,
        #     text_column="text",
        #     pinecone_client=pc,
        # )

        # Step 5: Test retrieval
        print("\nTesting retrieval...")
        retriever = get_retriever(
            pinecone_client=pc,
            index_name="vector-index",
            namespace="rag"
        )
        hits = retriever.invoke(question="describe the gender based violence", top_k=5)

        # Report the hits, numbered from 1
        for rank, hit in enumerate(hits, start=1):
            print(f"\nResult {rank}:")
            print(f"Content: {hit['page_content']}...")
            print(f"Score: {hit['score']}")

    except Exception as e:
        print(f"Error in pipeline: {str(e)}")


if __name__ == "__main__":
    main()