gourisankar85 committed
Commit 826253d · verified · 1 Parent(s): e73ca1c

Upload 4 files
retriever/chunk_documents.py CHANGED
@@ -1,25 +1,25 @@
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-import hashlib
-
-def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-    documents = []
-    seen_hashes = set()  # Track hashes of chunks to avoid duplicates
-
-    for data in dataset:
-        text_list = data['documents']
-        for text in text_list:
-            chunks = text_splitter.split_text(text)
-            for i, chunk in enumerate(chunks):
-                # Generate a unique hash for the chunk
-                chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
-
-                # Skip if the chunk is a duplicate
-                if chunk_hash in seen_hashes:
-                    continue
-
-                # Add the chunk to the documents list and track its hash
-                documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
-                seen_hashes.add(chunk_hash)
-
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import hashlib
+
+def chunk_documents(dataset, chunk_size=1000, chunk_overlap=200):
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    documents = []
+    seen_hashes = set()  # Track hashes of chunks to avoid duplicates
+
+    for data in dataset:
+        text_list = data['documents']
+        for text in text_list:
+            chunks = text_splitter.split_text(text)
+            for i, chunk in enumerate(chunks):
+                # Generate a unique hash for the chunk
+                chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
+
+                # Skip if the chunk is a duplicate
+                if chunk_hash in seen_hashes:
+                    continue
+
+                # Add the chunk to the documents list and track its hash
+                documents.append({'text': chunk, 'source': f"{data['question']}_chunk_{i}"})
+                seen_hashes.add(chunk_hash)
+
     return documents
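
A minimal usage sketch for chunk_documents, assuming records shaped the way the function expects (a 'documents' list of raw texts plus a 'question' key used as the source label). The record below is made up for illustration; because the passage repeats, some chunks may hash identically and be dropped by the dedup check:

from retriever.chunk_documents import chunk_documents

# Hypothetical record in the expected shape: 'documents' holds raw texts,
# 'question' becomes part of each chunk's source label.
dataset = [
    {
        'question': 'What is FAISS?',
        'documents': ['FAISS is a library for efficient similarity search. ' * 60],
    },
]

chunks = chunk_documents(dataset, chunk_size=1000, chunk_overlap=200)
print(len(chunks), chunks[0]['source'])  # chunk count depends on splitting and dedup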
retriever/embed_documents.py CHANGED
@@ -1,96 +1,98 @@
-'''import os
-import logging
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-
-from config import ConfigConstants
-
-def embed_documents(documents, embedding_path="embeddings.faiss"):
-    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
-
-    if os.path.exists(embedding_path):
-        logging.info("Loading embeddings from local file")
-        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
-    else:
-        logging.info("Generating and saving embeddings")
-        vector_store = FAISS.from_texts([doc['text'] for doc in documents], embedding_model)
-        vector_store.save_local(embedding_path)
-
-    return vector_store'''
-
-import os
-import logging
-import hashlib
-from typing import List, Dict
-from concurrent.futures import ThreadPoolExecutor
-from tqdm import tqdm
-from langchain_community.vectorstores import FAISS
-from langchain_huggingface import HuggingFaceEmbeddings
-from config import ConfigConstants
-
-
-def embed_documents(documents: List[Dict], embedding_path: str = "embeddings.faiss", metadata_path: str = "metadata.json") -> FAISS:
-    logging.info(f"Total documents received: {len(documents)}")
-    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
-
-    if os.path.exists(embedding_path) and os.path.exists(metadata_path):
-        logging.info("Loading embeddings and metadata from local files")
-        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
-        existing_metadata = _load_metadata(metadata_path)
-    else:
-        # Initialize FAISS with at least one document to avoid an IndexError
-        if documents:
-            vector_store = FAISS.from_texts([documents[0]['text']], embedding_model)
-        else:
-            # If no documents are provided, seed the FAISS index with a dummy document
-            vector_store = FAISS.from_texts(["dummy document"], embedding_model)
-        existing_metadata = {}
-
-    # Identify new or modified documents
-    new_documents = []
-    for doc in documents:
-        doc_hash = _generate_document_hash(doc['text'])
-        if doc_hash not in existing_metadata:
-            new_documents.append(doc)
-            existing_metadata[doc_hash] = True  # Mark as processed
-
-    if new_documents:
-        logging.info(f"Generating embeddings for {len(new_documents)} new documents")
-        with ThreadPoolExecutor() as executor:
-            futures = []
-            for doc in new_documents:
-                futures.append(executor.submit(_embed_single_document, doc, embedding_model))
-
-            for future in tqdm(futures, desc="Generating embeddings", unit="doc"):
-                vector_store.add_texts([future.result()])
-
-        # Save updated embeddings and metadata
-        vector_store.save_local(embedding_path)
-        _save_metadata(metadata_path, existing_metadata)
-    else:
-        logging.info("No new documents to process. Using existing embeddings.")
-
-    return vector_store
-
-def _embed_single_document(doc: Dict, embedding_model: HuggingFaceEmbeddings) -> str:
-    return doc['text']  # Pass-through: FAISS.add_texts performs the actual embedding
-
-def _generate_document_hash(text: str) -> str:
-    """Generate a unique hash for a document based on its text."""
-    return hashlib.sha256(text.encode()).hexdigest()
-
-def _load_metadata(metadata_path: str) -> Dict[str, bool]:
-    """Load metadata from a file."""
-    import json
-    if os.path.exists(metadata_path):
-        with open(metadata_path, "r") as f:
-            return json.load(f)
-    return {}
-
-def _save_metadata(metadata_path: str, metadata: Dict[str, bool]):
-    """Save metadata to a file."""
-    import json
-    with open(metadata_path, "w") as f:
-        json.dump(metadata, f)
-
-
+'''import os
+import logging
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+
+from config import ConfigConstants
+
+def embed_documents(documents, embedding_path="embeddings.faiss"):
+    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
+
+    if os.path.exists(embedding_path):
+        logging.info("Loading embeddings from local file")
+        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
+    else:
+        logging.info("Generating and saving embeddings")
+        vector_store = FAISS.from_texts([doc['text'] for doc in documents], embedding_model)
+        vector_store.save_local(embedding_path)
+
+    return vector_store'''
+
+import os
+import logging
+import hashlib
+from typing import List, Dict
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
+from config import ConfigConstants
+
+
+def embed_documents(documents: List[Dict], embedding_path: str = "/persistent/embeddings/embeddings.faiss", metadata_path: str = "/persistent/embeddings/metadata.json") -> FAISS:
+    logging.info(f"Total documents received: {len(documents)}")
+    os.makedirs(os.path.dirname(embedding_path), exist_ok=True)
+    os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
+    embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
+
+    if os.path.exists(embedding_path) and os.path.exists(metadata_path):
+        logging.info("Loading embeddings and metadata from local files")
+        vector_store = FAISS.load_local(embedding_path, embedding_model, allow_dangerous_deserialization=True)
+        existing_metadata = _load_metadata(metadata_path)
+    else:
+        # Initialize FAISS with at least one document to avoid an IndexError
+        if documents:
+            vector_store = FAISS.from_texts([documents[0]['text']], embedding_model)
+        else:
+            # If no documents are provided, seed the FAISS index with a dummy document
+            vector_store = FAISS.from_texts(["dummy document"], embedding_model)
+        existing_metadata = {}
+
+    # Identify new or modified documents
+    new_documents = []
+    for doc in documents:
+        doc_hash = _generate_document_hash(doc['text'])
+        if doc_hash not in existing_metadata:
+            new_documents.append(doc)
+            existing_metadata[doc_hash] = True  # Mark as processed
+
+    if new_documents:
+        logging.info(f"Generating embeddings for {len(new_documents)} new documents")
+        with ThreadPoolExecutor() as executor:
+            futures = []
+            for doc in new_documents:
+                futures.append(executor.submit(_embed_single_document, doc, embedding_model))
+
+            for future in tqdm(futures, desc="Generating embeddings", unit="doc"):
+                vector_store.add_texts([future.result()])
+
+        # Save updated embeddings and metadata
+        vector_store.save_local(embedding_path)
+        _save_metadata(metadata_path, existing_metadata)
+    else:
+        logging.info("No new documents to process. Using existing embeddings.")
+
+    return vector_store
+
+def _embed_single_document(doc: Dict, embedding_model: HuggingFaceEmbeddings) -> str:
+    return doc['text']  # Pass-through: FAISS.add_texts performs the actual embedding
+
+def _generate_document_hash(text: str) -> str:
+    """Generate a unique hash for a document based on its text."""
+    return hashlib.sha256(text.encode()).hexdigest()
+
+def _load_metadata(metadata_path: str) -> Dict[str, bool]:
+    """Load metadata from a file."""
+    import json
+    if os.path.exists(metadata_path):
+        with open(metadata_path, "r") as f:
+            return json.load(f)
+    return {}
+
+def _save_metadata(metadata_path: str, metadata: Dict[str, bool]):
+    """Save metadata to a file."""
+    import json
+    with open(metadata_path, "w") as f:
+        json.dump(metadata, f)
+
+
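
A sketch of how this might be called end to end, reusing the hypothetical record shape from the chunking example; the paths are illustrative stand-ins for the /persistent defaults. Note that _embed_single_document only returns the raw text, so the actual embedding happens inside vector_store.add_texts and the thread pool parallelizes a pass-through, not the embedding itself; note also that on a fresh index the first document seeds the store and is then re-added by the hash loop, so it can appear twice:

import logging
from retriever.chunk_documents import chunk_documents
from retriever.embed_documents import embed_documents

logging.basicConfig(level=logging.INFO)

dataset = [{'question': 'What is FAISS?', 'documents': ['FAISS is a library for efficient similarity search. ' * 60]}]
documents = chunk_documents(dataset)

# First call builds the index and writes embeddings + metadata to disk;
# later calls embed only chunks whose hash is absent from the metadata file.
vector_store = embed_documents(
    documents,
    embedding_path="embeddings/embeddings.faiss",  # illustrative local paths,
    metadata_path="embeddings/metadata.json",      # not the /persistent defaults
)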
retriever/load_selected_datasets.py ADDED
@@ -0,0 +1,41 @@
+import logging
+from data.load_dataset import load_data
+from retriever.embed_documents import embed_documents
+from retriever.chunk_documents import chunk_documents
+
+loaded_datasets = set()  # Keep track of loaded datasets
+
+def load_selected_datasets(selected_datasets, config):
+    """Load, chunk, and embed the selected datasets; returns the set of loaded dataset names."""
+    global loaded_datasets
+
+    if not selected_datasets:
+        return "No dataset selected."
+
+    all_chunked_documents = []
+    datasets = {}
+
+    for data_set_name in selected_datasets:
+        logging.info(f"Loading dataset: {data_set_name}")
+        datasets[data_set_name] = load_data(data_set_name)
+
+        # Set chunk size (larger for the long CUAD contracts)
+        chunk_size = 4000 if data_set_name == 'cuad' else 1000
+
+        # Chunk documents
+        chunked_documents = chunk_documents(datasets[data_set_name], chunk_size=chunk_size, chunk_overlap=200)
+        all_chunked_documents.extend(chunked_documents)
+        # Log the running count
+        logging.info(f"Total chunked documents: {len(all_chunked_documents)}")
+
+        # Mark dataset as loaded
+        loaded_datasets.add(data_set_name)
+
+    # Embed documents
+    config.vector_store = embed_documents(all_chunked_documents)
+    logging.info("Document embedding completed.")
+
+    # Refresh loaded datasets after loading
+    config.loaded_datasets = config.detect_loaded_datasets()
+
+    return loaded_datasets  # alternatively: f"Loaded datasets: {', '.join(loaded_datasets)}"
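
A hedged usage sketch: the function needs a config object exposing vector_store, loaded_datasets, and a detect_loaded_datasets() callable, so a stand-in with those attributes is used here; a real run would pass the application's config, and load_data must be able to fetch the named dataset:

import logging
from types import SimpleNamespace
from retriever.load_selected_datasets import load_selected_datasets

logging.basicConfig(level=logging.INFO)

# Stand-in config with just the attributes the function touches.
config = SimpleNamespace(
    vector_store=None,
    loaded_datasets=set(),
    detect_loaded_datasets=lambda: {'cuad'},  # hypothetical detection result
)

loaded = load_selected_datasets(['cuad'], config)
print(loaded)  # {'cuad'}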
retriever/retrieve_documents.py CHANGED
@@ -1,86 +1,86 @@
-import logging
-import numpy as np
-from transformers import pipeline
-
-from config import ConfigConstants
-
-def retrieve_top_k_documents(vector_store, query, top_k=5):
-    documents = vector_store.similarity_search(query, k=top_k)
-    logging.info(f"Top {top_k} documents retrieved for query")
-
-    #documents = rerank_documents(query, documents)
-
-    return documents
-
-# Reranking: cross-encoder for refining top-k results
-def rerank_documents(query, documents):
-    """
-    Re-rank documents using a cross-encoder model.
-
-    Parameters:
-        query (str): The user's query.
-        documents (list): List of LangChain Document objects.
-    The reranker model name is taken from ConfigConstants.RE_RANKER_MODEL_NAME.
-
-    Returns:
-        list: Re-ranked list of Document objects with updated scores.
-    """
-    # Initialize the cross-encoder model
-    reranker = pipeline("text-classification", model=ConfigConstants.RE_RANKER_MODEL_NAME, top_k=1)
-
-    # Pair the query with each document's text
-    rerank_inputs = [{"text": query, "text_pair": doc.page_content} for doc in documents]
-
-    # Get relevance scores for each query-document pair
-    scores = reranker(rerank_inputs)
-
-    # Attach the new scores to the documents
-    for doc, score in zip(documents, scores):
-        doc.metadata["rerank_score"] = score[0]['score']  # Access the score from the first item in the list
-
-    # Sort documents by rerank_score in descending order
-    documents = sorted(documents, key=lambda x: x.metadata.get("rerank_score", 0), reverse=True)
-    logging.info("Re-ranked documents using a cross-encoder model")
-
-    return documents
-
-
-# Query handling: retrieve top-k candidates from the FAISS index manually (not used in the pipeline; kept for learning purposes)
-def retrieve_top_k_documents_manual(vector_store, query, top_k=5):
-    """
-    Retrieve top-k documents using the FAISS index directly, then rerank them.
-
-    Parameters:
-        vector_store (FAISS): The vector store containing the FAISS index and docstore.
-        query (str): The user's query string.
-        top_k (int): The number of top results to retrieve.
-    The reranker model name is taken from ConfigConstants.RE_RANKER_MODEL_NAME.
-
-    Returns:
-        list: Top-k retrieved and reranked documents.
-    """
-    # Encode the query into a dense vector
-    embedding_model = vector_store.embedding_function
-    query_vector = embedding_model.embed_query(query)  # Encode the query
-    query_vector = np.array([query_vector]).astype('float32')
-
-    # Search the FAISS index for the top_k results
-    distances, indices = vector_store.index.search(query_vector, top_k)
-
-    # Retrieve documents from the docstore
-    documents = []
-    for idx in indices.flatten():
-        if idx == -1:  # FAISS can return -1 for invalid indices
-            continue
-        doc_id = vector_store.index_to_docstore_id[idx]
-
-        # Access the internal dictionary of InMemoryDocstore
-        internal_docstore = getattr(vector_store.docstore, "_dict", None)
-        if internal_docstore and doc_id in internal_docstore:  # Check if doc_id exists
-            document = internal_docstore[doc_id]
-            documents.append(document)
-
-    # Rerank the documents
-    documents = rerank_documents(query, documents)
-
+import logging
+import numpy as np
+from transformers import pipeline
+
+from config import ConfigConstants
+
+def retrieve_top_k_documents(vector_store, query, top_k=5):
+    documents = vector_store.similarity_search(query, k=top_k)
+    logging.info(f"Top {top_k} documents retrieved for query")
+
+    #documents = rerank_documents(query, documents)
+
+    return documents
+
+# Reranking: cross-encoder for refining top-k results
+def rerank_documents(query, documents):
+    """
+    Re-rank documents using a cross-encoder model.
+
+    Parameters:
+        query (str): The user's query.
+        documents (list): List of LangChain Document objects.
+    The reranker model name is taken from ConfigConstants.RE_RANKER_MODEL_NAME.
+
+    Returns:
+        list: Re-ranked list of Document objects with updated scores.
+    """
+    # Initialize the cross-encoder model
+    reranker = pipeline("text-classification", model=ConfigConstants.RE_RANKER_MODEL_NAME, top_k=1)
+
+    # Pair the query with each document's text
+    rerank_inputs = [{"text": query, "text_pair": doc.page_content} for doc in documents]
+
+    # Get relevance scores for each query-document pair
+    scores = reranker(rerank_inputs)
+
+    # Attach the new scores to the documents
+    for doc, score in zip(documents, scores):
+        doc.metadata["rerank_score"] = score[0]['score']  # Access the score from the first item in the list
+
+    # Sort documents by rerank_score in descending order
+    documents = sorted(documents, key=lambda x: x.metadata.get("rerank_score", 0), reverse=True)
+    logging.info("Re-ranked documents using a cross-encoder model")
+
+    return documents
+
+
+# Query handling: retrieve top-k candidates from the FAISS index manually (not used in the pipeline; kept for learning purposes)
+def retrieve_top_k_documents_manual(vector_store, query, top_k=5):
+    """
+    Retrieve top-k documents using the FAISS index directly, then rerank them.
+
+    Parameters:
+        vector_store (FAISS): The vector store containing the FAISS index and docstore.
+        query (str): The user's query string.
+        top_k (int): The number of top results to retrieve.
+    The reranker model name is taken from ConfigConstants.RE_RANKER_MODEL_NAME.
+
+    Returns:
+        list: Top-k retrieved and reranked documents.
+    """
+    # Encode the query into a dense vector
+    embedding_model = vector_store.embedding_function
+    query_vector = embedding_model.embed_query(query)  # Encode the query
+    query_vector = np.array([query_vector]).astype('float32')
+
+    # Search the FAISS index for the top_k results
+    distances, indices = vector_store.index.search(query_vector, top_k)
+
+    # Retrieve documents from the docstore
+    documents = []
+    for idx in indices.flatten():
+        if idx == -1:  # FAISS can return -1 for invalid indices
+            continue
+        doc_id = vector_store.index_to_docstore_id[idx]
+
+        # Access the internal dictionary of InMemoryDocstore
+        internal_docstore = getattr(vector_store.docstore, "_dict", None)
+        if internal_docstore and doc_id in internal_docstore:  # Check if doc_id exists
+            document = internal_docstore[doc_id]
+            documents.append(document)
+
+    # Rerank the documents
+    documents = rerank_documents(query, documents)
+
     return documents
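
A short end-to-end sketch of retrieval plus an explicit reranking pass, built on the functions above; the record and query are illustrative, and the index paths are throwaway local ones:

from retriever.chunk_documents import chunk_documents
from retriever.embed_documents import embed_documents
from retriever.retrieve_documents import retrieve_top_k_documents, rerank_documents

# Build a tiny index from one hypothetical record.
dataset = [{'question': 'termination', 'documents': ['Either party may terminate this agreement with 30 days written notice. ' * 40]}]
vector_store = embed_documents(
    chunk_documents(dataset),
    embedding_path="tmp/embeddings.faiss",
    metadata_path="tmp/metadata.json",
)

query = "What termination rights does the contract grant?"
top_docs = retrieve_top_k_documents(vector_store, query, top_k=3)

# The rerank call is commented out inside retrieve_top_k_documents, so apply
# the cross-encoder pass explicitly when its latency is acceptable.
reranked = rerank_documents(query, top_docs)
for doc in reranked:
    print(doc.metadata.get("rerank_score"), doc.page_content[:80])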