import os
import asyncio
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import torch
from model2vec import StaticModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig

from src.utils.api_key_manager import APIKeyManager
from src.helpers.helper import chunk_text


class LateChunker:
    """Selects the chunks of a long text that are most relevant to a query,
    using embedding similarity and an LLM token budget."""

    def __init__(
        self,
        model_name='minishlab/potion-base-8M',
        max_workers=(os.cpu_count() or 1) * 2,  # os.cpu_count() may return None
        verbose=False
    ):
        self.verbose = verbose

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.llm = APIKeyManager().get_llm()
        self.model_name = model_name

        # Load the embedding model and determine its maximum context length.
        self.model, self.context_length = self._initialize_model()

        # model.encode() is blocking, so it runs in a thread pool to keep the
        # asyncio event loop responsive while chunks are embedded.
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    def _initialize_model(self):
        """Load the embedding model, trying SentenceTransformer first and
        falling back to a Model2Vec StaticModel if that fails."""
        sentence_transformer_error = None
        model2vec_error = None

        # First attempt: load as a SentenceTransformer model.
        try:
            config = AutoConfig.from_pretrained(self.model_name)
            max_length = config.max_position_embeddings

            model = SentenceTransformer(self.model_name, trust_remote_code=True)
            model.max_seq_length = max_length
            model.to(self.device)
            if self.device == "cuda":
                # Use half precision only on GPU; many CPU ops do not support float16.
                model.half()
            context_length = model.max_seq_length
            return model, context_length
        except Exception as e:
            sentence_transformer_error = str(e)

        # Fallback: load as a Model2Vec static embedding model.
        try:
            model = StaticModel.from_pretrained(self.model_name)
            context_length = model.config['seq_length']
            return model, context_length
        except Exception as e:
            model2vec_error = str(e)
            error_msg = (
                f"Failed to load model {self.model_name}.\n"
                f"SentenceTransformer error: {sentence_transformer_error}\n"
                f"Model2Vec error: {model2vec_error}"
            )
            raise Exception(error_msg) from e

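    # Note (assumption): both loading paths yield an object exposing encode(),
    # which is the only model interface the rest of this class relies on, e.g.:
    #
    #     vec = self.model.encode("hello world")   # np.ndarray or torch.Tensor
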
    async def late_chunking(self, text, span_annotations, current_chunk_idx=None, total_chunks=None):
        """Embed each annotated span of `text` and return the list of span embeddings."""
        if self.verbose and current_chunk_idx is not None and total_chunks is not None:
            print(f"Processing chunk {current_chunk_idx + 1}/{total_chunks}...")

        loop = asyncio.get_running_loop()

        chunk_embeddings = []
        for start, end in span_annotations:
            # Named `span_text` to avoid shadowing the imported chunk_text helper.
            span_text = text[start:end]
            if self.verbose:
                print("Generating chunk embeddings...")

            # Run the blocking encode() call in the thread pool.
            embedding = await loop.run_in_executor(
                self.executor, self.model.encode, span_text
            )
            # encode() may return a numpy array (Model2Vec) or a tensor
            # (SentenceTransformer); normalize to a tensor on the target device.
            chunk_embedding = torch.as_tensor(embedding).to(self.device)

            if self.verbose:
                print(f"Chunk embedding shape: {chunk_embedding.shape}")
            chunk_embeddings.append(chunk_embedding)

        if self.verbose:
            print("Late Chunking applied successfully!")
        return chunk_embeddings if chunk_embeddings else None

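    # Note: late_chunking() embeds each span independently. A pooling-based
    # "late chunking" variant would encode a whole macro chunk once and mean-pool
    # the token embeddings falling inside each span. Illustrative sketch only
    # (assumes a SentenceTransformer backend and token-level span indices):
    #
    #     token_embs = self.model.encode(text, output_value="token_embeddings")
    #     span_embedding = token_embs[tok_start:tok_end].mean(dim=0)
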
    def get_text_embedding(self, text):
        """Embed `text` and return the embedding as a tensor on the target device."""
        embeddings = self.model.encode(text)
        if isinstance(embeddings, torch.Tensor):
            return embeddings.detach().to(self.device)
        return torch.as_tensor(embeddings).to(self.device)

    def calculate_embedding_similarities(self, text1_embedding, text2_embedding):
        """Return the cosine similarities between `text1_embedding` and each row of `text2_embedding`."""
        text1_embedding = text1_embedding.cpu().numpy()
        text2_embedding = text2_embedding.cpu().numpy()

        # Ensure both arrays are 2-D: (n_samples, embedding_dim).
        if text1_embedding.ndim == 1:
            text1_embedding = text1_embedding.reshape(1, -1)
        if text2_embedding.ndim == 1:
            text2_embedding = text2_embedding.reshape(1, -1)

        # If the embedding dimensions still disagree, assume one array is transposed.
        if text1_embedding.shape[1] != text2_embedding.shape[1]:
            text1_embedding = text1_embedding.T
        if text2_embedding.shape[1] != text1_embedding.shape[1]:
            text2_embedding = text2_embedding.T

        # Row 0 holds the similarity of text1 against every row of text2.
        return cosine_similarity(text1_embedding, text2_embedding)[0]

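    # Shape example: a query embedding of shape (d,) and stacked chunk embeddings
    # of shape (n, d) yield a similarity vector of shape (n,), one score per chunk.
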
    def select_relevant_chunks(self, similarities, chunks, max_tokens):
        """Greedily pick the chunks most similar to the query until the token budget
        is reached, then return them joined in their original order."""
        sorted_indices = np.argsort(similarities)[::-1]
        selected_chunks = []
        total_tokens = 0

        for i, idx in enumerate(sorted_indices):
            if self.verbose:
                print(f"Considering chunk {i + 1}/{len(sorted_indices)} with similarity {similarities[idx]:.2f}")
            chunk_tokens = self.llm.get_num_tokens(chunks[idx])
            if self.verbose:
                print(f"Chunk tokens: {chunk_tokens}")

            if total_tokens + chunk_tokens > max_tokens:
                if self.verbose:
                    print(
                        f"Adding this chunk would exceed the max tokens allowed "
                        f"({total_tokens + chunk_tokens} > {max_tokens}). Stopping chunk selection."
                    )
                break

            selected_chunks.append((idx, chunks[idx]))
            total_tokens += chunk_tokens

        if self.verbose:
            print("Sorting selected chunks...")
        # Restore the original document order before joining.
        selected_chunks.sort(key=lambda x: x[0])
        if self.verbose:
            print("Selected chunks sorted successfully!")
        return " ".join(chunk for _, chunk in selected_chunks)

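    # Worked example (hypothetical numbers): with similarities [0.9, 0.2, 0.7],
    # per-chunk token counts [900, 500, 800] and max_tokens=2000, chunks are
    # visited in order 0, 2, 1; chunks 0 and 2 fit (1700 tokens), chunk 1 would
    # push the total to 2200, so selection stops and chunks 0 and 2 are returned
    # in their original order.
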
    async def chunker(self, text, query, max_chunk_length=1000, max_tokens=2048, overlap=200):
        """Reduce `text` to the chunks most relevant to `query`, staying within `max_tokens`."""
        total_tokens = self.llm.get_num_tokens(text)

        # If the text already fits in the budget, return it unchanged.
        if total_tokens <= max_tokens:
            if self.verbose:
                print(f"Text is within the max tokens allowed ({total_tokens} <= {max_tokens}). "
                      "Returning original text.")
            return text

        if self.verbose:
            print(f"Text exceeds the max tokens allowed ({total_tokens} > {max_tokens}). Chunking text...")
        chunks, span_annotations = chunk_text(
            text,
            max_chunk_length=max_chunk_length,
            overlap=overlap,
            context_length=min(self.context_length, max_tokens)
        )
        if self.verbose:
            print(f"Text chunked into {len(chunks)} macro chunks.")

        # Embed every macro chunk concurrently.
        tasks = []
        for i, macro_chunk in enumerate(chunks):
            # Re-express the global span annotations relative to this macro chunk.
            start_offset = span_annotations[i][0]
            adjusted_spans = [
                (start - start_offset, end - start_offset)
                for start, end in span_annotations
                if start >= start_offset and end <= start_offset + len(macro_chunk)
            ]
            tasks.append(self.late_chunking(macro_chunk, adjusted_spans, i, len(chunks)))

        results = await asyncio.gather(*tasks)
        # Use the first span embedding of each macro chunk as that chunk's representative.
        chunk_embeddings = torch.stack([result[0] for result in results])

        if self.verbose:
            print("Generating query embedding...")
        query_embedding = self.get_text_embedding(query)
        if self.verbose:
            print(f"Query embedding shape: {query_embedding.shape}")

        if self.verbose:
            print("Calculating embedding similarities...")
        similarities = self.calculate_embedding_similarities(query_embedding, chunk_embeddings)
        if self.verbose:
            print(f"Similarities shape: {similarities.shape}")

        if self.verbose:
            print("Selecting relevant chunks...")
        return self.select_relevant_chunks(similarities, chunks, max_tokens)

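    # Usage sketch (illustrative; assumes the API keys and the src.helpers.helper
    # chunk_text helper are configured as in this repository):
    #
    #     chunker = LateChunker(verbose=True)
    #     condensed = asyncio.run(
    #         chunker.chunker(long_text, query="What is this text about?", max_tokens=2048)
    #     )
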
if __name__ == "__main__":
    from sklearn.feature_extraction.text import TfidfVectorizer
    from src.reasoning.reasoner import Reasoner
    from src.search.search_engine import SearchEngine
    from src.crawl.crawler import CustomCrawler
    import time

    search_engine = SearchEngine()
    crawler = CustomCrawler()
    reasoner = Reasoner()
    chunking = LateChunker(verbose=True)

    loop = asyncio.new_event_loop()

    # Gather URLs from several climate-related searches.
    search1 = loop.run_until_complete(search_engine.search(
        "What is the history of climate change and pollution since the pre-industrial revolution?",
        num_results=20,
        exclude_filetypes=["pdf"]
    ))
    urls = [result["link"] for result in search1]
    search2 = loop.run_until_complete(search_engine.search(
        "What is the impact of climate change on the Indian economy?",
        num_results=20,
        exclude_filetypes=["pdf"]
    ))
    urls.extend([result["link"] for result in search2])
    search3 = loop.run_until_complete(search_engine.search(
        "What are some of the latest, state-of-the-art techniques used to fight climate change?",
        num_results=20,
        exclude_filetypes=["pdf"]
    ))
    urls.extend([result["link"] for result in search3])
    search4 = loop.run_until_complete(search_engine.search(
        "What does the projection for climate change look like in the next 50 years?",
        num_results=20,
        exclude_filetypes=["pdf"]
    ))
    urls.extend([result["link"] for result in search4])
    search5 = loop.run_until_complete(search_engine.search(
        "What efforts are being made by governments all around the world to combat climate change?",
        num_results=20,
        exclude_filetypes=["pdf"]
    ))
    urls.extend([result["link"] for result in search5])

    # Crawl every result page and concatenate the contents into one document.
    results = loop.run_until_complete(crawler.fetch_page_contents(
        urls=urls,
        max_attempts=1,
        delay=0
    ))
    text = "\n".join([f"Document {i}:\n{result}\n" for i, result in enumerate(results)])

    num_tokens_before_chunking = chunking.llm.get_num_tokens(text)
    start_time = time.perf_counter()
    response = loop.run_until_complete(chunking.chunker(
        text,
        query="What is this text about? Give me a detailed answer",
        max_tokens=128000
    ))
    end_time = time.perf_counter()
    num_tokens_after_chunking = chunking.llm.get_num_tokens(response)

    print(f"\nResponse:\n{response}")
    print(f"\nNumber of URLs: {len(urls)}")
    print(f"\nNumber of tokens before late chunking: {num_tokens_before_chunking}")
    print(f"\nNumber of tokens after late chunking: {num_tokens_after_chunking}")
    print(f"\nTime taken: {end_time - start_time:.2f} seconds")

    def calculate_cosine_similarity(text1, text2):
        vectorizer = TfidfVectorizer().fit_transform([text1, text2])
        vectors = vectorizer.toarray()
        return cosine_similarity(vectors)[0][1]

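    # Note: this TF-IDF cosine similarity measures lexical overlap between the
    # original text and the selected chunks, not semantic similarity; a high score
    # mainly indicates that the selected chunks reuse the original vocabulary.
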
    similarity = calculate_cosine_similarity(text, response)
    print(f"\nCosine similarity between original text and late chunked text: {similarity * 100:.2f}%")