Spaces:

airabbitX
/

mongo-vector-search-util

Sleeping

App Files Files Community

mongo-vector-search-util / embedding_utils.py

airabbitX

Upload 7 files

7301668 verified about 2 months ago

raw

history blame contribute delete

6.26 kB

	from typing import List, Tuple
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from pymongo import UpdateOne
	from pymongo.collection import Collection
	import math
	import time
	import logging

	# Configure logging.
	# NOTE: basicConfig runs as an import-time side effect of this module; it sets up the
	# root logger at INFO level (it is a no-op if the root logger already has handlers).
	logging.basicConfig(level=logging.INFO)
	# Module-level logger shared by every function in this file.
	logger = logging.getLogger(__name__)

	def get_embedding(text: str, openai_client, model="text-embedding-ada-002", max_retries=3) -> list[float]:
	    """Return the embedding vector for ``text`` using the OpenAI embeddings API.

	    Newlines in ``text`` are replaced with spaces before the request is sent.
	    A failed request is retried, sleeping ``2 ** attempt`` seconds (1, 2, 4, ...)
	    between attempts.

	    Args:
	        text: The text to embed.
	        openai_client: Client object exposing ``embeddings.create(input=..., model=...)``.
	        model: Name of the embedding model to request.
	        max_retries: Total number of attempts. Values below 1 are treated as 1 so
	            the request is always tried at least once.

	    Returns:
	        The ``embedding`` of the first item in the response's ``data``.

	    Raises:
	        Exception: Whatever the final attempt raised, re-raised unchanged.
	    """
	    text = text.replace("\n", " ")

	    # Always attempt at least once: with a non-positive max_retries the loop below
	    # would otherwise be skipped and the function would silently return None.
	    attempts = max(1, max_retries)

	    for attempt in range(attempts):
	        try:
	            resp = openai_client.embeddings.create(
	                input=[text],
	                model=model
	            )
	            return resp.data[0].embedding
	        except Exception as e:
	            if attempt == attempts - 1:
	                # Out of attempts: surface the original error to the caller.
	                raise
	            error_details = f"{type(e).__name__}: {str(e)}"
	            if hasattr(e, 'response'):
	                error_details += f"\nResponse: {e.response.text if hasattr(e.response, 'text') else 'No response text'}"
	            logger.warning(f"Embedding API error (attempt {attempt + 1}/{attempts}):\n{error_details}")
	            time.sleep(2 ** attempt)  # Exponential backoff

	def process_batch(docs: List[dict], field_name: str, embedding_field: str, openai_client) -> List[Tuple[str, list]]:
	    """Generate embeddings for the documents in ``docs`` that still need one.

	    A document is skipped when it already contains ``embedding_field``, when it
	    has no ``field_name`` key, or when the value under ``field_name`` is not a
	    string.

	    Args:
	        docs: Documents to process; each processed document must have an ``_id``.
	        field_name: Key holding the text to embed.
	        embedding_field: Key under which an existing embedding would be stored.
	        openai_client: Client passed through to ``get_embedding``.

	    Returns:
	        A list of ``(_id, embedding)`` pairs, one per document that was embedded.

	    Raises:
	        Exception: Anything ``get_embedding`` raises propagates and ends the batch.
	    """
	    logger.info(f"Processing batch of {len(docs)} documents")
	    results = []
	    for doc in docs:
	        # Skip if embedding already exists
	        if embedding_field in doc:
	            continue

	        # .get() rather than indexing: one document without the text field must not
	        # raise KeyError and throw away the embeddings already computed for this batch.
	        text = doc.get(field_name)
	        if not isinstance(text, str):
	            continue

	        embedding = get_embedding(text, openai_client)
	        results.append((doc["_id"], embedding))
	    return results

	def process_futures(futures: List, collection: Collection, embedding_field: str, processed: int, total_docs: int, callback=None, timeout=30) -> int:
	    """Write the results of finished batch futures to MongoDB and report progress.

	    Futures are consumed in completion order. For each future with a non-empty
	    result, one ``UpdateOne`` per ``(doc_id, embedding)`` pair is built (setting
	    ``embedding_field``) and the whole set is sent in a single ``bulk_write``.
	    An exception raised by a future, by the bulk write or by ``callback`` is
	    logged and that future is skipped; the remaining futures are still handled.

	    Args:
	        futures: Futures whose results are iterables of ``(doc_id, embedding)``.
	        collection: Collection receiving the bulk updates.
	        embedding_field: Document field the embedding is written to.
	        processed: Count of documents written before this call; used only as the
	            base for progress reporting.
	        total_docs: Total document count used to compute the progress percentage.
	        callback: Optional ``callback(percent, done, total_docs)`` invoked after
	            each successful bulk write.
	        timeout: Seconds to wait for all futures to finish, passed to
	            ``as_completed``; ``None`` waits without limit. Defaults to 30.

	    Returns:
	        Number of update operations written during this call.

	    Raises:
	        TimeoutError: Raised by ``as_completed`` when the futures do not all finish
	            within ``timeout``. It is not caught here, so the count of writes
	            already performed in this call is not returned in that case.
	    """
	    completed = 0
	    for future in as_completed(futures, timeout=timeout):
	        try:
	            results = future.result()
	            if results:
	                bulk_ops = [
	                    UpdateOne({"_id": doc_id}, {"$set": {embedding_field: embedding}})
	                    for doc_id, embedding in results
	                ]
	                if bulk_ops:
	                    collection.bulk_write(bulk_ops)
	                    completed += len(bulk_ops)

	                    # Update progress
	                    if callback:
	                        progress = ((processed + completed) / total_docs) * 100
	                        callback(progress, processed + completed, total_docs)
	        except Exception as e:
	            # Best effort: a failed batch is logged and the other futures still run.
	            error_details = f"{type(e).__name__}: {str(e)}"
	            if hasattr(e, 'response'):
	                error_details += f"\nResponse: {e.response.text if hasattr(e.response, 'text') else 'No response text'}"
	            logger.error(f"Error processing future:\n{error_details}")
	    return completed

	def parallel_generate_embeddings(
	    collection: Collection,
	    cursor,
	    field_name: str,
	    embedding_field: str,
	    openai_client,
	    total_docs: int,
	    batch_size: int = 10,  # Reduced initial batch size
	    callback=None
	) -> int:
	    """Generate embeddings for the documents yielded by ``cursor`` using a thread pool.

	    Documents are grouped into batches that are submitted to ``process_batch`` on
	    a ``ThreadPoolExecutor``. Once ``max_workers`` batches have been submitted they
	    are flushed through ``process_futures``. After a flush that wrote something,
	    the batch size grows by 5 (up to 30) and the worker count by 2 (up to 20);
	    after a flush that raised, they shrink by 5 and 2 (down to 5 each) and the
	    same futures are kept so the next flush processes them again.

	    Args:
	        collection: Collection the embeddings are written to.
	        cursor: Iterable of documents to process.
	        field_name: Document field holding the text to embed.
	        embedding_field: Document field the embedding is written to.
	        openai_client: Client passed through to ``process_batch``.
	        total_docs: Expected number of documents, used for progress reporting;
	            ``0`` returns immediately.
	        batch_size: Initial number of documents per batch.
	        callback: Optional ``callback(percent, done, total_docs)`` progress hook;
	            called once with ``(0, 0, total_docs)`` before work starts.

	    Returns:
	        Sum of the counts returned by the ``process_futures`` calls that succeeded.
	    """
	    if total_docs == 0:
	        return 0

	    processed = 0
	    current_batch_size = batch_size
	    max_workers = 10  # Start with fewer workers
	    worker_cap = 20

	    logger.info(f"Starting embedding generation for {total_docs} documents")
	    if callback:
	        callback(0, 0, total_docs)

	    batch = []
	    futures = []

	    # A ThreadPoolExecutor's size is fixed when it is created, so the pool is sized
	    # to the cap. `max_workers` then limits how many batches are submitted before
	    # each flush, which is what lets the "workers" adjustments below take effect.
	    with ThreadPoolExecutor(max_workers=worker_cap) as executor:
	        for doc in cursor:
	            batch.append(doc)
	            if len(batch) < current_batch_size:
	                continue

	            logger.info(f"Submitting batch of {len(batch)} documents (batch size: {current_batch_size})")
	            # `batch` is rebound right after submission, so no defensive copy is needed.
	            futures.append(executor.submit(process_batch, batch, field_name, embedding_field, openai_client))
	            batch = []

	            # Process completed futures more frequently
	            if len(futures) < max_workers:
	                continue

	            try:
	                completed = process_futures(futures, collection, embedding_field, processed, total_docs, callback)
	                processed += completed
	                futures = []  # Clear processed futures

	                # Gradually increase batch size and workers if processing is successful
	                if completed > 0:
	                    current_batch_size = min(current_batch_size + 5, 30)
	                    max_workers = min(max_workers + 2, worker_cap)
	                    logger.info(f"Increased batch size to {current_batch_size}, workers to {max_workers}")
	            except Exception as e:
	                # `futures` is intentionally left untouched here: the next flush
	                # re-processes these futures instead of dropping their results.
	                logger.error(f"Error processing futures: {str(e)}")
	                # Reduce batch size and workers on error
	                current_batch_size = max(5, current_batch_size - 5)
	                max_workers = max(5, max_workers - 2)
	                logger.info(f"Reduced batch size to {current_batch_size}, workers to {max_workers}")

	        # Process remaining batch
	        if batch:
	            logger.info(f"Processing final batch of {len(batch)} documents")
	            futures.append(executor.submit(process_batch, batch, field_name, embedding_field, openai_client))

	    # Process remaining futures. This runs after the `with` block, whose exit waits
	    # for every submitted batch, so all futures are already finished and a slow
	    # batch can no longer time out and have its computed embeddings discarded.
	    if futures:
	        try:
	            completed = process_futures(futures, collection, embedding_field, processed, total_docs, callback)
	            processed += completed
	        except Exception as e:
	            logger.error(f"Error processing final futures: {str(e)}")

	    logger.info(f"Completed embedding generation. Processed {processed}/{total_docs} documents")
	    return processed