Spaces:

airabbitX
/

mongo-vector-search-util

Running

App Files Files Community

airabbitX committed on Jan 27

Commit

bbccab6

verified ·

1 Parent(s): 787933d

Upload 7 files

Browse files

Files changed (2) hide show

embedding_utils.py +90 -69
run.sh +1 -0

embedding_utils.py CHANGED Viewed

@@ -1,20 +1,38 @@
 from typing import List, Tuple
-from concurrent.futures import ThreadPoolExecutor
 from pymongo import UpdateOne
 from pymongo.collection import Collection
 import math
-def get_embedding(text: str, openai_client, model="text-embedding-ada-002") -> list[float]:
-    """Get embeddings for given text using OpenAI API"""
     text = text.replace("\n", " ")
-    resp = openai_client.embeddings.create(
-        input=[text],
-        model=model
-    )
-    return resp.data[0].embedding
 def process_batch(docs: List[dict], field_name: str, embedding_field: str, openai_client) -> List[Tuple[str, list]]:
     """Process a batch of documents to generate embeddings"""
     results = []
     for doc in docs:
         # Skip if embedding already exists
@@ -27,6 +45,32 @@ def process_batch(docs: List[dict], field_name: str, embedding_field: str, opena
             results.append((doc["_id"], embedding))
     return results
 def parallel_generate_embeddings(
     collection: Collection,
     cursor,
@@ -34,89 +78,66 @@ def parallel_generate_embeddings(
     embedding_field: str,
     openai_client,
     total_docs: int,
-    batch_size: int = 20,
     callback=None
 ) -> int:
-    """Generate embeddings in parallel using ThreadPoolExecutor with cursor-based batching
-    Args:
-        collection: MongoDB collection
-        cursor: MongoDB cursor for document iteration
-        field_name: Field containing text to embed
-        embedding_field: Field to store embeddings
-        openai_client: OpenAI client instance
-        total_docs: Total number of documents to process
-        batch_size: Size of batches for parallel processing
-        callback: Optional callback function for progress updates
-    Returns:
-        Number of documents processed
-    """
     if total_docs == 0:
         return 0
     processed = 0
-    # Initial progress update
     if callback:
         callback(0, 0, total_docs)
-    # Process documents in batches using cursor
-    with ThreadPoolExecutor(max_workers=20) as executor:
         batch = []
         futures = []
-        # Iterate through cursor and build batches
         for doc in cursor:
             batch.append(doc)
-            if len(batch) >= batch_size:
-                # Submit batch for processing
                 future = executor.submit(process_batch, batch.copy(), field_name, embedding_field, openai_client)
                 futures.append(future)
-                batch = []  # Clear batch for next round
-                # Process completed futures to free up memory
-                completed_futures = [f for f in futures if f.done()]
-                for future in completed_futures:
-                    results = future.result()
-                    if results:
-                        # Batch update MongoDB
-                        bulk_ops = [
-                            UpdateOne({"_id": doc_id}, {"$set": {embedding_field: embedding}})
-                            for doc_id, embedding in results
-                        ]
-                        if bulk_ops:
-                            collection.bulk_write(bulk_ops)
-                            processed += len(bulk_ops)
-                        # Update progress
-                        if callback:
-                            progress = (processed / total_docs) * 100
-                            callback(progress, processed, total_docs)
-                futures = [f for f in futures if not f.done()]
-        # Process any remaining documents in the last batch
         if batch:
             future = executor.submit(process_batch, batch, field_name, embedding_field, openai_client)
             futures.append(future)
-        # Wait for remaining futures to complete
-        for future in futures:
-            results = future.result()
-            if results:
-                bulk_ops = [
-                    UpdateOne({"_id": doc_id}, {"$set": {embedding_field: embedding}})
-                    for doc_id, embedding in results
-                ]
-                if bulk_ops:
-                    collection.bulk_write(bulk_ops)
-                    processed += len(bulk_ops)
-                    # Final progress update
-                    if callback:
-                        progress = (processed / total_docs) * 100
-                        callback(progress, processed, total_docs)
     return processed

 from typing import List, Tuple
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pymongo import UpdateOne
 from pymongo.collection import Collection
 import math
+import time
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
def get_embedding(text: str, openai_client, model="text-embedding-ada-002", max_retries=3) -> list[float]:
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines in ``text`` are replaced with spaces before the request is sent.
    A failed attempt is logged as a warning and retried after an exponential
    backoff of ``2 ** attempt`` seconds (1s, 2s, 4s, ...).

    Args:
        text: Text to embed.
        openai_client: Client exposing ``embeddings.create(input=..., model=...)``.
        model: Name of the embedding model to request.
        max_retries: Total number of attempts. Values below 1 are treated as 1
            so the function can never fall through and return ``None``.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    text = text.replace("\n", " ")
    # Always try at least once; range(0) would skip the loop entirely.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            resp = openai_client.embeddings.create(
                input=[text],
                model=model
            )
            return resp.data[0].embedding
        except Exception as e:
            # Out of attempts: let the caller see the original error.
            if attempt == attempts - 1:
                raise
            error_details = f"{type(e).__name__}: {str(e)}"
            if hasattr(e, 'response'):
                # Some client errors carry the HTTP response; include its body when present.
                response_text = getattr(e.response, 'text', 'No response text')
                error_details += f"\nResponse: {response_text}"
            logger.warning(
                "Embedding API error (attempt %d/%d):\n%s",
                attempt + 1, attempts, error_details,
            )
            time.sleep(2 ** attempt)  # Exponential backoff
 def process_batch(docs: List[dict], field_name: str, embedding_field: str, openai_client) -> List[Tuple[str, list]]:
     """Process a batch of documents to generate embeddings"""
+    logger.info(f"Processing batch of {len(docs)} documents")
     results = []
     for doc in docs:
         # Skip if embedding already exists
             results.append((doc["_id"], embedding))
     return results
def process_futures(futures: List, collection: "Collection", embedding_field: str, processed: int, total_docs: int, callback=None, timeout=30) -> int:
    """Write the results of finished embedding futures to MongoDB.

    Each future is expected to resolve to an iterable of
    ``(doc_id, embedding)`` pairs. For every future with a non-empty result,
    one ``bulk_write`` of ``$set`` updates stores the embeddings under
    ``embedding_field`` and the progress callback is invoked.

    Args:
        futures: Futures to wait on, consumed in completion order.
        collection: Collection that receives the bulk updates.
        embedding_field: Document field the embedding is written to.
        processed: Number of documents already processed before this call;
            only used to compute the progress values passed to ``callback``.
        total_docs: Total number of documents, used for the percentage.
            When it is 0 the reported percentage is 0.0 instead of raising
            ``ZeroDivisionError``.
        callback: Optional ``callback(progress_percent, processed_count, total_docs)``.
        timeout: Seconds to wait for all futures, measured from the call to
            ``as_completed``; ``None`` waits indefinitely. Defaults to 30.

    Returns:
        Number of documents written during this call.

    Raises:
        TimeoutError: Raised by ``as_completed`` when not every future has
            finished within ``timeout``. It is raised from the iterator, so
            it is NOT caught by the per-future handler below and the count
            of documents already written in this call is not returned.
    """
    completed = 0
    for future in as_completed(futures, timeout=timeout):
        try:
            results = future.result()
            if not results:
                continue
            bulk_ops = [
                UpdateOne({"_id": doc_id}, {"$set": {embedding_field: embedding}})
                for doc_id, embedding in results
            ]
            if bulk_ops:
                collection.bulk_write(bulk_ops)
                completed += len(bulk_ops)
                # Update progress
                if callback:
                    done = processed + completed
                    progress = (done / total_docs) * 100 if total_docs else 0.0
                    callback(progress, done, total_docs)
        except Exception as e:
            # Best effort: a failed batch is logged and skipped so the
            # remaining futures are still written.
            error_details = f"{type(e).__name__}: {str(e)}"
            if hasattr(e, 'response'):
                response_text = getattr(e.response, 'text', 'No response text')
                error_details += f"\nResponse: {response_text}"
            logger.error("Error processing future:\n%s", error_details)
    return completed
 def parallel_generate_embeddings(
     collection: Collection,
     cursor,
     embedding_field: str,
     openai_client,
     total_docs: int,
+    batch_size: int = 10,  # Reduced initial batch size
     callback=None
 ) -> int:
+    """Generate embeddings in parallel using ThreadPoolExecutor with cursor-based batching and dynamic processing"""
     if total_docs == 0:
         return 0
     processed = 0
+    current_batch_size = batch_size
+    max_workers = 5  # Start with fewer workers
+    logger.info(f"Starting embedding generation for {total_docs} documents")
     if callback:
         callback(0, 0, total_docs)
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
         batch = []
         futures = []
         for doc in cursor:
             batch.append(doc)
+            if len(batch) >= current_batch_size:
+                logger.info(f"Submitting batch of {len(batch)} documents (batch size: {current_batch_size})")
                 future = executor.submit(process_batch, batch.copy(), field_name, embedding_field, openai_client)
                 futures.append(future)
+                batch = []
+                # Process completed futures more frequently
+                if len(futures) >= max_workers:
+                    try:
+                        completed = process_futures(futures, collection, embedding_field, processed, total_docs, callback)
+                        processed += completed
+                        futures = []  # Clear processed futures
+                        # Gradually increase batch size and workers if processing is successful
+                        if completed > 0:
+                            current_batch_size = min(current_batch_size + 5, 30)
+                            max_workers = min(max_workers + 2, 20)
+                            logger.info(f"Increased batch size to {current_batch_size}, workers to {max_workers}")
+                    except Exception as e:
+                        logger.error(f"Error processing futures: {str(e)}")
+                        # Reduce batch size and workers on error
+                        current_batch_size = max(5, current_batch_size - 5)
+                        max_workers = max(3, max_workers - 2)
+                        logger.info(f"Reduced batch size to {current_batch_size}, workers to {max_workers}")
+        # Process remaining batch
         if batch:
+            logger.info(f"Processing final batch of {len(batch)} documents")
             future = executor.submit(process_batch, batch, field_name, embedding_field, openai_client)
             futures.append(future)
+        # Process remaining futures
+        if futures:
+            try:
+                completed = process_futures(futures, collection, embedding_field, processed, total_docs, callback)
+                processed += completed
+            except Exception as e:
+                logger.error(f"Error processing final futures: {str(e)}")
+    logger.info(f"Completed embedding generation. Processed {processed}/{total_docs} documents")
     return processed

run.sh ADDED Viewed

	@@ -0,0 +1 @@


1	+ python app.py