Spaces:

airabbitX
/

mongo-vector-search-util

Running

App Files Files Community

airabbitX committed on Jan 28

Commit

46a6768

verified ·

1 Parent(s): 5df473f

Upload 19 files

Browse files

Files changed (13) hide show

app.py +50 -309
ui/__pycache__/embeddings_tab.cpython-312.pyc +0 -0
ui/__pycache__/search_tab.cpython-312.pyc +0 -0
ui/embeddings_tab.py +192 -0
ui/search_tab.py +142 -0
utils/__init__.py +12 -0
utils/__pycache__/__init__.cpython-312.pyc +0 -0
utils/__pycache__/credentials.cpython-312.pyc +0 -0
utils/__pycache__/db_utils.cpython-312.pyc +0 -0
utils/__pycache__/embedding_utils.cpython-312.pyc +0 -0
utils/credentials.py +47 -0
utils/db_utils.py +159 -0
utils/embedding_utils.py +143 -0

app.py CHANGED Viewed

@@ -1,319 +1,60 @@
-import os
 import gradio as gr
-from openai import OpenAI
-import json
-from dotenv import load_dotenv
-from db_utils import DatabaseUtils
-from embedding_utils import parallel_generate_embeddings, get_embedding
-# Load environment variables from .env file
-load_dotenv()
-def check_credentials() -> tuple[bool, str]:
-    """Check if required credentials are set and valid"""
-    atlas_uri = os.getenv("ATLAS_URI")
-    openai_key = os.getenv("OPENAI_API_KEY")
-    if not atlas_uri:
-        return False, """Please set up your MongoDB Atlas credentials:
-1. Go to Settings tab
-2. Add ATLAS_URI as a Repository Secret
-3. Paste your MongoDB connection string (should start with 'mongodb+srv://')"""
-    if not openai_key:
-        return False, """Please set up your OpenAI API key:
-1. Go to Settings tab
-2. Add OPENAI_API_KEY as a Repository Secret
-3. Paste your OpenAI API key"""
-    return True, ""
-def init_clients():
-    """Initialize OpenAI and MongoDB clients"""
-    try:
-        openai_client = OpenAI()
-        db_utils = DatabaseUtils()
-        return openai_client, db_utils
-    except Exception as e:
-        return None, None
-def get_field_names(db_name: str, collection_name: str) -> list[str]:
-    """Get list of fields in the collection"""
-    return db_utils.get_field_names(db_name, collection_name)
-def generate_embeddings_for_field(db_name: str, collection_name: str, field_name: str, embedding_field: str, limit: int = 10, progress=gr.Progress()) -> tuple[str, str]:
-    """Generate embeddings for documents in parallel with progress tracking"""
-    try:
-        db = db_utils.client[db_name]
-        collection = db[collection_name]
-        # Count documents that need embeddings
-        total_docs = collection.count_documents({field_name: {"$exists": True}})
-        if total_docs == 0:
-            return f"No documents found with field '{field_name}'", ""
-        # Get total count of documents that need processing
-        query = {
-            field_name: {"$exists": True},
-            embedding_field: {"$exists": False}  # Only get docs without embeddings
-        }
-        total_to_process = collection.count_documents(query)
-        if total_to_process == 0:
-            return "No documents found that need embeddings", ""
-        # Apply limit if specified
-        if limit > 0:
-            total_to_process = min(total_to_process, limit)
-        print(f"\nFound {total_to_process} documents that need embeddings...")
-        # Progress tracking
-        progress_text = ""
-        def update_progress(prog: float, processed: int, total: int):
-            nonlocal progress_text
-            progress_text = f"Progress: {prog:.1f}% ({processed}/{total} documents)\n"
-            print(progress_text)  # Terminal logging
-            progress(prog/100, f"Processed {processed}/{total} documents")
-        # Show initial progress
-        update_progress(0, 0, total_to_process)
-        # Create cursor for batch processing
-        cursor = collection.find(query)
-        if limit > 0:
-            cursor = cursor.limit(limit)
-        # Generate embeddings in parallel with cursor-based batching
-        processed = parallel_generate_embeddings(
-            collection=collection,
-            cursor=cursor,
-            field_name=field_name,
-            embedding_field=embedding_field,
-            openai_client=openai_client,
-            total_docs=total_to_process,
-            callback=update_progress
-        )
-        # Return completion message and final progress
-        instructions = f"""
-Successfully generated embeddings for {processed} documents using parallel processing!
-To create the vector search index in MongoDB Atlas:
-1. Go to your Atlas cluster
-2. Click on 'Search' tab
-3. Create an index named 'vector_index' with this configuration:
-{{
-  "fields": [
-    {{
-      "type": "vector",
-      "path": "{embedding_field}",
-      "numDimensions": 1536,
-      "similarity": "dotProduct"
-    }}
-  ]
-}}
-You can now use the search tab with:
-- Field to search: {field_name}
-- Embedding field: {embedding_field}
-"""
-        return instructions, progress_text
-    except Exception as e:
-        return f"Error: {str(e)}", ""
-def vector_search(query_text: str, db_name: str, collection_name: str, embedding_field: str, index_name: str) -> str:
-    """Perform vector search using embeddings"""
-    try:
-        print(f"\nProcessing query: {query_text}")
-        db = db_utils.client[db_name]
-        collection = db[collection_name]
-        # Get embeddings for query
-        embedding = get_embedding(query_text, openai_client)
-        print("Generated embeddings successfully")
-        results = collection.aggregate([
-            {
-                '$vectorSearch': {
-                    "index": index_name,
-                    "path": embedding_field,
-                    "queryVector": embedding,
-                    "numCandidates": 50,
-                    "limit": 5
-                }
-            },
-            {
-                "$project": {
-                    "search_score": { "$meta": "vectorSearchScore" },
-                    "document": "$$ROOT"
-                }
-            }
-        ])
-        # Format results
-        results_list = list(results)
-        formatted_results = []
-        for idx, result in enumerate(results_list, 1):
-            doc = result['document']
-            formatted_result = f"{idx}. Score: {result['search_score']:.4f}\n"
-            # Add all fields except _id and embeddings
-            for key, value in doc.items():
-                if key not in ['_id', embedding_field]:
-                    formatted_result += f"{key}: {value}\n"
-            formatted_results.append(formatted_result)
-        return "\n".join(formatted_results) if formatted_results else "No results found"
-    except Exception as e:
-        return f"Error: {str(e)}"
-# Create Gradio interface with tabs
-with gr.Blocks(title="MongoDB Vector Search Tool") as iface:
-    gr.Markdown("# MongoDB Vector Search Tool")
-    # Check credentials first
-    has_creds, cred_message = check_credentials()
-    if not has_creds:
-        gr.Markdown(f"""
-        ## ⚠️ Setup Required
-        {cred_message}
-        After setting up credentials, refresh this page.
-        """)
-    else:
-        # Initialize clients
-        openai_client, db_utils = init_clients()
-        if not openai_client or not db_utils:
-            gr.Markdown("""
-            ## ⚠️ Connection Error
-            Failed to connect to MongoDB Atlas or OpenAI. Please check your credentials and try again.
             """)
         else:
-            # Get available databases
-            try:
-                databases = db_utils.get_databases()
-    with gr.Tab("Generate Embeddings"):
-        with gr.Row():
-            db_input = gr.Dropdown(
-                choices=databases,
-                label="Select Database",
-                info="Available databases in Atlas cluster"
-            )
-            collection_input = gr.Dropdown(
-                choices=[],
-                label="Select Collection",
-                info="Collections in selected database"
-            )
-        with gr.Row():
-            field_input = gr.Dropdown(
-                choices=[],
-                label="Select Field for Embeddings",
-                info="Fields available in collection"
-            )
-            embedding_field_input = gr.Textbox(
-                label="Embedding Field Name",
-                value="embedding",
-                info="Field name where embeddings will be stored"
-            )
-            limit_input = gr.Number(
-                label="Document Limit",
-                value=10,
-                minimum=0,
-                info="Number of documents to process (0 for all documents)"
-            )
-        def update_collections(db_name):
-            collections = db_utils.get_collections(db_name)
-            # If there's only one collection, select it by default
-            value = collections[0] if len(collections) == 1 else None
-            return gr.Dropdown(choices=collections, value=value)
-        def update_fields(db_name, collection_name):
-            if db_name and collection_name:
-                fields = get_field_names(db_name, collection_name)
-                return gr.Dropdown(choices=fields)
-            return gr.Dropdown(choices=[])
-        # Update collections when database changes
-        db_input.change(
-            fn=update_collections,
-            inputs=[db_input],
-            outputs=[collection_input]
-        )
-        # Update fields when collection changes
-        collection_input.change(
-            fn=update_fields,
-            inputs=[db_input, collection_input],
-            outputs=[field_input]
-        )
-        generate_btn = gr.Button("Generate Embeddings")
-        generate_output = gr.Textbox(label="Results", lines=10)
-        progress_output = gr.Textbox(label="Progress", lines=3)
-        generate_btn.click(
-            generate_embeddings_for_field,
-            inputs=[db_input, collection_input, field_input, embedding_field_input, limit_input],
-            outputs=[generate_output, progress_output]
-        )
-    with gr.Tab("Search"):
-        with gr.Row():
-            search_db_input = gr.Dropdown(
-                choices=databases,
-                label="Select Database",
-                info="Database containing the vectors"
-            )
-            search_collection_input = gr.Dropdown(
-                choices=[],
-                label="Select Collection",
-                info="Collection containing the vectors"
-            )
-        with gr.Row():
-            search_embedding_field_input = gr.Textbox(
-                label="Embedding Field Name",
-                value="embedding",
-                info="Field containing the vectors"
-            )
-            search_index_input = gr.Textbox(
-                label="Vector Search Index Name",
-                value="vector_index",
-                info="Index created in Atlas UI"
-            )
-        # Update collections when database changes
-        search_db_input.change(
-            fn=update_collections,
-            inputs=[search_db_input],
-            outputs=[search_collection_input]
-        )
-        query_input = gr.Textbox(
-            label="Search Query",
-            lines=2,
-            placeholder="What would you like to search for?"
-        )
-        search_btn = gr.Button("Search")
-        search_output = gr.Textbox(label="Results", lines=10)
-        search_btn.click(
-            vector_search,
-            inputs=[
-                query_input,
-                search_db_input,
-                search_collection_input,
-                search_embedding_field_input,
-                search_index_input
-            ],
-            outputs=search_output
-        )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+from utils.credentials import check_credentials, init_clients
+from ui.embeddings_tab import create_embeddings_tab
+from ui.search_tab import create_search_tab
+def create_app():
+    """Create and configure the Gradio application"""
+    with gr.Blocks(title="MongoDB Vector Search Tool") as iface:
+        gr.Markdown("# MongoDB Vector Search Tool")
+        # Check credentials first
+        has_creds, cred_message = check_credentials()
+        if not has_creds:
+            gr.Markdown(f"""
+            ## ⚠️ Setup Required
+            {cred_message}
+            After setting up credentials, refresh this page.
             """)
         else:
+            # Initialize clients
+            openai_client, db_utils = init_clients()
+            if not openai_client or not db_utils:
+                gr.Markdown("""
+                ## ⚠️ Connection Error
+                Failed to connect to MongoDB Atlas or OpenAI. Please check your credentials and try again.
+                """)
+            else:
+                # Get available databases
+                try:
+                    databases = db_utils.get_databases()
+                except Exception as e:
+                    gr.Markdown(f"""
+                    ## ⚠️ Database Error
+                    Failed to list databases: {str(e)}
+                    Please check your MongoDB Atlas connection and try again.
+                    """)
+                    databases = []
+                # Create tabs
+                embeddings_tab, embeddings_interface = create_embeddings_tab(
+                    openai_client=openai_client,
+                    db_utils=db_utils,
+                    databases=databases
+                )
+                search_tab, search_interface = create_search_tab(
+                    openai_client=openai_client,
+                    db_utils=db_utils,
+                    databases=databases
+                )
+    return iface
 if __name__ == "__main__":
+    app = create_app()
+    app.launch(server_name="0.0.0.0")

ui/__pycache__/embeddings_tab.cpython-312.pyc ADDED Viewed

Binary file (6.98 kB). View file

ui/__pycache__/search_tab.cpython-312.pyc ADDED Viewed

Binary file (5.06 kB). View file

ui/embeddings_tab.py ADDED Viewed

	@@ -0,0 +1,192 @@

+import gradio as gr
+from typing import Tuple, Optional, List
+from openai import OpenAI
+from utils.db_utils import DatabaseUtils
+from utils.embedding_utils import parallel_generate_embeddings
+def create_embeddings_tab(openai_client: OpenAI, db_utils: DatabaseUtils, databases: List[str]) -> Tuple[gr.Tab, dict]:
+    """Create the embeddings generation tab UI
+    Args:
+        openai_client: OpenAI client instance
+        db_utils: DatabaseUtils instance
+        databases: List of available databases
+    Returns:
+        Tuple[gr.Tab, dict]: The tab component and its interface elements
+    """
+    def update_collections(db_name: str) -> gr.Dropdown:
+        """Update collections dropdown when database changes"""
+        collections = db_utils.get_collections(db_name)
+        # If there's only one collection, select it by default
+        value = collections[0] if len(collections) == 1 else None
+        return gr.Dropdown(choices=collections, value=value)
+    def update_fields(db_name: str, collection_name: str) -> gr.Dropdown:
+        """Update fields dropdown when collection changes"""
+        if db_name and collection_name:
+            fields = db_utils.get_field_names(db_name, collection_name)
+            return gr.Dropdown(choices=fields)
+        return gr.Dropdown(choices=[])
+    def generate_embeddings(
+        db_name: str,
+        collection_name: str,
+        field_name: str,
+        embedding_field: str,
+        limit: int = 10,
+        progress=gr.Progress()
+    ) -> Tuple[str, str]:
+        """Generate embeddings for documents with progress tracking"""
+        try:
+            db = db_utils.client[db_name]
+            collection = db[collection_name]
+            # Count documents that need embeddings
+            total_docs = collection.count_documents({field_name: {"$exists": True}})
+            if total_docs == 0:
+                return f"No documents found with field '{field_name}'", ""
+            # Get total count of documents that need processing
+            query = {
+                field_name: {"$exists": True},
+                embedding_field: {"$exists": False}  # Only get docs without embeddings
+            }
+            total_to_process = collection.count_documents(query)
+            if total_to_process == 0:
+                return "No documents found that need embeddings", ""
+            # Apply limit if specified
+            if limit > 0:
+                total_to_process = min(total_to_process, limit)
+            print(f"\nFound {total_to_process} documents that need embeddings...")
+            # Progress tracking
+            progress_text = ""
+            def update_progress(prog: float, processed: int, total: int):
+                nonlocal progress_text
+                progress_text = f"Progress: {prog:.1f}% ({processed}/{total} documents)\n"
+                print(progress_text)  # Terminal logging
+                progress(prog/100, f"Processed {processed}/{total} documents")
+            # Show initial progress
+            update_progress(0, 0, total_to_process)
+            # Create cursor for batch processing
+            cursor = collection.find(query)
+            if limit > 0:
+                cursor = cursor.limit(limit)
+            # Generate embeddings in parallel with cursor-based batching
+            processed = parallel_generate_embeddings(
+                collection=collection,
+                cursor=cursor,
+                field_name=field_name,
+                embedding_field=embedding_field,
+                openai_client=openai_client,
+                total_docs=total_to_process,
+                callback=update_progress
+            )
+            # Return completion message and final progress
+            instructions = f"""
+Successfully generated embeddings for {processed} documents using parallel processing!
+To create the vector search index in MongoDB Atlas:
+1. Go to your Atlas cluster
+2. Click on 'Search' tab
+3. Create an index named 'vector_index' with this configuration:
+{{
+  "fields": [
+    {{
+      "type": "vector",
+      "path": "{embedding_field}",
+      "numDimensions": 1536,
+      "similarity": "dotProduct"
+    }}
+  ]
+}}
+You can now use the search tab with:
+- Field to search: {field_name}
+- Embedding field: {embedding_field}
+"""
+            return instructions, progress_text
+        except Exception as e:
+            return f"Error: {str(e)}", ""
+    # Create the tab UI
+    with gr.Tab("Generate Embeddings") as tab:
+        with gr.Row():
+            db_input = gr.Dropdown(
+                choices=databases,
+                label="Select Database",
+                info="Available databases in Atlas cluster"
+            )
+            collection_input = gr.Dropdown(
+                choices=[],
+                label="Select Collection",
+                info="Collections in selected database"
+            )
+        with gr.Row():
+            field_input = gr.Dropdown(
+                choices=[],
+                label="Select Field for Embeddings",
+                info="Fields available in collection"
+            )
+            embedding_field_input = gr.Textbox(
+                label="Embedding Field Name",
+                value="embedding",
+                info="Field name where embeddings will be stored"
+            )
+            limit_input = gr.Number(
+                label="Document Limit",
+                value=10,
+                minimum=0,
+                info="Number of documents to process (0 for all documents)"
+            )
+        generate_btn = gr.Button("Generate Embeddings")
+        generate_output = gr.Textbox(label="Results", lines=10)
+        progress_output = gr.Textbox(label="Progress", lines=3)
+        # Set up event handlers
+        db_input.change(
+            fn=update_collections,
+            inputs=[db_input],
+            outputs=[collection_input]
+        )
+        collection_input.change(
+            fn=update_fields,
+            inputs=[db_input, collection_input],
+            outputs=[field_input]
+        )
+        generate_btn.click(
+            fn=generate_embeddings,
+            inputs=[
+                db_input,
+                collection_input,
+                field_input,
+                embedding_field_input,
+                limit_input
+            ],
+            outputs=[generate_output, progress_output]
+        )
+    # Return the tab and its interface elements
+    interface = {
+        'db_input': db_input,
+        'collection_input': collection_input,
+        'field_input': field_input,
+        'embedding_field_input': embedding_field_input,
+        'limit_input': limit_input,
+        'generate_btn': generate_btn,
+        'generate_output': generate_output,
+        'progress_output': progress_output
+    }
+    return tab, interface

ui/search_tab.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import gradio as gr
+from typing import Tuple, List
+from openai import OpenAI
+from utils.db_utils import DatabaseUtils
+from utils.embedding_utils import get_embedding
+def create_search_tab(openai_client: OpenAI, db_utils: DatabaseUtils, databases: List[str]) -> Tuple[gr.Tab, dict]:
+    """Create the vector search tab UI
+    Args:
+        openai_client: OpenAI client instance
+        db_utils: DatabaseUtils instance
+        databases: List of available databases
+    Returns:
+        Tuple[gr.Tab, dict]: The tab component and its interface elements
+    """
+    def update_collections(db_name: str) -> gr.Dropdown:
+        """Update collections dropdown when database changes"""
+        collections = db_utils.get_collections(db_name)
+        # If there's only one collection, select it by default
+        value = collections[0] if len(collections) == 1 else None
+        return gr.Dropdown(choices=collections, value=value)
+    def vector_search(
+        query_text: str,
+        db_name: str,
+        collection_name: str,
+        embedding_field: str,
+        index_name: str
+    ) -> str:
+        """Perform vector search using embeddings"""
+        try:
+            print(f"\nProcessing query: {query_text}")
+            db = db_utils.client[db_name]
+            collection = db[collection_name]
+            # Get embeddings for query
+            embedding = get_embedding(query_text, openai_client)
+            print("Generated embeddings successfully")
+            results = collection.aggregate([
+                {
+                    '$vectorSearch': {
+                        "index": index_name,
+                        "path": embedding_field,
+                        "queryVector": embedding,
+                        "numCandidates": 50,
+                        "limit": 5
+                    }
+                },
+                {
+                    "$project": {
+                        "search_score": { "$meta": "vectorSearchScore" },
+                        "document": "$$ROOT"
+                    }
+                }
+            ])
+            # Format results
+            results_list = list(results)
+            formatted_results = []
+            for idx, result in enumerate(results_list, 1):
+                doc = result['document']
+                formatted_result = f"{idx}. Score: {result['search_score']:.4f}\n"
+                # Add all fields except _id and embeddings
+                for key, value in doc.items():
+                    if key not in ['_id', embedding_field]:
+                        formatted_result += f"{key}: {value}\n"
+                formatted_results.append(formatted_result)
+            return "\n".join(formatted_results) if formatted_results else "No results found"
+        except Exception as e:
+            return f"Error: {str(e)}"
+    # Create the tab UI
+    with gr.Tab("Search") as tab:
+        with gr.Row():
+            db_input = gr.Dropdown(
+                choices=databases,
+                label="Select Database",
+                info="Database containing the vectors"
+            )
+            collection_input = gr.Dropdown(
+                choices=[],
+                label="Select Collection",
+                info="Collection containing the vectors"
+            )
+        with gr.Row():
+            embedding_field_input = gr.Textbox(
+                label="Embedding Field Name",
+                value="embedding",
+                info="Field containing the vectors"
+            )
+            index_input = gr.Textbox(
+                label="Vector Search Index Name",
+                value="vector_index",
+                info="Index created in Atlas UI"
+            )
+        query_input = gr.Textbox(
+            label="Search Query",
+            lines=2,
+            placeholder="What would you like to search for?"
+        )
+        search_btn = gr.Button("Search")
+        search_output = gr.Textbox(label="Results", lines=10)
+        # Set up event handlers
+        db_input.change(
+            fn=update_collections,
+            inputs=[db_input],
+            outputs=[collection_input]
+        )
+        search_btn.click(
+            fn=vector_search,
+            inputs=[
+                query_input,
+                db_input,
+                collection_input,
+                embedding_field_input,
+                index_input
+            ],
+            outputs=search_output
+        )
+    # Return the tab and its interface elements
+    interface = {
+        'db_input': db_input,
+        'collection_input': collection_input,
+        'embedding_field_input': embedding_field_input,
+        'index_input': index_input,
+        'query_input': query_input,
+        'search_btn': search_btn,
+        'search_output': search_output
+    }
+    return tab, interface

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# Utils package for MongoDB Vector Search Tool
+from utils.credentials import check_credentials, init_clients
+from utils.db_utils import DatabaseUtils
+from utils.embedding_utils import get_embedding, parallel_generate_embeddings
+__all__ = [
+    'check_credentials',
+    'init_clients',
+    'DatabaseUtils',
+    'get_embedding',
+    'parallel_generate_embeddings'
+]

utils/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (427 Bytes). View file

utils/__pycache__/credentials.cpython-312.pyc ADDED Viewed

Binary file (1.79 kB). View file

utils/__pycache__/db_utils.cpython-312.pyc ADDED Viewed

Binary file (7.62 kB). View file

utils/__pycache__/embedding_utils.cpython-312.pyc ADDED Viewed

Binary file (7.2 kB). View file

utils/credentials.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import os
+from typing import Tuple
+from dotenv import load_dotenv
+from openai import OpenAI
+from utils.db_utils import DatabaseUtils
+def check_credentials() -> Tuple[bool, str]:
+    """Check if required credentials are set and valid
+    Returns:
+        Tuple[bool, str]: (is_valid, message)
+        - is_valid: True if all credentials are valid
+        - message: Error message if credentials are invalid
+    """
+    # Load environment variables
+    load_dotenv()
+    atlas_uri = os.getenv("ATLAS_URI")
+    openai_key = os.getenv("OPENAI_API_KEY")
+    if not atlas_uri:
+        return False, """Please set up your MongoDB Atlas credentials:
+1. Go to Settings tab
+2. Add ATLAS_URI as a Repository Secret
+3. Paste your MongoDB connection string (should start with 'mongodb+srv://')"""
+    if not openai_key:
+        return False, """Please set up your OpenAI API key:
+1. Go to Settings tab
+2. Add OPENAI_API_KEY as a Repository Secret
+3. Paste your OpenAI API key"""
+    return True, ""
+def init_clients():
+    """Initialize OpenAI and MongoDB clients
+    Returns:
+        Tuple[OpenAI, DatabaseUtils]: OpenAI client and DatabaseUtils instance
+        or (None, None) if initialization fails
+    """
+    try:
+        openai_client = OpenAI()
+        db_utils = DatabaseUtils()
+        return openai_client, db_utils
+    except Exception as e:
+        return None, None

utils/db_utils.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import os
+from typing import List, Dict, Any, Optional
+from pymongo import MongoClient
+from pymongo.errors import (
+    ConnectionFailure,
+    OperationFailure,
+    ServerSelectionTimeoutError,
+    InvalidName
+)
+from dotenv import load_dotenv
+# Exception hierarchy for this module: both specific errors derive from
+# DatabaseError, so callers can catch that single base class.
+class DatabaseError(Exception):
+    """Base class for database operation errors"""
+    pass
+# NOTE: inside this module this name shadows Python's builtin ConnectionError.
+class ConnectionError(DatabaseError):
+    """Error when connecting to MongoDB Atlas"""
+    pass
+class OperationError(DatabaseError):
+    """Error during database operations"""
+    pass
+class DatabaseUtils:
+    """Utility class for MongoDB Atlas database operations
+    This class provides methods to interact with MongoDB Atlas databases and collections,
+    including listing databases, collections, and retrieving collection information.
+    Attributes:
+        atlas_uri (str): MongoDB Atlas connection string
+        client (MongoClient): MongoDB client instance
+    """
+    def __init__(self):
+        """Initialize DatabaseUtils with MongoDB Atlas connection
+        Raises:
+            ConnectionError: If unable to connect to MongoDB Atlas
+            ValueError: If ATLAS_URI environment variable is not set
+        """
+        # Load environment variables
+        load_dotenv()
+        self.atlas_uri = os.getenv("ATLAS_URI")
+        if not self.atlas_uri:
+            raise ValueError("ATLAS_URI environment variable is not set")
+        try:
+            self.client = MongoClient(self.atlas_uri)
+            # Test connection
+            self.client.admin.command('ping')
+        except (ConnectionFailure, ServerSelectionTimeoutError) as e:
+            raise ConnectionError(f"Failed to connect to MongoDB Atlas: {str(e)}")
+    def get_databases(self) -> List[str]:
+        """Get list of all databases in Atlas cluster
+        Returns:
+            List[str]: List of database names
+        Raises:
+            OperationError: If unable to list databases
+        """
+        try:
+            return self.client.list_database_names()
+        except OperationFailure as e:
+            raise OperationError(f"Failed to list databases: {str(e)}")
+    def get_collections(self, db_name: str) -> List[str]:
+        """Get list of collections in a database
+        Args:
+            db_name (str): Name of the database
+        Returns:
+            List[str]: List of collection names
+        Raises:
+            OperationError: If unable to list collections
+            ValueError: If db_name is empty or invalid
+        """
+        if not db_name or not isinstance(db_name, str):
+            raise ValueError("Database name must be a non-empty string")
+        try:
+            db = self.client[db_name]
+            return db.list_collection_names()
+        except (OperationFailure, InvalidName) as e:
+            raise OperationError(f"Failed to list collections for database '{db_name}': {str(e)}")
+    def get_collection_info(self, db_name: str, collection_name: str) -> Dict[str, Any]:
+        """Return the document count and one sample document for a collection
+        Args:
+            db_name (str): Database that holds the collection
+            collection_name (str): Collection to inspect
+        Returns:
+            Dict[str, Any]: Mapping with two keys:
+                - count: result of count_documents({}) on the collection
+                - sample: result of find_one() (None when nothing is returned)
+        Raises:
+            ValueError: If db_name or collection_name is empty or not a string
+            OperationError: If the lookup fails or a name is rejected
+        """
+        if not (db_name and isinstance(db_name, str)):
+            raise ValueError("Database name must be a non-empty string")
+        if not (collection_name and isinstance(collection_name, str)):
+            raise ValueError("Collection name must be a non-empty string")
+        try:
+            target = self.client[db_name][collection_name]
+            # Count first, then fetch the sample, matching the order of the result keys
+            total = target.count_documents({})
+            example = target.find_one()
+            return {'count': total, 'sample': example}
+        except (OperationFailure, InvalidName) as exc:
+            raise OperationError(
+                f"Failed to get info for collection '{collection_name}' "
+                f"in database '{db_name}': {exc}"
+            )
+    def get_field_names(self, db_name: str, collection_name: str) -> List[str]:
+        """List the fields of a collection, inferred from its sample document
+        Args:
+            db_name (str): Database that holds the collection
+            collection_name (str): Collection to inspect
+        Returns:
+            List[str]: Keys of the sample document, without '_id' and without
+                any key ending in '_embedding'; empty when there is no sample
+        Raises:
+            OperationError: If the collection information cannot be read
+            ValueError: If db_name or collection_name is empty or invalid
+        """
+        try:
+            sample = self.get_collection_info(db_name, collection_name).get('sample', {})
+            if not sample:
+                return []
+            fields = []
+            for field in sample.keys():
+                # Drop the primary key and any previously generated embedding fields
+                if field == '_id' or field.endswith('_embedding'):
+                    continue
+                fields.append(field)
+            return fields
+        except DatabaseError as exc:
+            raise OperationError(
+                f"Failed to get field names for collection '{collection_name}' "
+                f"in database '{db_name}': {exc}"
+            )
+    def close(self):
+        """Close the MongoDB client, if this instance has one"""
+        # The attribute may be missing, so check before closing
+        if not hasattr(self, 'client'):
+            return
+        self.client.close()

utils/embedding_utils.py ADDED Viewed

	@@ -0,0 +1,143 @@

+from typing import List, Tuple
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pymongo import UpdateOne
+from pymongo.collection import Collection
+import math
+import time
+import logging
+# Configure logging
+# NOTE: basicConfig runs as an import-time side effect and configures the root
+# logger at INFO level; it does nothing if the root logger already has handlers.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used by every function in this file
+logger = logging.getLogger(__name__)
+def get_embedding(text: str, openai_client, model="text-embedding-ada-002", max_retries=3) -> list[float]:
+    """Get the embedding vector for a text using the OpenAI API, retrying on failure
+    Args:
+        text (str): Text to embed; newlines are replaced by spaces before sending
+        openai_client: Client exposing ``embeddings.create(input=..., model=...)``
+        model (str): Embedding model name passed to the API
+        max_retries (int): Maximum number of attempts; values below 1 are
+            treated as a single attempt
+    Returns:
+        list[float]: The ``embedding`` of the first item in the API response
+    Raises:
+        Exception: Whatever the final attempt raised, re-raised unchanged
+    """
+    text = text.replace("\n", " ")
+    # Always try at least once: with a non-positive max_retries the loop would be
+    # skipped and the function would silently return None instead of a vector.
+    attempts = max(1, max_retries)
+    for attempt in range(attempts):
+        try:
+            resp = openai_client.embeddings.create(
+                input=[text],
+                model=model
+            )
+            return resp.data[0].embedding
+        except Exception as e:
+            # Out of attempts: let the caller see the original error
+            if attempt == attempts - 1:
+                raise
+            error_details = f"{type(e).__name__}: {e}"
+            if hasattr(e, 'response'):
+                error_details += f"\nResponse: {getattr(e.response, 'text', 'No response text')}"
+            logger.warning("Embedding API error (attempt %d/%d):\n%s", attempt + 1, attempts, error_details)
+            time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
+def process_batch(docs: List[dict], field_name: str, embedding_field: str, openai_client) -> List[Tuple[str, list]]:
+    """Generate embeddings for the documents of one batch
+    Args:
+        docs (List[dict]): Documents to process; each must carry an ``_id``
+        field_name (str): Field whose text is embedded
+        embedding_field (str): Field that marks a document as already embedded
+        openai_client: Client forwarded to ``get_embedding``
+    Returns:
+        List[Tuple[str, list]]: ``(_id, embedding)`` pairs for every document
+            that was embedded; documents that already have ``embedding_field``,
+            lack ``field_name``, or hold a non-string value there are skipped
+    """
+    logger.info(f"Processing batch of {len(docs)} documents")
+    results = []
+    for doc in docs:
+        # Skip if embedding already exists
+        if embedding_field in doc:
+            continue
+        # Use .get so a document without the field is skipped instead of raising
+        # KeyError, which would abort the batch and discard embeddings already computed
+        text = doc.get(field_name)
+        if isinstance(text, str):
+            embedding = get_embedding(text, openai_client)
+            results.append((doc["_id"], embedding))
+    return results
+def process_futures(futures: List, collection: Collection, embedding_field: str, processed: int, total_docs: int, callback=None) -> int:
+    """Write the results of finished futures to the collection and report progress
+    Args:
+        futures (List): Futures whose results are lists of ``(_id, embedding)`` pairs
+        collection (Collection): Collection receiving the bulk updates
+        embedding_field (str): Field that is set to the embedding on each document
+        processed (int): Documents already written before this call (progress base)
+        total_docs (int): Total number of documents, used for the percentage
+        callback: Optional ``callback(percent, done, total)`` invoked after each write
+    Returns:
+        int: Number of update operations written during this call
+    """
+    written = 0
+    # The 30 second limit covers the whole iteration; when it expires the timeout
+    # error leaves this function, because it is raised outside the try block
+    for finished in as_completed(futures, timeout=30):
+        try:
+            pairs = finished.result()
+            if not pairs:
+                continue
+            operations = []
+            for doc_id, vector in pairs:
+                operations.append(UpdateOne({"_id": doc_id}, {"$set": {embedding_field: vector}}))
+            if not operations:
+                continue
+            collection.bulk_write(operations)
+            written += len(operations)
+            # Update progress
+            if callback:
+                so_far = processed + written
+                callback((so_far / total_docs) * 100, so_far, total_docs)
+        except Exception as exc:
+            # A failing batch is logged and skipped so the remaining futures are still handled
+            error_details = f"{type(exc).__name__}: {exc}"
+            if hasattr(exc, 'response'):
+                error_details += f"\nResponse: {getattr(exc.response, 'text', 'No response text')}"
+            logger.error(f"Error processing future:\n{error_details}")
+    return written
+def parallel_generate_embeddings(
+    collection: Collection,
+    cursor,
+    field_name: str,
+    embedding_field: str,
+    openai_client,
+    total_docs: int,
+    batch_size: int = 10,  # Reduced initial batch size
+    callback=None
+) -> int:
+    """Generate embeddings for the documents of a cursor using a thread pool
+    Documents are grouped into batches, each batch is embedded by ``process_batch``
+    on a worker thread, and finished batches are written back by ``process_futures``.
+    The batch size and the number of batches allowed in flight grow after a clean
+    flush and shrink after a slow or failed one.
+    Args:
+        collection (Collection): Collection receiving the embeddings
+        cursor: Iterable of documents to process
+        field_name (str): Field whose text is embedded
+        embedding_field (str): Field the embedding is stored in
+        openai_client: Client forwarded to ``process_batch``
+        total_docs (int): Expected number of documents, used for progress reporting
+        batch_size (int): Initial number of documents per batch
+        callback: Optional ``callback(percent, done, total)`` progress hook
+    Returns:
+        int: Number of documents reported as written by ``process_futures``
+    """
+    # Function-scope import: the module itself only imports ThreadPoolExecutor and as_completed
+    from concurrent.futures import wait
+    if total_docs == 0:
+        return 0
+    processed = 0
+    current_batch_size = batch_size
+    pool_size = 10  # Start with fewer workers
+    # The pool size is fixed once the executor exists; this limit only decides how
+    # many submitted batches may accumulate before their results are written back.
+    max_pending = pool_size
+    logger.info(f"Starting embedding generation for {total_docs} documents")
+    if callback:
+        callback(0, 0, total_docs)
+    with ThreadPoolExecutor(max_workers=pool_size) as executor:
+        batch = []
+        futures = []
+        for doc in cursor:
+            batch.append(doc)
+            if len(batch) < current_batch_size:
+                continue
+            logger.info(f"Submitting batch of {len(batch)} documents (batch size: {current_batch_size})")
+            # The list is handed over and rebound below, so no copy is needed
+            futures.append(executor.submit(process_batch, batch, field_name, embedding_field, openai_client))
+            batch = []
+            if len(futures) < max_pending:
+                continue
+            # Split finished from unfinished futures before writing. Finished ones are
+            # removed from the pending list no matter what happens next, so they can
+            # never be written or counted twice; unfinished ones stay pending.
+            done, not_done = wait(futures, timeout=30)  # 30 second timeout
+            futures = list(not_done)
+            completed = 0
+            failed = False
+            if done:
+                try:
+                    completed = process_futures(list(done), collection, embedding_field, processed, total_docs, callback)
+                except Exception as e:
+                    logger.error(f"Error processing futures: {str(e)}")
+                    failed = True
+            processed += completed
+            if futures:
+                logger.warning(f"{len(futures)} batches still running after 30 seconds; keeping them pending")
+            if failed or futures:
+                # Reduce batch size and workers on error
+                current_batch_size = max(5, current_batch_size - 5)
+                max_pending = max(5, max_pending - 2)
+                logger.info(f"Reduced batch size to {current_batch_size}, workers to {max_pending}")
+            elif completed > 0:
+                # Gradually increase batch size and workers if processing is successful
+                current_batch_size = min(current_batch_size + 5, 30)
+                max_pending = min(max_pending + 2, 20)
+                logger.info(f"Increased batch size to {current_batch_size}, workers to {max_pending}")
+        # Process remaining batch
+        if batch:
+            logger.info(f"Processing final batch of {len(batch)} documents")
+            futures.append(executor.submit(process_batch, batch, field_name, embedding_field, openai_client))
+        # Process remaining futures
+        if futures:
+            # No timeout here: leaving the ``with`` block waits for every worker anyway,
+            # so waiting first costs nothing and keeps their results from being dropped
+            done, _ = wait(futures)
+            try:
+                processed += process_futures(list(done), collection, embedding_field, processed, total_docs, callback)
+            except Exception as e:
+                logger.error(f"Error processing final futures: {str(e)}")
+    logger.info(f"Completed embedding generation. Processed {processed}/{total_docs} documents")
+    return processed