Spaces:

gabykim
/

KnowLang_Transformers_Demo

Sleeping

App Files Files Community

gabykim committed on Feb 7

Commit

c9b82b3

1 Parent(s): e5bfc68

voyageai code embedding support

Browse files

Files changed (11) hide show

chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/data_level0.bin +3 -0
chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/header.bin +3 -0
chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/index_metadata.pickle +3 -0
chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/length.bin +3 -0
chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/link_lists.bin +3 -0
chromadb/transformers-voyage-voyage-code-3/chroma.sqlite3 +3 -0
src/know_lang_bot/chat_bot/chat_graph.py +3 -2
src/know_lang_bot/evaluation/embedding_evaluation.py +2 -2
src/know_lang_bot/models/embeddings.py +20 -3
src/know_lang_bot/utils/migration/{embedding_migrations.py → openai_embedding_migrations.py} +0 -0
src/know_lang_bot/utils/migration/voyage_embedding_migraions.py +122 -0

chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc2ad2936b97a745b3944f6e620b6b68e700f087b537d9d5fdf841e05289dbc0
+size 21180000

chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9704261c6e5bfae182e06958ecff6199f03c9e1b13a6d73eb4c7034a7be4aaeb
+size 100

chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cc112e3dd2dac5a0e2bb79cbb76f769a6e976706be58582bbdf420a7dd3b29b
+size 590939

chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96b3e7659800f977eb5fb17017d037cfbcfde55284ef46f342b6b9973a770a55
+size 20000

chromadb/transformers-voyage-voyage-code-3/8b1aa7a1-ab3a-481d-93f0-d3cfe1102024/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19fd5f1636c4e4ec69b581a68db57a148fcd2ae3339bbcacc16c58950718f212
+size 42780

chromadb/transformers-voyage-voyage-code-3/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:952a740a945d42a643a84b55922ca78adaf773b3924a7fdb7f78dd2eb5e7f3c3
+size 88666112

src/know_lang_bot/chat_bot/chat_graph.py CHANGED Viewed

@@ -14,7 +14,7 @@ from enum import Enum
 from rich.console import Console
 from know_lang_bot.utils.model_provider import create_pydantic_model
 from know_lang_bot.utils.chunking_util import truncate_chunk
-from know_lang_bot.models.embeddings import generate_embedding
 import voyageai
 from voyageai.object.reranking import RerankingObject
@@ -162,7 +162,8 @@ class RetrieveContextNode(BaseNode[ChatGraphState, ChatGraphDeps, ChatResult]):
         """Get initial chunks using embedding search"""
         question_embedding = generate_embedding(
             input=query,
-            config=embedding_config
         )
         results = collection.query(

 from rich.console import Console
 from know_lang_bot.utils.model_provider import create_pydantic_model
 from know_lang_bot.utils.chunking_util import truncate_chunk
+from know_lang_bot.models.embeddings import EmbeddingInputType, generate_embedding
 import voyageai
 from voyageai.object.reranking import RerankingObject
         """Get initial chunks using embedding search"""
         question_embedding = generate_embedding(
             input=query,
+            config=embedding_config,
+            input_type=EmbeddingInputType.QUERY
         )
         results = collection.query(

src/know_lang_bot/evaluation/embedding_evaluation.py CHANGED Viewed

@@ -10,7 +10,7 @@ from know_lang_bot.chat_bot.chat_graph import ChatResult
 from know_lang_bot.config import AppConfig, EmbeddingConfig
 import json
 from know_lang_bot.evaluation.chatbot_evaluation import EvalCase, TRANSFORMER_TEST_CASES
-from know_lang_bot.models.embeddings import generate_embedding, EmbeddingVector
 @dataclass
 class ConfigEvalResult:
@@ -65,7 +65,7 @@ async def analyze_embedding_distributions(
             # Generate embeddings for all test cases
             questions = [case.question for case in test_cases]
             try:
-                embeddings = generate_embedding(questions, config.embedding)
                 # Cache the embeddings
                 cached_embeddings = {

 from know_lang_bot.config import AppConfig, EmbeddingConfig
 import json
 from know_lang_bot.evaluation.chatbot_evaluation import EvalCase, TRANSFORMER_TEST_CASES
+from know_lang_bot.models.embeddings import EmbeddingInputType, generate_embedding, EmbeddingVector
 @dataclass
 class ConfigEvalResult:
             # Generate embeddings for all test cases
             questions = [case.question for case in test_cases]
             try:
+                embeddings = generate_embedding(questions, config.embedding, input_type=EmbeddingInputType.QUERY)
                 # Cache the embeddings
                 cached_embeddings = {

src/know_lang_bot/models/embeddings.py CHANGED Viewed

@@ -1,11 +1,20 @@
 import ollama
 import openai
 from know_lang_bot.config import EmbeddingConfig, ModelProvider
-from typing import Union, List, overload
 # Type definitions
 EmbeddingVector = List[float]
 def _process_ollama_batch(inputs: List[str], model_name: str) -> List[EmbeddingVector]:
     """Helper function to process Ollama embeddings in batch."""
     return ollama.embed(model=model_name, input=inputs)['embeddings']
@@ -19,6 +28,12 @@ def _process_openai_batch(inputs: List[str], model_name: str) -> List[EmbeddingV
     )
     return [item.embedding for item in response.data]
 @overload
 def generate_embedding(input: str, config: EmbeddingConfig) -> EmbeddingVector: ...
@@ -27,7 +42,8 @@ def generate_embedding(input: List[str], config: EmbeddingConfig) -> List[Embedd
 def generate_embedding(
     input: Union[str, List[str]],
-    config: EmbeddingConfig
 ) -> Union[EmbeddingVector, List[EmbeddingVector]]:
     """
     Generate embeddings for single text input or batch of texts.
@@ -54,8 +70,9 @@ def generate_embedding(
         if config.model_provider == ModelProvider.OLLAMA:
             embeddings = _process_ollama_batch(inputs, config.model_name)
         elif config.model_provider == ModelProvider.OPENAI:
-            openai.api_key = config.api_key
             embeddings = _process_openai_batch(inputs, config.model_name)
         else:
             raise ValueError(f"Unsupported provider: {config.model_provider}")

 import ollama
 import openai
+import voyageai
+import voyageai.client
 from know_lang_bot.config import EmbeddingConfig, ModelProvider
+from typing import Union, List, overload, Optional
+from enum import Enum
 # Type definitions
 EmbeddingVector = List[float]
+class EmbeddingInputType(Enum):  # role of the text being embedded; `.value` is what gets passed to VoyageAI as `input_type`
+    DOCUMENT = "document"  # default `input_type` of generate_embedding
+    QUERY = "query"  # passed by callers that embed user questions for retrieval
 def _process_ollama_batch(inputs: List[str], model_name: str) -> List[EmbeddingVector]:
     """Helper function to process Ollama embeddings in batch."""
     return ollama.embed(model=model_name, input=inputs)['embeddings']
     )
     return [item.embedding for item in response.data]
+def _process_voiage_batch(inputs: List[str], model_name: str, input_type:EmbeddingInputType) -> List[EmbeddingVector]:
+    """Helper function to process VoyageAI embeddings in batch."""
+    vo = voyageai.Client()
+    embeddings_obj = vo.embed(model=model_name, texts=inputs, input_type=input_type.value)
+    return embeddings_obj.embeddings
 @overload
 def generate_embedding(input: str, config: EmbeddingConfig) -> EmbeddingVector: ...
 def generate_embedding(
     input: Union[str, List[str]],
+    config: EmbeddingConfig,
+    input_type: Optional[EmbeddingInputType] = EmbeddingInputType.DOCUMENT
 ) -> Union[EmbeddingVector, List[EmbeddingVector]]:
     """
     Generate embeddings for single text input or batch of texts.
         if config.model_provider == ModelProvider.OLLAMA:
             embeddings = _process_ollama_batch(inputs, config.model_name)
         elif config.model_provider == ModelProvider.OPENAI:
             embeddings = _process_openai_batch(inputs, config.model_name)
+        elif config.model_provider == ModelProvider.VOYAGE:
+            embeddings = _process_voiage_batch(inputs, config.model_name, input_type)
         else:
             raise ValueError(f"Unsupported provider: {config.model_provider}")

src/know_lang_bot/utils/migration/{embedding_migrations.py → openai_embedding_migrations.py} RENAMED Viewed

File without changes

src/know_lang_bot/utils/migration/voyage_embedding_migraions.py ADDED Viewed

	@@ -0,0 +1,122 @@

+from pathlib import Path
+import asyncio
+import chromadb
+from chromadb.errors import InvalidCollectionException
+from rich.progress import Progress
+from rich.console import Console
+from typing import List
+from know_lang_bot.config import AppConfig, EmbeddingConfig
+from know_lang_bot.models.embeddings import generate_embedding, EmbeddingInputType
+from know_lang_bot.utils.fancy_log import FancyLogger
+LOG = FancyLogger(__name__)  # module-level logger; used below to report batch failures
+console = Console()  # rich console for the user-facing status and summary lines
+BATCH_SIZE = 64  # VoyageAI's maximum batch size is 128; 64 stays below that limit
+async def process_batch(
+    documents: List[str],
+    config: EmbeddingConfig,
+) -> List[List[float]]:
+    """Process a batch of documents to generate embeddings"""
+    try:
+        embeddings = generate_embedding(
+            input=documents,
+            config=config,
+            input_type=EmbeddingInputType.DOCUMENT
+        )
+        return embeddings
+    except Exception as e:
+        LOG.error(f"Error processing batch: {e}")
+        raise
+async def migrate_embeddings(config: AppConfig):
+    """Migrate embeddings using VoyageAI's API"""
+    # Initialize source DB client (existing)
+    source_client = chromadb.PersistentClient(
+        path=str(config.db.persist_directory)
+    )
+    source_collection = source_client.get_collection(
+        name=config.db.collection_name
+    )
+    # Initialize target DB client (new)
+    target_path = Path(config.db.persist_directory).parent / f"transformers-{config.embedding.model_provider.value}-{config.embedding.model_name}"
+    target_path.mkdir(exist_ok=True)
+    target_client = chromadb.PersistentClient(path=str(target_path))
+    # Create new collection
+    new_collection_name = f"{config.db.collection_name}_voyage"
+    try:
+        target_collection = target_client.get_collection(name=new_collection_name)
+        console.print(f"[yellow]Collection {new_collection_name} already exists. Deleting...")
+        target_client.delete_collection(name=new_collection_name)
+    except InvalidCollectionException:
+        pass
+    target_collection = target_client.create_collection(
+        name=new_collection_name,
+        metadata={"hnsw:space": "cosine"}
+    )
+    # Get all documents from source
+    results = source_collection.get(
+        include=['documents', 'metadatas']
+    )
+    total_documents = len(results['ids'])
+    console.print(f"[green]Found {total_documents} documents to process")
+    with Progress() as progress:
+        batch_task = progress.add_task(
+            "Processing batches...",
+            total=total_documents
+        )
+        # Process in batches
+        for i in range(0, total_documents, BATCH_SIZE):
+            batch_end = min(i + BATCH_SIZE, total_documents)
+            batch_docs = results['documents'][i:batch_end]
+            batch_ids = results['ids'][i:batch_end]
+            batch_metadatas = results['metadatas'][i:batch_end]
+            try:
+                # Generate embeddings for batch
+                embeddings = await process_batch(
+                    documents=batch_docs,
+                    config=config.embedding
+                )
+                # Add to new collection
+                target_collection.add(
+                    embeddings=embeddings,
+                    documents=batch_docs,
+                    metadatas=batch_metadatas,
+                    ids=batch_ids
+                )
+                await asyncio.sleep(2)
+            except Exception as e:
+                LOG.error(f"Failed to process batch {i//BATCH_SIZE}: {e}")
+                # Log failed IDs for retry
+                failed_ids = batch_ids
+                console.print(f"[red]Failed IDs: {failed_ids}")
+                continue
+            finally:
+                progress.advance(batch_task, len(batch_docs))
+    # Print statistics
+    final_count = len(target_collection.get()['ids'])
+    console.print(f"\n[green]Migration complete!")
+    console.print(f"Source documents: {total_documents}")
+    console.print(f"Target documents: {final_count}")
+    console.print(f"\nNew database location: {target_path}")
+    if final_count < total_documents:
+        console.print(f"[yellow]Warning: {total_documents - final_count} documents failed to process")
+if __name__ == "__main__":
+    config = AppConfig()
+    asyncio.run(migrate_embeddings(config))