ollama embedding for chromadb chunks
- poetry.lock +17 -1
- pyproject.toml +2 -1
- src/know_lang_bot/code_parser/summarizer.py +16 -0
- src/know_lang_bot/config.py +8 -0
- tests/test_summarizer.py +53 -2
poetry.lock
CHANGED
@@ -1493,6 +1493,22 @@ rsa = ["cryptography (>=3.0.0)"]
 signals = ["blinker (>=1.4.0)"]
 signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
 
+[[package]]
+name = "ollama"
+version = "0.4.7"
+description = "The official Python client for Ollama."
+optional = false
+python-versions = "<4.0,>=3.8"
+groups = ["main"]
+files = [
+    {file = "ollama-0.4.7-py3-none-any.whl", hash = "sha256:85505663cca67a83707be5fb3aeff0ea72e67846cea5985529d8eca4366564a1"},
+    {file = "ollama-0.4.7.tar.gz", hash = "sha256:891dcbe54f55397d82d289c459de0ea897e103b86a3f1fad0fdb1895922a75ff"},
+]
+
+[package.dependencies]
+httpx = ">=0.27,<0.29"
+pydantic = ">=2.9.0,<3.0.0"
+
 [[package]]
 name = "onnxruntime"
 version = "1.20.1"
@@ -3140,4 +3156,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10, <4.0"
-content-hash = "
+content-hash = "2f776d5c10f8354dd6e7916b1453b1a5de7c28b93b8bbc287dbd874d1f9f1cee"
pyproject.toml
CHANGED
@@ -17,7 +17,8 @@ dependencies = [
     "tree-sitter (>=0.24.0,<0.25.0)",
     "tree-sitter-python (>=0.23.6,<0.24.0)",
     "pydantic-settings (>=2.7.1,<3.0.0)",
-    "chromadb (>=0.6.3,<0.7.0)"
+    "chromadb (>=0.6.3,<0.7.0)",
+    "ollama (>=0.4.7,<0.5.0)"
 ]
 
 [tool.poetry]
src/know_lang_bot/code_parser/summarizer.py
CHANGED
@@ -3,6 +3,7 @@ import chromadb
 from chromadb.errors import InvalidCollectionException
 from pydantic_ai import Agent
 from pydantic import BaseModel, Field
+import ollama
 
 from know_lang_bot.config import AppConfig
 from know_lang_bot.code_parser.parser import CodeChunk
@@ -62,6 +63,17 @@ class CodeSummarizer:
             metadata={"hnsw:space": "cosine"}
         )
 
+    def _get_embedding(self, text: str) -> List[float]:
+        """Get embedding for text using configured provider"""
+        if self.config.llm.embedding_provider == "ollama":
+            response = ollama.embed(
+                model=self.config.llm.embedding_model,
+                input=text
+            )
+            return response['embedding']
+        else:
+            raise ValueError(f"Unsupported embedding provider: {self.config.llm.embedding_provider}")
+
     async def summarize_chunk(self, chunk: CodeChunk) -> str:
         """Summarize a single code chunk using the LLM"""
         prompt = f"""
@@ -96,9 +108,13 @@ class CodeSummarizer:
             docstring=chunk.docstring if chunk.docstring else ''
         )
 
+        # Get embedding for the summary
+        embedding = self._get_embedding(summary)
+
         # Store in ChromaDB
        self.collection.add(
            documents=[summary],
+           embeddings=[embedding],
            metadatas=[metadata.model_dump()],
            ids=[chunk_id]
        )
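Since summaries are now stored with Ollama vectors, lookups against this collection need query embeddings produced by the same model. Below is a minimal retrieval sketch, not part of this commit, assuming AppConfig() loads usable defaults, the summarizer persists to config.db.persist_directory, and the configured embedding model has been pulled into a local Ollama server. Note that the ollama 0.4 Python client returns the result of embed() under a plural embeddings field (a list of vectors), so the singular response['embedding'] lookup in _get_embedding above is worth verifying against a live server rather than only the mocked tests.

import chromadb
import ollama

from know_lang_bot.config import AppConfig

config = AppConfig()

# Re-open the same persistent store and collection the summarizer writes to.
client = chromadb.PersistentClient(path=str(config.db.persist_directory))
collection = client.get_collection(config.db.collection_name)

# Embed the query with the same model used at indexing time so cosine
# distances in the collection's "hnsw:space" are comparable.
question = "Where are code chunks summarized?"
response = ollama.embed(model=config.llm.embedding_model, input=question)

results = collection.query(
    query_embeddings=[list(response.embeddings[0])],  # 0.4 client: list of vectors
    n_results=5,
)
print(results["documents"][0])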
src/know_lang_bot/config.py
CHANGED
@@ -20,6 +20,14 @@ class LLMConfig(BaseSettings):
         default_factory=dict,
         description="Additional model settings"
     )
+    embedding_model: str = Field(
+        default="mxbai-embed-large",
+        description="Name of the embedding model to use"
+    )
+    embedding_provider: str = Field(
+        default="ollama",
+        description="Provider for embeddings (ollama, openai, etc)"
+    )
 
 class DBConfig(BaseSettings):
     persist_directory: Path = Field(
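The two new fields follow the same Field pattern as the rest of LLMConfig, so they can be overridden programmatically or through whatever settings sources pydantic-settings is wired to. A small sketch, assuming AppConfig accepts an llm keyword the way summarizer.py reads config.llm and that the remaining LLMConfig fields have defaults; the model name is only an example.

from know_lang_bot.config import AppConfig, LLMConfig

# Hypothetical override: swap the embedding model while keeping the Ollama provider.
config = AppConfig(
    llm=LLMConfig(
        embedding_provider="ollama",
        embedding_model="nomic-embed-text",  # example model name, not a project default
    )
)

assert config.llm.embedding_model == "nomic-embed-text"
assert config.llm.embedding_provider == "ollama"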
tests/test_summarizer.py
CHANGED
@@ -2,7 +2,6 @@ import pytest
 import tempfile
 from unittest.mock import Mock, patch, AsyncMock
 from pathlib import Path
-from pydantic_ai import Agent
 from know_lang_bot.code_parser.summarizer import CodeSummarizer
 from know_lang_bot.code_parser.parser import CodeChunk, ChunkType
 from know_lang_bot.config import AppConfig
@@ -83,4 +82,56 @@ def test_chromadb_initialization(mock_agent_class, config: AppConfig):
     # Verify we can create a new collection
     summarizer.db_client.delete_collection(config.db.collection_name)
     new_summarizer = CodeSummarizer(config)
-    assert new_summarizer.collection is not None
+    assert new_summarizer.collection is not None
+
+@pytest.mark.asyncio
+@patch('know_lang_bot.code_parser.summarizer.ollama')
+@patch('know_lang_bot.code_parser.summarizer.Agent')
+async def test_process_and_store_chunk_with_embedding(
+    mock_agent_class,
+    mock_ollama,
+    config: AppConfig,
+    sample_chunks: list[CodeChunk],
+    mock_run_result: Mock
+):
+    """Test processing and storing a chunk with embedding"""
+    # Setup the mock agent instance
+    mock_agent = mock_agent_class.return_value
+    mock_agent.run = AsyncMock(return_value=mock_run_result)
+
+    # Setup mock embedding response
+    mock_embedding = {'embedding': [0.1, 0.2, 0.3]}  # Sample embedding vector
+    mock_ollama.embed = Mock(return_value=mock_embedding)
+
+    summarizer = CodeSummarizer(config)
+
+    # Mock the collection's add method
+    summarizer.collection.add = Mock()
+
+    # Process the chunk
+    await summarizer.process_and_store_chunk(sample_chunks[0])
+
+    # Verify ollama.embed was called with correct parameters
+    mock_ollama.embed.assert_called_once_with(
+        model=config.llm.embedding_model,
+        input=mock_run_result.data
+    )
+
+    # Verify collection.add was called with correct parameters
+    add_call = summarizer.collection.add.call_args
+    assert add_call is not None
+
+    kwargs = add_call[1]
+    assert len(kwargs['embeddings']) == 1
+    assert kwargs['embeddings'][0] == mock_embedding['embedding']
+    assert kwargs['documents'][0] == mock_run_result.data
+    assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
+
+    # Verify metadata
+    metadata = kwargs['metadatas'][0]
+    assert metadata['file_path'] == sample_chunks[0].file_path
+    assert metadata['start_line'] == sample_chunks[0].start_line
+    assert metadata['end_line'] == sample_chunks[0].end_line
+    assert metadata['type'] == sample_chunks[0].type.value
+    assert metadata['name'] == sample_chunks[0].name
+    assert metadata['docstring'] == sample_chunks[0].docstring