Fix summarization unit test
Browse files
src/knowlang/models/embeddings.py
CHANGED
@@ -35,10 +35,10 @@ def _process_voiage_batch(inputs: List[str], model_name: str, input_type:Embeddi
|
|
35 |
return embeddings_obj.embeddings
|
36 |
|
37 |
@overload
|
38 |
-
def generate_embedding(input: str, config: EmbeddingConfig) -> EmbeddingVector: ...
|
39 |
|
40 |
@overload
|
41 |
-
def generate_embedding(input: List[str], config: EmbeddingConfig) -> List[EmbeddingVector]: ...
|
42 |
|
43 |
def generate_embedding(
|
44 |
input: Union[str, List[str]],
|
|
|
35 |
return embeddings_obj.embeddings
|
36 |
|
37 |
@overload
|
38 |
+
def generate_embedding(input: str, config: EmbeddingConfig, input_type: Optional[EmbeddingInputType]) -> EmbeddingVector: ...
|
39 |
|
40 |
@overload
|
41 |
+
def generate_embedding(input: List[str], config: EmbeddingConfig, input_type: Optional[EmbeddingInputType]) -> List[EmbeddingVector]: ...
|
42 |
|
43 |
def generate_embedding(
|
44 |
input: Union[str, List[str]],
|
src/knowlang/summarizer/summarizer.py
CHANGED
@@ -7,7 +7,8 @@ from pprint import pformat
|
|
7 |
from rich.progress import Progress
|
8 |
|
9 |
from knowlang.configs.config import AppConfig
|
10 |
-
from knowlang.core.types import CodeChunk
|
|
|
11 |
from knowlang.utils.fancy_log import FancyLogger
|
12 |
from knowlang.utils.model_provider import create_pydantic_model
|
13 |
from knowlang.models.embeddings import generate_embedding
|
@@ -91,19 +92,11 @@ Provide a clean, concise and focused summary. Don't include unnecessary nor gene
|
|
91 |
result = await self.agent.run(prompt)
|
92 |
LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
|
93 |
|
94 |
-
return result.data
|
95 |
-
|
96 |
async def process_and_store_chunk(self, chunk: CodeChunk):
|
97 |
"""Process a chunk and store it in ChromaDB"""
|
98 |
summary = await self.summarize_chunk(chunk)
|
99 |
-
|
100 |
-
summary = f"""
|
101 |
-
CODE:
|
102 |
-
{chunk.content}
|
103 |
-
|
104 |
-
SUMMARY:
|
105 |
-
{summary}
|
106 |
-
"""
|
107 |
|
108 |
# Create a unique ID for the chunk
|
109 |
chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
|
|
|
7 |
from rich.progress import Progress
|
8 |
|
9 |
from knowlang.configs.config import AppConfig
|
10 |
+
from knowlang.core.types import CodeChunk
|
11 |
+
from knowlang.utils.chunking_util import format_code_summary
|
12 |
from knowlang.utils.fancy_log import FancyLogger
|
13 |
from knowlang.utils.model_provider import create_pydantic_model
|
14 |
from knowlang.models.embeddings import generate_embedding
|
|
|
92 |
result = await self.agent.run(prompt)
|
93 |
LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
|
94 |
|
95 |
+
return format_code_summary(chunk.content, result.data)
|
96 |
+
|
97 |
async def process_and_store_chunk(self, chunk: CodeChunk):
|
98 |
"""Process a chunk and store it in ChromaDB"""
|
99 |
summary = await self.summarize_chunk(chunk)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
# Create a unique ID for the chunk
|
102 |
chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
|
src/knowlang/utils/chunking_util.py
CHANGED
@@ -2,6 +2,10 @@
|
|
2 |
MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
|
3 |
|
4 |
|
|
|
|
|
|
|
|
|
5 |
def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
|
6 |
"""Truncate text to approximate token limit while preserving structure"""
|
7 |
if len(text) <= max_chars:
|
|
|
2 |
MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
|
3 |
|
4 |
|
5 |
+
def format_code_summary(code: str, summary: str) -> str:
|
6 |
+
"""Format code and summary into a single string"""
|
7 |
+
return f"CODE:\n{code}\n\nSUMMARY:\n{summary}"
|
8 |
+
|
9 |
def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
|
10 |
"""Truncate text to approximate token limit while preserving structure"""
|
11 |
if len(text) <= max_chars:
|
tests/test_summarizer.py
CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
|
|
5 |
from knowlang.summarizer.summarizer import CodeSummarizer
|
6 |
from knowlang.core.types import CodeChunk, ChunkType
|
7 |
from knowlang.configs.config import AppConfig
|
|
|
8 |
|
9 |
@pytest.fixture
|
10 |
def config():
|
@@ -64,7 +65,7 @@ async def test_summarize_chunk(mock_agent_class, config: AppConfig, sample_chunk
|
|
64 |
|
65 |
# Verify result
|
66 |
assert isinstance(result, str)
|
67 |
-
assert result == mock_run_result.data
|
68 |
|
69 |
# Verify agent was called with correct prompt
|
70 |
call_args = mock_agent.run.call_args[0][0]
|
@@ -110,10 +111,12 @@ async def test_process_and_store_chunk_with_embedding(
|
|
110 |
|
111 |
# Process the chunk
|
112 |
await summarizer.process_and_store_chunk(sample_chunks[0])
|
|
|
|
|
113 |
|
114 |
# Verify ollama.embed was called with correct parameters
|
115 |
mock_embedding_generator.assert_called_once_with(
|
116 |
-
|
117 |
config.embedding,
|
118 |
)
|
119 |
|
@@ -124,7 +127,7 @@ async def test_process_and_store_chunk_with_embedding(
|
|
124 |
kwargs = add_call[1]
|
125 |
assert len(kwargs['embeddings']) == 3
|
126 |
assert kwargs['embeddings'] == mock_embedding
|
127 |
-
assert kwargs['documents'][0] ==
|
128 |
assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
|
129 |
|
130 |
# Verify metadata
|
|
|
5 |
from knowlang.summarizer.summarizer import CodeSummarizer
|
6 |
from knowlang.core.types import CodeChunk, ChunkType
|
7 |
from knowlang.configs.config import AppConfig
|
8 |
+
from knowlang.utils.chunking_util import format_code_summary
|
9 |
|
10 |
@pytest.fixture
|
11 |
def config():
|
|
|
65 |
|
66 |
# Verify result
|
67 |
assert isinstance(result, str)
|
68 |
+
assert result == format_code_summary(sample_chunks[0].content, mock_run_result.data)
|
69 |
|
70 |
# Verify agent was called with correct prompt
|
71 |
call_args = mock_agent.run.call_args[0][0]
|
|
|
111 |
|
112 |
# Process the chunk
|
113 |
await summarizer.process_and_store_chunk(sample_chunks[0])
|
114 |
+
|
115 |
+
code_summary = format_code_summary(sample_chunks[0].content, mock_run_result.data)
|
116 |
|
117 |
# Verify ollama.embed was called with correct parameters
|
118 |
mock_embedding_generator.assert_called_once_with(
|
119 |
+
code_summary,
|
120 |
config.embedding,
|
121 |
)
|
122 |
|
|
|
127 |
kwargs = add_call[1]
|
128 |
assert len(kwargs['embeddings']) == 3
|
129 |
assert kwargs['embeddings'] == mock_embedding
|
130 |
+
assert kwargs['documents'][0] == code_summary
|
131 |
assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
|
132 |
|
133 |
# Verify metadata
|