Fix summarization unit test
Browse files
src/knowlang/models/embeddings.py
CHANGED
@@ -35,10 +35,10 @@ def _process_voiage_batch(inputs: List[str], model_name: str, input_type:Embeddi
|
|
35 |
return embeddings_obj.embeddings
|
36 |
|
37 |
@overload
|
38 |
-
def generate_embedding(input: str, config: EmbeddingConfig) -> EmbeddingVector: ...
|
39 |
|
40 |
@overload
|
41 |
-
def generate_embedding(input: List[str], config: EmbeddingConfig) -> List[EmbeddingVector]: ...
|
42 |
|
43 |
def generate_embedding(
|
44 |
input: Union[str, List[str]],
|
|
|
35 |
return embeddings_obj.embeddings
|
36 |
|
37 |
@overload
|
38 |
+
def generate_embedding(input: str, config: EmbeddingConfig, input_type: Optional[EmbeddingInputType]) -> EmbeddingVector: ...
|
39 |
|
40 |
@overload
|
41 |
+
def generate_embedding(input: List[str], config: EmbeddingConfig, input_type: Optional[EmbeddingInputType]) -> List[EmbeddingVector]: ...
|
42 |
|
43 |
def generate_embedding(
|
44 |
input: Union[str, List[str]],
|
src/knowlang/summarizer/summarizer.py
CHANGED
@@ -7,7 +7,8 @@ from pprint import pformat
|
|
7 |
from rich.progress import Progress
|
8 |
|
9 |
from knowlang.configs.config import AppConfig
|
10 |
-
from knowlang.core.types import CodeChunk
|
|
|
11 |
from knowlang.utils.fancy_log import FancyLogger
|
12 |
from knowlang.utils.model_provider import create_pydantic_model
|
13 |
from knowlang.models.embeddings import generate_embedding
|
@@ -91,19 +92,11 @@ Provide a clean, concise and focused summary. Don't include unnecessary nor gene
|
|
91 |
result = await self.agent.run(prompt)
|
92 |
LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
|
93 |
|
94 |
-
return result.data
|
95 |
-
|
96 |
async def process_and_store_chunk(self, chunk: CodeChunk):
|
97 |
"""Process a chunk and store it in ChromaDB"""
|
98 |
summary = await self.summarize_chunk(chunk)
|
99 |
-
|
100 |
-
summary = f"""
|
101 |
-
CODE:
|
102 |
-
{chunk.content}
|
103 |
-
|
104 |
-
SUMMARY:
|
105 |
-
{summary}
|
106 |
-
"""
|
107 |
|
108 |
# Create a unique ID for the chunk
|
109 |
chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
|
|
|
7 |
from rich.progress import Progress
|
8 |
|
9 |
from knowlang.configs.config import AppConfig
|
10 |
+
from knowlang.core.types import CodeChunk
|
11 |
+
from knowlang.utils.chunking_util import format_code_summary
|
12 |
from knowlang.utils.fancy_log import FancyLogger
|
13 |
from knowlang.utils.model_provider import create_pydantic_model
|
14 |
from knowlang.models.embeddings import generate_embedding
|
|
|
92 |
result = await self.agent.run(prompt)
|
93 |
LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
|
94 |
|
95 |
+
return format_code_summary(chunk.content, result.data)
|
96 |
+
|
97 |
async def process_and_store_chunk(self, chunk: CodeChunk):
|
98 |
"""Process a chunk and store it in ChromaDB"""
|
99 |
summary = await self.summarize_chunk(chunk)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
# Create a unique ID for the chunk
|
102 |
chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
|
src/knowlang/utils/chunking_util.py
CHANGED
@@ -2,6 +2,10 @@
|
|
2 |
MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
|
3 |
|
4 |
|
|
|
|
|
|
|
|
|
5 |
def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
|
6 |
"""Truncate text to approximate token limit while preserving structure"""
|
7 |
if len(text) <= max_chars:
|
|
|
2 |
MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
|
3 |
|
4 |
|
5 |
+
def format_code_summary(code: str, summary: str) -> str:
|
6 |
+
"""Format code and summary into a single string"""
|
7 |
+
return f"CODE:\n{code}\n\nSUMMARY:\n{summary}"
|
8 |
+
|
9 |
def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
|
10 |
"""Truncate text to approximate token limit while preserving structure"""
|
11 |
if len(text) <= max_chars:
|
tests/test_summarizer.py
CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
|
|
5 |
from knowlang.summarizer.summarizer import CodeSummarizer
|
6 |
from knowlang.core.types import CodeChunk, ChunkType
|
7 |
from knowlang.configs.config import AppConfig
|
|
|
8 |
|
9 |
@pytest.fixture
|
10 |
def config():
|
@@ -64,7 +65,7 @@ async def test_summarize_chunk(mock_agent_class, config: AppConfig, sample_chunk
|
|
64 |
|
65 |
# Verify result
|
66 |
assert isinstance(result, str)
|
67 |
-
assert result == mock_run_result.data
|
68 |
|
69 |
# Verify agent was called with correct prompt
|
70 |
call_args = mock_agent.run.call_args[0][0]
|
@@ -110,10 +111,12 @@ async def test_process_and_store_chunk_with_embedding(
|
|
110 |
|
111 |
# Process the chunk
|
112 |
await summarizer.process_and_store_chunk(sample_chunks[0])
|
|
|
|
|
113 |
|
114 |
# Verify ollama.embed was called with correct parameters
|
115 |
mock_embedding_generator.assert_called_once_with(
|
116 |
-
|
117 |
config.embedding,
|
118 |
)
|
119 |
|
@@ -124,7 +127,7 @@ async def test_process_and_store_chunk_with_embedding(
|
|
124 |
kwargs = add_call[1]
|
125 |
assert len(kwargs['embeddings']) == 3
|
126 |
assert kwargs['embeddings'] == mock_embedding
|
127 |
-
assert kwargs['documents'][0] ==
|
128 |
assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
|
129 |
|
130 |
# Verify metadata
|
|
|
5 |
from knowlang.summarizer.summarizer import CodeSummarizer
|
6 |
from knowlang.core.types import CodeChunk, ChunkType
|
7 |
from knowlang.configs.config import AppConfig
|
8 |
+
from knowlang.utils.chunking_util import format_code_summary
|
9 |
|
10 |
@pytest.fixture
|
11 |
def config():
|
|
|
65 |
|
66 |
# Verify result
|
67 |
assert isinstance(result, str)
|
68 |
+
assert result == format_code_summary(sample_chunks[0].content, mock_run_result.data)
|
69 |
|
70 |
# Verify agent was called with correct prompt
|
71 |
call_args = mock_agent.run.call_args[0][0]
|
|
|
111 |
|
112 |
# Process the chunk
|
113 |
await summarizer.process_and_store_chunk(sample_chunks[0])
|
114 |
+
|
115 |
+
code_summary = format_code_summary(sample_chunks[0].content, mock_run_result.data)
|
116 |
|
117 |
# Verify ollama.embed was called with correct parameters
|
118 |
mock_embedding_generator.assert_called_once_with(
|
119 |
+
code_summary,
|
120 |
config.embedding,
|
121 |
)
|
122 |
|
|
|
127 |
kwargs = add_call[1]
|
128 |
assert len(kwargs['embeddings']) == 3
|
129 |
assert kwargs['embeddings'] == mock_embedding
|
130 |
+
assert kwargs['documents'][0] == code_summary
|
131 |
assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
|
132 |
|
133 |
# Verify metadata
|