gabykim committed on
Commit
3a5efa8
·
1 Parent(s): 60532a1

fix summarization unittest

Browse files
src/knowlang/models/embeddings.py CHANGED
@@ -35,10 +35,10 @@ def _process_voiage_batch(inputs: List[str], model_name: str, input_type:Embeddi
35
  return embeddings_obj.embeddings
36
 
37
  @overload
38
- def generate_embedding(input: str, config: EmbeddingConfig) -> EmbeddingVector: ...
39
 
40
  @overload
41
- def generate_embedding(input: List[str], config: EmbeddingConfig) -> List[EmbeddingVector]: ...
42
 
43
  def generate_embedding(
44
  input: Union[str, List[str]],
 
35
  return embeddings_obj.embeddings
36
 
37
  @overload
38
+ def generate_embedding(input: str, config: EmbeddingConfig, input_type: Optional[EmbeddingInputType]) -> EmbeddingVector: ...
39
 
40
  @overload
41
+ def generate_embedding(input: List[str], config: EmbeddingConfig, input_type: Optional[EmbeddingInputType]) -> List[EmbeddingVector]: ...
42
 
43
  def generate_embedding(
44
  input: Union[str, List[str]],
src/knowlang/summarizer/summarizer.py CHANGED
@@ -7,7 +7,8 @@ from pprint import pformat
7
  from rich.progress import Progress
8
 
9
  from knowlang.configs.config import AppConfig
10
- from knowlang.core.types import CodeChunk, ModelProvider
 
11
  from knowlang.utils.fancy_log import FancyLogger
12
  from knowlang.utils.model_provider import create_pydantic_model
13
  from knowlang.models.embeddings import generate_embedding
@@ -91,19 +92,11 @@ Provide a clean, concise and focused summary. Don't include unnecessary nor gene
91
  result = await self.agent.run(prompt)
92
  LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
93
 
94
- return result.data
95
-
96
  async def process_and_store_chunk(self, chunk: CodeChunk):
97
  """Process a chunk and store it in ChromaDB"""
98
  summary = await self.summarize_chunk(chunk)
99
-
100
- summary = f"""
101
- CODE:
102
- {chunk.content}
103
-
104
- SUMMARY:
105
- {summary}
106
- """
107
 
108
  # Create a unique ID for the chunk
109
  chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
 
7
  from rich.progress import Progress
8
 
9
  from knowlang.configs.config import AppConfig
10
+ from knowlang.core.types import CodeChunk
11
+ from knowlang.utils.chunking_util import format_code_summary
12
  from knowlang.utils.fancy_log import FancyLogger
13
  from knowlang.utils.model_provider import create_pydantic_model
14
  from knowlang.models.embeddings import generate_embedding
 
92
  result = await self.agent.run(prompt)
93
  LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
94
 
95
+ return format_code_summary(chunk.content, result.data)
96
+
97
  async def process_and_store_chunk(self, chunk: CodeChunk):
98
  """Process a chunk and store it in ChromaDB"""
99
  summary = await self.summarize_chunk(chunk)
 
 
 
 
 
 
 
 
100
 
101
  # Create a unique ID for the chunk
102
  chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
src/knowlang/utils/chunking_util.py CHANGED
@@ -2,6 +2,10 @@
2
  MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
3
 
4
 
 
 
 
 
5
  def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
6
  """Truncate text to approximate token limit while preserving structure"""
7
  if len(text) <= max_chars:
 
2
  MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
3
 
4
 
5
+ def format_code_summary(code: str, summary: str) -> str:
6
+ """Format code and summary into a single string"""
7
+ return f"CODE:\n{code}\n\nSUMMARY:\n{summary}"
8
+
9
  def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
10
  """Truncate text to approximate token limit while preserving structure"""
11
  if len(text) <= max_chars:
tests/test_summarizer.py CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
5
  from knowlang.summarizer.summarizer import CodeSummarizer
6
  from knowlang.core.types import CodeChunk, ChunkType
7
  from knowlang.configs.config import AppConfig
 
8
 
9
  @pytest.fixture
10
  def config():
@@ -64,7 +65,7 @@ async def test_summarize_chunk(mock_agent_class, config: AppConfig, sample_chunk
64
 
65
  # Verify result
66
  assert isinstance(result, str)
67
- assert result == mock_run_result.data
68
 
69
  # Verify agent was called with correct prompt
70
  call_args = mock_agent.run.call_args[0][0]
@@ -110,10 +111,12 @@ async def test_process_and_store_chunk_with_embedding(
110
 
111
  # Process the chunk
112
  await summarizer.process_and_store_chunk(sample_chunks[0])
 
 
113
 
114
  # Verify ollama.embed was called with correct parameters
115
  mock_embedding_generator.assert_called_once_with(
116
- mock_run_result.data,
117
  config.embedding,
118
  )
119
 
@@ -124,7 +127,7 @@ async def test_process_and_store_chunk_with_embedding(
124
  kwargs = add_call[1]
125
  assert len(kwargs['embeddings']) == 3
126
  assert kwargs['embeddings'] == mock_embedding
127
- assert kwargs['documents'][0] == mock_run_result.data
128
  assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
129
 
130
  # Verify metadata
 
5
  from knowlang.summarizer.summarizer import CodeSummarizer
6
  from knowlang.core.types import CodeChunk, ChunkType
7
  from knowlang.configs.config import AppConfig
8
+ from knowlang.utils.chunking_util import format_code_summary
9
 
10
  @pytest.fixture
11
  def config():
 
65
 
66
  # Verify result
67
  assert isinstance(result, str)
68
+ assert result == format_code_summary(sample_chunks[0].content, mock_run_result.data)
69
 
70
  # Verify agent was called with correct prompt
71
  call_args = mock_agent.run.call_args[0][0]
 
111
 
112
  # Process the chunk
113
  await summarizer.process_and_store_chunk(sample_chunks[0])
114
+
115
+ code_summary = format_code_summary(sample_chunks[0].content, mock_run_result.data)
116
 
117
  # Verify ollama.embed was called with correct parameters
118
  mock_embedding_generator.assert_called_once_with(
119
+ code_summary,
120
  config.embedding,
121
  )
122
 
 
127
  kwargs = add_call[1]
128
  assert len(kwargs['embeddings']) == 3
129
  assert kwargs['embeddings'] == mock_embedding
130
+ assert kwargs['documents'][0] == code_summary
131
  assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
132
 
133
  # Verify metadata