from pathlib import Path
from pprint import pformat
from typing import List

import chromadb
from chromadb.errors import InvalidCollectionException
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from rich.progress import Progress

from knowlang.configs.config import AppConfig
from knowlang.core.types import CodeChunk
from knowlang.utils.chunking_util import format_code_summary
from knowlang.utils.fancy_log import FancyLogger
from knowlang.utils.model_provider import create_pydantic_model
from knowlang.models.embeddings import generate_embedding

LOG = FancyLogger(__name__)


class ChunkMetadata(BaseModel):
    """Model for chunk metadata stored in ChromaDB"""

    # Path of the source file, relative to the configured codebase directory
    # (as POSIX text) when built by CodeSummarizer.process_and_store_chunk.
    file_path: str
    # Line span of the chunk, copied from the CodeChunk.
    start_line: int
    end_line: int
    # String value of the chunk's type enum (``chunk.type.value``).
    type: str
    # Name of the chunk, copied from the CodeChunk.
    name: str
    # Docstring of the chunk; empty string when the chunk has none.
    docstring: str = Field(default='')


class CodeSummarizer:
    """Summarize code chunks with an LLM and store the results in ChromaDB."""

    def __init__(self, config: AppConfig):
        """Keep the configuration and set up the LLM agent and the collection.

        Args:
            config: Application configuration; the ``llm``, ``db`` and
                ``embedding`` sections are read by this class.
        """
        self.config = config
        self._init_agent()
        self._init_db()

    def _init_agent(self) -> None:
        """Initialize the LLM agent with configuration"""
        system_prompt = """
        You are an expert code analyzer specializing in creating searchable and contextual code summaries.
        Your summaries will be used in a RAG system to help developers understand complex codebases.

        Focus on following points:
        1. The main purpose and functionality
        - Use precise technical terms
        - Preserve class/function/variable names exactly
        - State the primary purpose
        2. Narrow down key implementation details
        - Focus on key algorithms, patterns, or design choices
        - Highlight important method signatures and interfaces
        3. Any notable dependencies or requirements
        - Reference related classes/functions by exact name
        - List external dependencies
        - Note any inherited or implemented interfaces

        Provide a clean, concise and focused summary. Don't include unnecessary nor generic details.
        """

        llm_config = self.config.llm
        model = create_pydantic_model(
            model_provider=llm_config.model_provider,
            model_name=llm_config.model_name,
        )
        self.agent = Agent(
            model,
            system_prompt=system_prompt,
            model_settings=llm_config.model_settings,
        )

    def _init_db(self) -> None:
        """Initialize ChromaDB with configuration

        Opens a persistent client at the configured directory and binds
        ``self.collection`` to the configured collection, creating it with
        cosine distance when the lookup raises InvalidCollectionException.
        """
        db_config = self.config.db
        self.db_client = chromadb.PersistentClient(
            path=str(db_config.persist_directory)
        )

        try:
            self.collection = self.db_client.get_collection(
                name=db_config.collection_name
            )
        except InvalidCollectionException:
            LOG.debug(f"Collection {self.config.db.collection_name} not found, creating new collection")
            self.collection = self.db_client.create_collection(
                name=db_config.collection_name,
                metadata={"hnsw:space": "cosine"},
            )

    async def summarize_chunk(self, chunk: CodeChunk) -> str:
        """Summarize a single code chunk using the LLM

        Args:
            chunk: The code chunk to summarize; its type, content and
                (optional) docstring are embedded in the prompt.

        Returns:
            The result of ``format_code_summary`` applied to the chunk content
            and the data returned by the agent.
        """
        prompt = f"""
        Analyze this {chunk.type.value} code chunk:

        {chunk.content}

        {f'Docstring: {chunk.docstring}' if chunk.docstring else ''}

        Provide a concise summary.
        """

        result = await self.agent.run(prompt)
        LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")

        return format_code_summary(chunk.content, result.data)

    async def process_and_store_chunk(self, chunk: CodeChunk) -> None:
        """Process a chunk and store it in ChromaDB

        The chunk is summarized, the summary is embedded, and the summary,
        embedding and metadata are written under the ID
        ``<relative path>:<start_line>-<end_line>``.

        Args:
            chunk: The code chunk to summarize and store.

        Raises:
            ValueError: If ``chunk.file_path`` is not located under the
                configured codebase directory (raised by ``Path.relative_to``).
        """
        summary = await self.summarize_chunk(chunk)

        # Create a unique ID for the chunk
        relative_path = (
            Path(chunk.file_path)
            .relative_to(self.config.db.codebase_directory)
            .as_posix()
        )
        chunk_id = f"{relative_path}:{chunk.start_line}-{chunk.end_line}"

        # Create metadata using Pydantic model
        metadata = ChunkMetadata(
            file_path=relative_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            type=chunk.type.value,
            name=chunk.name,
            docstring=chunk.docstring if chunk.docstring else '',
        )

        # Get embedding for the summary
        embedding = generate_embedding(summary, self.config.embedding)

        # Store in ChromaDB. The ID is derived only from path and line span, so
        # re-processing a codebase reuses existing IDs; upsert replaces the
        # stored entry with the summary that was just computed instead of
        # leaving a stale one behind, and still inserts when the ID is new.
        self.collection.upsert(
            documents=[summary],
            embeddings=embedding,
            metadatas=[metadata.model_dump()],
            ids=[chunk_id],
        )

    async def process_chunks(self, chunks: List[CodeChunk]) -> None:
        """Process multiple chunks one after another, showing a progress bar

        Each chunk is awaited to completion before the next one starts.

        Args:
            chunks: The code chunks to summarize and store.
        """
        with Progress() as progress:
            task = progress.add_task(
                "Summarizing chunks into vector database...",
                total=len(chunks),
            )

            for chunk in chunks:
                await self.process_and_store_chunk(chunk)
                progress.advance(task)