File size: 4,806 Bytes
103c97e
 
 
 
 
d1597d8
 
103c97e
60532a1
3a5efa8
 
60532a1
 
 
103c97e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69bc940
 
 
e288cae
 
 
 
69bc940
e288cae
 
 
69bc940
e288cae
 
103c97e
69bc940
e288cae
103c97e
 
183e719
 
 
 
103c97e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e288cae
103c97e
 
 
 
 
3a5efa8
 
103c97e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369993f
3536fc0
369993f
103c97e
 
 
410a99b
103c97e
 
 
 
 
 
d1597d8
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from typing import List
import chromadb
from chromadb.errors import InvalidCollectionException
from pydantic_ai import Agent
from pydantic import BaseModel, Field
from pprint import pformat
from rich.progress import Progress

from knowlang.configs.config import AppConfig
from knowlang.core.types import CodeChunk
from knowlang.utils.chunking_util import format_code_summary
from knowlang.utils.fancy_log import FancyLogger
from knowlang.utils.model_provider import create_pydantic_model
from knowlang.models.embeddings import generate_embedding

LOG = FancyLogger(__name__)

class ChunkMetadata(BaseModel):
    """Schema of the per-chunk metadata record written alongside each summary in ChromaDB."""

    # Location of the chunk in the source tree.
    file_path: str
    start_line: int
    end_line: int

    # Kind of code construct and its identifier, as reported by the chunker.
    type: str
    name: str

    # Docstring text of the chunk; an empty string when none is supplied.
    docstring: str = ''

class CodeSummarizer:
    """Summarize code chunks with an LLM and index the summaries in ChromaDB.

    On construction the summarizer builds a pydantic-ai ``Agent`` from the
    LLM settings in ``config.llm`` and opens (or creates) the ChromaDB
    collection named in ``config.db``.
    """

    def __init__(self, config: AppConfig):
        """Store the configuration and set up the LLM agent and the database.

        Args:
            config: Application configuration; ``config.llm``, ``config.db``
                and ``config.embedding`` are read by this class.
        """
        self.config = config
        self._init_agent()
        self._init_db()

    def _init_agent(self):
        """Initialize the LLM agent with configuration"""
        # NOTE: the prompt text below is sent to the model verbatim.
        system_prompt = """
You are an expert code analyzer specializing in creating searchable and contextual code summaries. 
Your summaries will be used in a RAG system to help developers understand complex codebases.
Focus on following points:
1. The main purpose and functionality
- Use precise technical terms
- Preserve class/function/variable names exactly
- State the primary purpose
2. Narrow down key implementation details
- Focus on key algorithms, patterns, or design choices
- Highlight important method signatures and interfaces
3. Any notable dependencies or requirements
- Reference related classes/functions by exact name
- List external dependencies
- Note any inherited or implemented interfaces
        
Provide a clean, concise and focused summary. Don't include unnecessary nor generic details.
"""
        
        self.agent = Agent(
            create_pydantic_model(
                model_provider=self.config.llm.model_provider,
                model_name=self.config.llm.model_name
            ),
            system_prompt=system_prompt,
            model_settings=self.config.llm.model_settings
        )

    def _init_db(self):
        """Initialize ChromaDB with configuration

        Opens a persistent client at ``config.db.persist_directory`` and binds
        ``self.collection`` to the collection named ``config.db.collection_name``,
        creating it with a cosine distance space when it does not exist yet.
        """
        self.db_client = chromadb.PersistentClient(
            path=str(self.config.db.persist_directory)
        )
        
        try:
            self.collection = self.db_client.get_collection(
                name=self.config.db.collection_name
            )
        except InvalidCollectionException:
            # First run against this persist directory: create the collection.
            LOG.debug(f"Collection {self.config.db.collection_name} not found, creating new collection")
            self.collection = self.db_client.create_collection(
                name=self.config.db.collection_name,
                metadata={"hnsw:space": "cosine"}
            )

    async def summarize_chunk(self, chunk: CodeChunk) -> str:
        """Summarize a single code chunk using the LLM

        Args:
            chunk: The code chunk to summarize; its type, content and optional
                docstring are embedded in the prompt.

        Returns:
            The value of ``format_code_summary`` applied to the chunk content
            and the LLM output.
        """
        prompt = f"""
        Analyze this {chunk.type.value} code chunk:
        
        {chunk.content}
        
        {f'Docstring: {chunk.docstring}' if chunk.docstring else ''}
        
        Provide a concise summary.
        """
        
        result = await self.agent.run(prompt)
        LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")

        return format_code_summary(chunk.content, result.data)
    
    async def process_and_store_chunk(self, chunk: CodeChunk):
        """Process a chunk and store it in ChromaDB

        Summarizes the chunk, embeds the summary and writes summary, embedding
        and metadata to the collection under an ID derived from the chunk's
        file path and line range.

        Args:
            chunk: The code chunk to summarize and index.
        """
        summary = await self.summarize_chunk(chunk)
        
        # Create a unique ID for the chunk
        chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
        
        # Create metadata using Pydantic model
        metadata = ChunkMetadata(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            type=chunk.type.value,
            name=chunk.name,
            docstring=chunk.docstring if chunk.docstring else ''
        )
        
        # Get embedding for the summary
        # NOTE(review): passed to Chroma as returned; presumably a single
        # vector for a single string input - confirm against generate_embedding.
        embedding = generate_embedding(summary, self.config.embedding)
        
        # Store in ChromaDB.
        # upsert rather than add: the ID is deterministic, so re-indexing a
        # chunk at the same location reuses it, and add() does not overwrite a
        # record whose ID already exists - the stale summary would be kept.
        self.collection.upsert(
            documents=[summary],
            embeddings=embedding,
            metadatas=[metadata.model_dump()],
            ids=[chunk_id]
        )

    async def process_chunks(self, chunks: List[CodeChunk]):
        """Process multiple chunks one after another, showing a progress bar.

        Chunks are awaited sequentially in the given order; an exception raised
        while processing a chunk propagates and stops the remaining work.

        Args:
            chunks: The code chunks to summarize and store.
        """
        with Progress() as progress:
            task = progress.add_task("Summarizing chunks into vector database...", total=len(chunks))
            
            for chunk in chunks:
                await self.process_and_store_chunk(chunk)
                progress.advance(task)