gabykim committed on
Commit
103c97e
·
1 Parent(s): d9d9220

code summarizer agent implemented

Browse files
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ LLM__MODEL_NAME=llama3.2
2
+ LLM__MODEL_PROVIDER=ollama
3
+ LLM__API_KEY=your_api_key
4
+ DB__PERSIST_DIRECTORY=./my_chroma_db
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  .venv
2
  __pycache__
3
- .pytest_cache
 
 
 
1
  .venv
2
  __pycache__
3
+ .pytest_cache
4
+ .env*
5
+ !.env.example
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -15,7 +15,9 @@ dependencies = [
15
  "pydantic-ai (>=0.0.20,<0.0.21)",
16
  "gitpython (>=3.1.44,<4.0.0)",
17
  "tree-sitter (>=0.24.0,<0.25.0)",
18
- "tree-sitter-python (>=0.23.6,<0.24.0)"
 
 
19
  ]
20
 
21
  [tool.poetry]
 
15
  "pydantic-ai (>=0.0.20,<0.0.21)",
16
  "gitpython (>=3.1.44,<4.0.0)",
17
  "tree-sitter (>=0.24.0,<0.25.0)",
18
+ "tree-sitter-python (>=0.23.6,<0.24.0)",
19
+ "pydantic-settings (>=2.7.1,<3.0.0)",
20
+ "chromadb (>=0.6.3,<0.7.0)"
21
  ]
22
 
23
  [tool.poetry]
src/know_lang_bot/__main__.py CHANGED
@@ -1,8 +1,18 @@
1
  from know_lang_bot.code_parser.parser import CodeParser
 
 
 
2
 
3
- # Usage example:
4
  if __name__ == "__main__":
5
- parser = CodeParser(".")
6
- chunks = parser.parse_repository()
7
- for chunk in chunks:
8
- print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")
 
 
 
 
 
 
 
 
1
  from know_lang_bot.code_parser.parser import CodeParser
2
+ from know_lang_bot.config import AppConfig
3
+ from know_lang_bot.code_parser.summarizer import CodeSummarizer
4
+ import asyncio
5
 
6
+ # Usage example
7
  if __name__ == "__main__":
8
+ async def main():
9
+ config = AppConfig() # Will load from .env file if available
10
+ summarizer = CodeSummarizer(config)
11
+
12
+ # Example usage with your parser
13
+ parser = CodeParser(".")
14
+ chunks = parser.parse_repository()
15
+
16
+ await summarizer.process_chunks(chunks)
17
+
18
+ asyncio.run(main())
src/know_lang_bot/code_parser/summarizer.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import asyncio
from pprint import pformat
from typing import List

import chromadb
from chromadb.errors import InvalidCollectionException
from pydantic import BaseModel, Field
from pydantic_ai import Agent

from know_lang_bot.code_parser.parser import CodeChunk
from know_lang_bot.config import AppConfig
from know_lang_bot.utils.fancy_log import FancyLogger
11
+
12
+ LOG = FancyLogger(__name__)
13
+
14
+ class ChunkMetadata(BaseModel):
15
+ """Model for chunk metadata stored in ChromaDB"""
16
+ file_path: str
17
+ start_line: int
18
+ end_line: int
19
+ type: str
20
+ name: str
21
+ docstring: str = Field(default='')
22
+
23
+ class CodeSummarizer:
24
+ def __init__(self, config: AppConfig):
25
+ self.config = config
26
+ self._init_agent()
27
+ self._init_db()
28
+
29
+ def _init_agent(self):
30
+ """Initialize the LLM agent with configuration"""
31
+ system_prompt = """
32
+ You are an expert code analyzer. Your task is to analyze code chunks and provide clear,
33
+ concise summaries. Focus on:
34
+ 1. The main purpose and functionality
35
+ 2. Key implementation details
36
+ 3. Important patterns or techniques used
37
+ 4. Any notable dependencies or requirements
38
+
39
+ Provide a concise summary and list key points separately.
40
+ """
41
+
42
+ self.agent = Agent(
43
+ f"{self.config.llm.model_provider}:{self.config.llm.model_name}",
44
+ system_prompt=system_prompt,
45
+ model_settings=self.config.llm.model_settings
46
+ )
47
+
48
+ def _init_db(self):
49
+ """Initialize ChromaDB with configuration"""
50
+ self.db_client = chromadb.PersistentClient(
51
+ path=str(self.config.db.persist_directory)
52
+ )
53
+
54
+ try:
55
+ self.collection = self.db_client.get_collection(
56
+ name=self.config.db.collection_name
57
+ )
58
+ except InvalidCollectionException:
59
+ LOG.debug(f"Collection {self.config.db.collection_name} not found, creating new collection")
60
+ self.collection = self.db_client.create_collection(
61
+ name=self.config.db.collection_name,
62
+ metadata={"hnsw:space": "cosine"}
63
+ )
64
+
65
+ async def summarize_chunk(self, chunk: CodeChunk) -> str:
66
+ """Summarize a single code chunk using the LLM"""
67
+ prompt = f"""
68
+ Analyze this {chunk.type.value} code chunk:
69
+
70
+ {chunk.content}
71
+
72
+ {f'Docstring: {chunk.docstring}' if chunk.docstring else ''}
73
+
74
+ Provide a concise summary and key points about its functionality and implementation.
75
+ """
76
+
77
+ result = await self.agent.run(prompt)
78
+ LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
79
+
80
+ return result.data
81
+
82
+ async def process_and_store_chunk(self, chunk: CodeChunk):
83
+ """Process a chunk and store it in ChromaDB"""
84
+ summary = await self.summarize_chunk(chunk)
85
+
86
+ # Create a unique ID for the chunk
87
+ chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
88
+
89
+ # Create metadata using Pydantic model
90
+ metadata = ChunkMetadata(
91
+ file_path=chunk.file_path,
92
+ start_line=chunk.start_line,
93
+ end_line=chunk.end_line,
94
+ type=chunk.type.value,
95
+ name=chunk.name,
96
+ docstring=chunk.docstring if chunk.docstring else ''
97
+ )
98
+
99
+ # Store in ChromaDB
100
+ self.collection.add(
101
+ documents=[summary],
102
+ metadatas=[metadata.model_dump()],
103
+ ids=[chunk_id]
104
+ )
105
+
106
+ async def process_chunks(self, chunks: List[CodeChunk]):
107
+ """Process multiple chunks in parallel"""
108
+ for chunk in chunks:
109
+ await self.process_and_store_chunk(chunk)
src/know_lang_bot/config.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Dict, Any
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+ from pydantic import Field
4
+ from pathlib import Path
5
+
6
+ class LLMConfig(BaseSettings):
7
+ model_name: str = Field(
8
+ default="llama3.2",
9
+ description="Name of the LLM model to use"
10
+ )
11
+ model_provider: str = Field(
12
+ default="ollama",
13
+ description="Model provider (anthropic, openai, ollama, etc)"
14
+ )
15
+ api_key: Optional[str] = Field(
16
+ default=None,
17
+ description="API key for the model provider"
18
+ )
19
+ model_settings: Dict[str, Any] = Field(
20
+ default_factory=dict,
21
+ description="Additional model settings"
22
+ )
23
+
24
+ class DBConfig(BaseSettings):
25
+ persist_directory: Path = Field(
26
+ default=Path("./chroma_db"),
27
+ description="Directory to store ChromaDB files"
28
+ )
29
+ collection_name: str = Field(
30
+ default="code_chunks",
31
+ description="Name of the ChromaDB collection"
32
+ )
33
+ embedding_model: str = Field(
34
+ default="sentence-transformers/all-mpnet-base-v2",
35
+ description="Embedding model to use"
36
+ )
37
+
38
+ class AppConfig(BaseSettings):
39
+ model_config = SettingsConfigDict(
40
+ env_file='.env',
41
+ env_file_encoding='utf-8',
42
+ env_nested_delimiter='__'
43
+ )
44
+
45
+ llm: LLMConfig = Field(default_factory=LLMConfig)
46
+ db: DBConfig = Field(default_factory=DBConfig)
47
+ chunk_max_size: int = Field(
48
+ default=1500,
49
+ description="Maximum size of code chunks before splitting"
50
+ )