code summarizer agent implemented
Browse files- .env.example +4 -0
- .gitignore +3 -1
- poetry.lock +0 -0
- pyproject.toml +3 -1
- src/know_lang_bot/__main__.py +15 -5
- src/know_lang_bot/code_parser/summarizer.py +109 -0
- src/know_lang_bot/config.py +50 -0
.env.example
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LLM__MODEL_NAME=llama3.2
|
2 |
+
LLM__MODEL_PROVIDER=ollama
|
3 |
+
LLM__API_KEY=your_api_key
|
4 |
+
DB__PERSIST_DIRECTORY=./my_chroma_db
|
.gitignore
CHANGED
@@ -1,3 +1,5 @@
|
|
1 |
.venv
|
2 |
__pycache__
|
3 |
-
.pytest_cache
|
|
|
|
|
|
1 |
.venv
|
2 |
__pycache__
|
3 |
+
.pytest_cache
|
4 |
+
.env*
|
5 |
+
!.env.example
|
poetry.lock
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
CHANGED
@@ -15,7 +15,9 @@ dependencies = [
|
|
15 |
"pydantic-ai (>=0.0.20,<0.0.21)",
|
16 |
"gitpython (>=3.1.44,<4.0.0)",
|
17 |
"tree-sitter (>=0.24.0,<0.25.0)",
|
18 |
-
"tree-sitter-python (>=0.23.6,<0.24.0)"
|
|
|
|
|
19 |
]
|
20 |
|
21 |
[tool.poetry]
|
|
|
15 |
"pydantic-ai (>=0.0.20,<0.0.21)",
|
16 |
"gitpython (>=3.1.44,<4.0.0)",
|
17 |
"tree-sitter (>=0.24.0,<0.25.0)",
|
18 |
+
"tree-sitter-python (>=0.23.6,<0.24.0)",
|
19 |
+
"pydantic-settings (>=2.7.1,<3.0.0)",
|
20 |
+
"chromadb (>=0.6.3,<0.7.0)"
|
21 |
]
|
22 |
|
23 |
[tool.poetry]
|
src/know_lang_bot/__main__.py
CHANGED
@@ -1,8 +1,18 @@
|
|
1 |
from know_lang_bot.code_parser.parser import CodeParser
|
|
|
|
|
|
|
2 |
|
3 |
-
# Usage example
|
4 |
if __name__ == "__main__":
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from know_lang_bot.code_parser.parser import CodeParser
|
2 |
+
from know_lang_bot.config import AppConfig
|
3 |
+
from know_lang_bot.code_parser.summarizer import CodeSummarizer
|
4 |
+
import asyncio
|
5 |
|
6 |
+
# Usage example
|
7 |
if __name__ == "__main__":
|
8 |
+
async def main():
|
9 |
+
config = AppConfig() # Will load from .env file if available
|
10 |
+
summarizer = CodeSummarizer(config)
|
11 |
+
|
12 |
+
# Example usage with your parser
|
13 |
+
parser = CodeParser(".")
|
14 |
+
chunks = parser.parse_repository()
|
15 |
+
|
16 |
+
await summarizer.process_chunks(chunks)
|
17 |
+
|
18 |
+
asyncio.run(main())
|
src/know_lang_bot/code_parser/summarizer.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
import chromadb
|
3 |
+
from chromadb.errors import InvalidCollectionException
|
4 |
+
from pydantic_ai import Agent
|
5 |
+
from pydantic import BaseModel, Field
|
6 |
+
|
7 |
+
from know_lang_bot.config import AppConfig
|
8 |
+
from know_lang_bot.code_parser.parser import CodeChunk
|
9 |
+
from know_lang_bot.utils.fancy_log import FancyLogger
|
10 |
+
from pprint import pformat
|
11 |
+
|
12 |
+
LOG = FancyLogger(__name__)
|
13 |
+
|
14 |
class ChunkMetadata(BaseModel):
    """Model for chunk metadata stored in ChromaDB"""
    # Path of the source file the chunk was extracted from.
    file_path: str
    # 1-based line span of the chunk within file_path.
    start_line: int
    end_line: int
    # Chunk kind (string value of the parser's chunk-type enum).
    type: str
    # Identifier of the chunk (e.g. function or class name).
    name: str
    # Chunk docstring if present; empty string otherwise (ChromaDB metadata
    # values may not be None, so '' is used as the sentinel).
    docstring: str = Field(default='')
|
22 |
+
|
23 |
class CodeSummarizer:
    """Summarize code chunks with an LLM agent and persist the results in ChromaDB.

    Args:
        config: Application configuration providing the LLM model/provider and
            the ChromaDB persistence settings.
    """

    def __init__(self, config: AppConfig):
        self.config = config
        self._init_agent()
        self._init_db()

    def _init_agent(self):
        """Initialize the LLM agent with configuration"""
        system_prompt = """
        You are an expert code analyzer. Your task is to analyze code chunks and provide clear,
        concise summaries. Focus on:
        1. The main purpose and functionality
        2. Key implementation details
        3. Important patterns or techniques used
        4. Any notable dependencies or requirements

        Provide a concise summary and list key points separately.
        """

        self.agent = Agent(
            f"{self.config.llm.model_provider}:{self.config.llm.model_name}",
            system_prompt=system_prompt,
            model_settings=self.config.llm.model_settings
        )

    def _init_db(self):
        """Initialize ChromaDB with configuration"""
        self.db_client = chromadb.PersistentClient(
            path=str(self.config.db.persist_directory)
        )

        # get_or_create_collection expresses the intent directly and avoids
        # depending on the exact exception type raised by get_collection (which
        # has changed across chromadb releases). The metadata (HNSW distance
        # metric) is only applied when the collection is first created.
        self.collection = self.db_client.get_or_create_collection(
            name=self.config.db.collection_name,
            metadata={"hnsw:space": "cosine"}
        )

    async def summarize_chunk(self, chunk: CodeChunk) -> str:
        """Summarize a single code chunk using the LLM"""
        prompt = f"""
        Analyze this {chunk.type.value} code chunk:

        {chunk.content}

        {f'Docstring: {chunk.docstring}' if chunk.docstring else ''}

        Provide a concise summary and key points about its functionality and implementation.
        """

        result = await self.agent.run(prompt)
        LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")

        return result.data

    async def process_and_store_chunk(self, chunk: CodeChunk):
        """Process a chunk and store it in ChromaDB"""
        summary = await self.summarize_chunk(chunk)

        # Create a unique ID for the chunk
        chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"

        # Create metadata using Pydantic model
        metadata = ChunkMetadata(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            type=chunk.type.value,
            name=chunk.name,
            docstring=chunk.docstring if chunk.docstring else ''
        )

        # Store in ChromaDB
        self.collection.add(
            documents=[summary],
            metadatas=[metadata.model_dump()],
            ids=[chunk_id]
        )

    async def process_chunks(self, chunks: List[CodeChunk]):
        """Process chunks sequentially, storing each summary in ChromaDB.

        Chunks are processed one at a time (awaited serially, not in parallel)
        to avoid overwhelming the LLM backend. A failure on one chunk is logged
        and does not abort the rest of the batch.
        """
        for chunk in chunks:
            try:
                await self.process_and_store_chunk(chunk)
            except Exception:
                # NOTE(review): FancyLogger is assumed to expose the standard
                # logging API (exception) — confirm against utils.fancy_log.
                LOG.exception(
                    f"Failed to process chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
                )
|
src/know_lang_bot/config.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, Dict, Any
|
2 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
3 |
+
from pydantic import Field
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
class LLMConfig(BaseSettings):
    """Settings for the LLM backend: model identity, provider, and credentials."""

    # Combined with model_provider as "<provider>:<model_name>" when the agent
    # is constructed (see CodeSummarizer._init_agent).
    model_name: str = Field(
        default="llama3.2",
        description="Name of the LLM model to use"
    )
    model_provider: str = Field(
        default="ollama",
        description="Model provider (anthropic, openai, ollama, etc)"
    )
    # Optional: local providers such as ollama need no key.
    api_key: Optional[str] = Field(
        default=None,
        description="API key for the model provider"
    )
    # Passed through verbatim as pydantic-ai model_settings (temperature, etc.).
    model_settings: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional model settings"
    )
|
23 |
+
|
24 |
class DBConfig(BaseSettings):
    """Settings for the ChromaDB vector store."""

    # Location of the on-disk ChromaDB persistence directory.
    persist_directory: Path = Field(
        default=Path("./chroma_db"),
        description="Directory to store ChromaDB files"
    )
    collection_name: str = Field(
        default="code_chunks",
        description="Name of the ChromaDB collection"
    )
    # NOTE(review): declared but not referenced by the code visible in this
    # change — presumably consumed by a later embedding step; verify.
    embedding_model: str = Field(
        default="sentence-transformers/all-mpnet-base-v2",
        description="Embedding model to use"
    )
|
37 |
+
|
38 |
class AppConfig(BaseSettings):
    """Top-level application settings.

    Values are read from the environment and an optional .env file; nested
    fields use the double-underscore delimiter (e.g. LLM__MODEL_NAME,
    DB__PERSIST_DIRECTORY), matching .env.example.
    """

    model_config = SettingsConfigDict(
        env_file='.env',
        env_file_encoding='utf-8',
        env_nested_delimiter='__'
    )

    # Sub-configurations; each falls back to its own defaults when unset.
    llm: LLMConfig = Field(default_factory=LLMConfig)
    db: DBConfig = Field(default_factory=DBConfig)
    chunk_max_size: int = Field(
        default=1500,
        description="Maximum size of code chunks before splitting"
    )
|