from pprint import pformat
from typing import List

import chromadb
import ollama
from chromadb.errors import InvalidCollectionException
from pydantic import BaseModel, Field
from pydantic_ai import Agent

from know_lang_bot.config import AppConfig
from know_lang_bot.core.types import CodeChunk, ModelProvider
from know_lang_bot.utils.fancy_log import FancyLogger

LOG = FancyLogger(__name__)


class ChunkMetadata(BaseModel):
    """Model for chunk metadata stored in ChromaDB"""
    file_path: str
    start_line: int
    end_line: int
    type: str
    name: str
    docstring: str = Field(default='')


class CodeSummarizer:
    def __init__(self, config: AppConfig):
        self.config = config
        self._init_agent()
        self._init_db()

    def _init_agent(self):
        """Initialize the LLM agent with configuration"""
        system_prompt = """
        You are an expert code analyzer. Your task is to analyze code chunks and provide clear,
        concise summaries. Focus on:
        1. The main purpose and functionality
        2. Key implementation details
        3. Important patterns or techniques used
        4. Any notable dependencies or requirements

        Provide a concise summary and list key points separately.
        """

        self.agent = Agent(
            f"{self.config.llm.model_provider}:{self.config.llm.model_name}",
            system_prompt=system_prompt,
            model_settings=self.config.llm.model_settings,
        )

    def _init_db(self):
        """Initialize ChromaDB with configuration"""
        self.db_client = chromadb.PersistentClient(
            path=str(self.config.db.persist_directory)
        )

        try:
            self.collection = self.db_client.get_collection(
                name=self.config.db.collection_name
            )
        except InvalidCollectionException:
            LOG.debug(f"Collection {self.config.db.collection_name} not found, creating new collection")
            self.collection = self.db_client.create_collection(
                name=self.config.db.collection_name,
                # Cosine distance suits similarity search over text embeddings
                metadata={"hnsw:space": "cosine"}
            )

    def _get_embedding(self, text: str) -> List[float]:
        """Get an embedding for the text using the configured provider"""
        if self.config.embedding.provider == ModelProvider.OLLAMA:
            response = ollama.embed(
                model=self.config.embedding.model_name,
                input=text
            )
            # ollama.embed returns a batch of embeddings; unwrap the single vector
            return response['embeddings'][0]
        else:
            raise ValueError(f"Unsupported embedding provider: {self.config.embedding.provider}")

    async def summarize_chunk(self, chunk: CodeChunk) -> str:
        """Summarize a single code chunk using the LLM"""
        prompt = f"""
        Analyze this {chunk.type.value} code chunk:

        {chunk.content}

        {f'Docstring: {chunk.docstring}' if chunk.docstring else ''}

        Provide a concise summary and key points about its functionality and implementation.
        """

        result = await self.agent.run(prompt)
        LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
        return result.data

    async def process_and_store_chunk(self, chunk: CodeChunk):
        """Process a chunk and store it in ChromaDB"""
        summary = await self.summarize_chunk(chunk)

        # Create a unique, reproducible ID for the chunk
        chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"

        # Create metadata using the Pydantic model
        metadata = ChunkMetadata(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            type=chunk.type.value,
            name=chunk.name,
            docstring=chunk.docstring if chunk.docstring else ''
        )

        # Get the embedding for the summary
        embedding = self._get_embedding(summary)

        # Store in ChromaDB; add() takes parallel lists, so wrap the single items
        self.collection.add(
            documents=[summary],
            embeddings=[embedding],
            metadatas=[metadata.model_dump()],
            ids=[chunk_id]
        )

    async def process_chunks(self, chunks: List[CodeChunk]):
        """Process chunks sequentially, awaiting each one before the next"""
        for chunk in chunks:
            await self.process_and_store_chunk(chunk)
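

# A minimal usage sketch, not part of the original module: it assumes
# AppConfig() can be constructed from defaults or environment settings and
# that `chunks` would normally come from this project's code parser. Both
# are illustrative assumptions, not the project's documented entry point.
if __name__ == "__main__":
    import asyncio

    async def main():
        config = AppConfig()  # assumption: default/env-driven configuration
        summarizer = CodeSummarizer(config)
        chunks: List[CodeChunk] = []  # assumption: filled by the code parser
        await summarizer.process_chunks(chunks)

    asyncio.run(main())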