Spaces:
Sleeping
Sleeping
File size: 4,983 Bytes
103c97e 369993f d1597d8 103c97e 070f7e7 103c97e 69bc940 e288cae 69bc940 e288cae 69bc940 e288cae 103c97e 69bc940 e288cae 103c97e 369993f 070f7e7 369993f 070f7e7 369993f 410a99b 369993f 070f7e7 369993f 103c97e e288cae 103c97e 369993f 103c97e 410a99b 103c97e d1597d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
from typing import List
import chromadb
from chromadb.errors import InvalidCollectionException
from pydantic_ai import Agent
from pydantic import BaseModel, Field
import ollama
from pprint import pformat
from rich.progress import Progress
from know_lang_bot.config import AppConfig
from know_lang_bot.core.types import CodeChunk, ModelProvider
from know_lang_bot.utils.fancy_log import FancyLogger
LOG = FancyLogger(__name__)
class ChunkMetadata(BaseModel):
    """Metadata record stored next to each chunk summary in ChromaDB."""

    # Path of the source file the chunk was taken from.
    file_path: str
    # First and last line of the chunk within that file.
    start_line: int
    end_line: int
    # Chunk kind and identifier, as reported by the chunk itself.
    type: str
    name: str
    # Docstring of the chunk; empty string when there is none.
    docstring: str = ""
class CodeSummarizer:
    """Summarize code chunks with an LLM and store the summaries in ChromaDB.

    Each chunk is summarized by the configured agent, the summary is embedded
    with the configured embedding provider, and summary, embedding and
    metadata are written to the configured ChromaDB collection.
    """

    def __init__(self, config: AppConfig):
        """Create the LLM agent and open the ChromaDB collection.

        Args:
            config: Application configuration; the ``llm``, ``db`` and
                ``embedding`` sections are read by this class.
        """
        self.config = config
        self._init_agent()
        self._init_db()

    def _init_agent(self) -> None:
        """Initialize the LLM agent with configuration."""
        # Assembled line by line so the prompt text does not pick up the
        # indentation of the surrounding code.
        system_prompt = (
            "\n"
            "You are an expert code analyzer specializing in creating searchable and contextual code summaries.\n"
            "Your summaries will be used in a RAG system to help developers understand complex codebases.\n"
            "Focus on following points:\n"
            "1. The main purpose and functionality\n"
            "- Use precise technical terms\n"
            "- Preserve class/function/variable names exactly\n"
            "- State the primary purpose\n"
            "2. Narrow down key implementation details\n"
            "- Focus on key algorithms, patterns, or design choices\n"
            "- Highlight important method signatures and interfaces\n"
            "3. Any notable dependencies or requirements\n"
            "- Reference related classes/functions by exact name\n"
            "- List external dependencies\n"
            "- Note any inherited or implemented interfaces\n"
            "Provide a clean, concise and focused summary. Don't include unnecessary nor generic details.\n"
        )
        self.agent = Agent(
            f"{self.config.llm.model_provider}:{self.config.llm.model_name}",
            system_prompt=system_prompt,
            model_settings=self.config.llm.model_settings,
        )

    def _init_db(self) -> None:
        """Initialize ChromaDB with configuration.

        Opens the persistent client at ``config.db.persist_directory`` and
        fetches the configured collection, creating it with cosine distance
        when it does not exist yet.
        """
        self.db_client = chromadb.PersistentClient(
            path=str(self.config.db.persist_directory)
        )
        collection_name = self.config.db.collection_name
        try:
            self.collection = self.db_client.get_collection(name=collection_name)
        except InvalidCollectionException:
            LOG.debug(f"Collection {collection_name} not found, creating new collection")
            self.collection = self.db_client.create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"},
            )

    def _get_embedding(self, text: str) -> List[float]:
        """Get the embedding vector for ``text`` using the configured provider.

        Args:
            text: The text to embed.

        Returns:
            A single embedding vector for ``text``.

        Raises:
            ValueError: If the configured embedding provider is not supported,
                or if the provider returned no embedding.
        """
        if self.config.embedding.provider != ModelProvider.OLLAMA:
            raise ValueError(f"Unsupported embedding provider: {self.config.embedding.provider}")

        response = ollama.embed(
            model=self.config.embedding.model_name,
            input=text,
        )
        # `embeddings` is a list with one vector per input; a single string
        # was sent, so unwrap the one vector to match the declared return type.
        embeddings = response['embeddings']
        if not embeddings:
            raise ValueError(
                f"Embedding model {self.config.embedding.model_name} returned no embeddings"
            )
        return embeddings[0]

    async def summarize_chunk(self, chunk: CodeChunk) -> str:
        """Summarize a single code chunk using the LLM.

        Args:
            chunk: The code chunk to summarize.

        Returns:
            The summary produced by the agent.
        """
        # The docstring line is left empty when the chunk has no docstring.
        docstring_line = f"Docstring: {chunk.docstring}" if chunk.docstring else ""
        prompt = (
            f"\nAnalyze this {chunk.type.value} code chunk:\n"
            f"{chunk.content}\n"
            f"{docstring_line}\n"
            "Provide a concise summary.\n"
        )
        result = await self.agent.run(prompt)
        LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
        return result.data

    async def process_and_store_chunk(self, chunk: CodeChunk) -> None:
        """Summarize a chunk, embed the summary and store it in ChromaDB.

        Args:
            chunk: The code chunk to process.
        """
        summary = await self.summarize_chunk(chunk)

        # The ID is derived from the chunk's location, so processing the same
        # location again produces the same ID.
        chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"

        metadata = ChunkMetadata(
            file_path=chunk.file_path,
            start_line=chunk.start_line,
            end_line=chunk.end_line,
            type=chunk.type.value,
            name=chunk.name,
            docstring=chunk.docstring or '',
        )

        embedding = self._get_embedding(summary)

        # upsert rather than add: when the ID already exists the stored
        # summary is replaced instead of the new one being dropped.
        self.collection.upsert(
            documents=[summary],
            embeddings=[embedding],
            metadatas=[metadata.model_dump()],
            ids=[chunk_id],
        )

    async def process_chunks(self, chunks: List[CodeChunk]) -> None:
        """Process chunks one after another, reporting progress.

        Chunks are awaited sequentially, not concurrently.

        Args:
            chunks: The code chunks to summarize and store.
        """
        with Progress() as progress:
            task = progress.add_task("Summarizing chunks into vector database...", total=len(chunks))
            for chunk in chunks:
                await self.process_and_store_chunk(chunk)
                progress.advance(task)