Spaces:

gabykim
/

KnowLang_Transformers_Demo

Sleeping

App Files Files Community

KnowLang_Transformers_Demo / src /knowlang /summarizer /summarizer.py

gabykim

separate codebase dir from file absolute path

667a527 5 months ago

raw

history blame contribute delete

4.93 kB

	from pathlib import Path
	from typing import List
	import chromadb
	from chromadb.errors import InvalidCollectionException
	from pydantic_ai import Agent
	from pydantic import BaseModel, Field
	from pprint import pformat
	from rich.progress import Progress

	from knowlang.configs.config import AppConfig
	from knowlang.core.types import CodeChunk
	from knowlang.utils.chunking_util import format_code_summary
	from knowlang.utils.fancy_log import FancyLogger
	from knowlang.utils.model_provider import create_pydantic_model
	from knowlang.models.embeddings import generate_embedding

	# Module-level logger, named after this module; used by CodeSummarizer for debug output.
	LOG = FancyLogger(__name__)

	class ChunkMetadata(BaseModel):
	    """Schema of the metadata record attached to each chunk stored in ChromaDB."""

	    # Where the chunk lives: source file path and its line span.
	    file_path: str
	    start_line: int
	    end_line: int

	    # What the chunk is: the kind of code construct and its identifier.
	    type: str
	    name: str

	    # The chunk's docstring; defaults to an empty string when not supplied.
	    docstring: str = Field(default="")

	class CodeSummarizer:
	    """Summarize code chunks with an LLM agent and index the summaries in ChromaDB.

	    Construction wires up two collaborators from the supplied ``AppConfig``:
	    a ``pydantic_ai`` ``Agent`` (``self.agent``) and a persistent ChromaDB
	    collection (``self.collection``).
	    """

	    def __init__(self, config: AppConfig):
	        """Store ``config`` and initialize the LLM agent and the vector store.

	        Args:
	            config: Application configuration; the ``llm``, ``db`` and
	                ``embedding`` sections are read by this class.
	        """
	        self.config = config
	        self._init_agent()
	        self._init_db()

	    def _init_agent(self):
	        """Build ``self.agent`` from the ``llm`` section of the configuration."""
	        system_prompt = """
	You are an expert code analyzer specializing in creating searchable and contextual code summaries.
	Your summaries will be used in a RAG system to help developers understand complex codebases.
	Focus on following points:
	1. The main purpose and functionality
	- Use precise technical terms
	- Preserve class/function/variable names exactly
	- State the primary purpose
	2. Narrow down key implementation details
	- Focus on key algorithms, patterns, or design choices
	- Highlight important method signatures and interfaces
	3. Any notable dependencies or requirements
	- Reference related classes/functions by exact name
	- List external dependencies
	- Note any inherited or implemented interfaces

	Provide a clean, concise and focused summary. Don't include unnecessary nor generic details.
	"""

	        llm_config = self.config.llm
	        self.agent = Agent(
	            create_pydantic_model(
	                model_provider=llm_config.model_provider,
	                model_name=llm_config.model_name,
	            ),
	            system_prompt=system_prompt,
	            model_settings=llm_config.model_settings,
	        )

	    def _init_db(self):
	        """Open the persistent ChromaDB client and bind ``self.collection``.

	        The configured collection is reused when it already exists; otherwise
	        it is created with cosine distance as its HNSW space.
	        """
	        self.db_client = chromadb.PersistentClient(
	            path=str(self.config.db.persist_directory)
	        )

	        collection_name = self.config.db.collection_name
	        try:
	            self.collection = self.db_client.get_collection(name=collection_name)
	        except InvalidCollectionException:
	            LOG.debug(f"Collection {collection_name} not found, creating new collection")
	            self.collection = self.db_client.create_collection(
	                name=collection_name,
	                metadata={"hnsw:space": "cosine"},
	            )

	    async def summarize_chunk(self, chunk: CodeChunk) -> str:
	        """Ask the LLM agent for a summary of a single code chunk.

	        Args:
	            chunk: Code chunk to summarize; its type, content and (when
	                present) docstring are embedded in the prompt.

	        Returns:
	            The result of ``format_code_summary`` applied to the chunk content
	            and the agent's output.
	        """
	        prompt = f"""
	Analyze this {chunk.type.value} code chunk:

	{chunk.content}

	{f'Docstring: {chunk.docstring}' if chunk.docstring else ''}

	Provide a concise summary.
	"""

	        result = await self.agent.run(prompt)
	        LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")

	        return format_code_summary(chunk.content, result.data)

	    async def process_and_store_chunk(self, chunk: CodeChunk):
	        """Summarize ``chunk``, embed the summary and write it to ChromaDB.

	        The record ID is ``<relative path>:<start_line>-<end_line>``, where the
	        path is taken relative to ``config.db.codebase_directory``.

	        Args:
	            chunk: Code chunk to summarize and index.

	        Raises:
	            ValueError: If ``chunk.file_path`` is not located under
	                ``config.db.codebase_directory`` (from ``Path.relative_to``).
	        """
	        summary = await self.summarize_chunk(chunk)

	        # IDs and stored paths are relative to the codebase directory, in POSIX form.
	        relative_path = Path(chunk.file_path).relative_to(
	            self.config.db.codebase_directory
	        ).as_posix()
	        chunk_id = f"{relative_path}:{chunk.start_line}-{chunk.end_line}"

	        # Build the metadata through the Pydantic model.
	        metadata = ChunkMetadata(
	            file_path=relative_path,
	            start_line=chunk.start_line,
	            end_line=chunk.end_line,
	            type=chunk.type.value,
	            name=chunk.name,
	            docstring=chunk.docstring or '',
	        )

	        # The embedding is computed from the summary text.
	        embedding = generate_embedding(summary, self.config.embedding)

	        # Upsert rather than add: chunk IDs are deterministic, so re-processing a
	        # codebase must replace the existing record instead of hitting an
	        # already-present ID and leaving the stale summary in place.
	        self.collection.upsert(
	            documents=[summary],
	            embeddings=embedding,
	            metadatas=[metadata.model_dump()],
	            ids=[chunk_id],
	        )

	    async def process_chunks(self, chunks: List[CodeChunk]):
	        """Summarize and store ``chunks`` one after another with a progress bar.

	        Chunks are awaited sequentially, in the order given; the progress bar
	        advances after each chunk has been stored.

	        Args:
	            chunks: Code chunks to process.
	        """
	        with Progress() as progress:
	            task = progress.add_task("Summarizing chunks into vector database...", total=len(chunks))

	            for chunk in chunks:
	                await self.process_and_store_chunk(chunk)
	                progress.advance(task)