gabykim committed on
Commit
103c97e
·
1 Parent(s): d9d9220

code summarizer agent implemented

Browse files
.env.example ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ LLM__MODEL_NAME=llama3.2
2
+ LLM__MODEL_PROVIDER=ollama
3
+ LLM__API_KEY=your_api_key
4
+ DB__PERSIST_DIRECTORY=./my_chroma_db
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  .venv
2
  __pycache__
3
- .pytest_cache
 
 
 
1
  .venv
2
  __pycache__
3
+ .pytest_cache
4
+ .env*
5
+ !.env.example
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -15,7 +15,9 @@ dependencies = [
15
  "pydantic-ai (>=0.0.20,<0.0.21)",
16
  "gitpython (>=3.1.44,<4.0.0)",
17
  "tree-sitter (>=0.24.0,<0.25.0)",
18
- "tree-sitter-python (>=0.23.6,<0.24.0)"
 
 
19
  ]
20
 
21
  [tool.poetry]
 
15
  "pydantic-ai (>=0.0.20,<0.0.21)",
16
  "gitpython (>=3.1.44,<4.0.0)",
17
  "tree-sitter (>=0.24.0,<0.25.0)",
18
+ "tree-sitter-python (>=0.23.6,<0.24.0)",
19
+ "pydantic-settings (>=2.7.1,<3.0.0)",
20
+ "chromadb (>=0.6.3,<0.7.0)"
21
  ]
22
 
23
  [tool.poetry]
src/know_lang_bot/__main__.py CHANGED
@@ -1,8 +1,18 @@
1
  from know_lang_bot.code_parser.parser import CodeParser
 
 
 
2
 
3
- # Usage example:
4
  if __name__ == "__main__":
5
- parser = CodeParser(".")
6
- chunks = parser.parse_repository()
7
- for chunk in chunks:
8
- print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")
 
 
 
 
 
 
 
 
1
  from know_lang_bot.code_parser.parser import CodeParser
2
+ from know_lang_bot.config import AppConfig
3
+ from know_lang_bot.code_parser.summarizer import CodeSummarizer
4
+ import asyncio
5
 
6
+ # Usage example
7
  if __name__ == "__main__":
8
+ async def main():
9
+ config = AppConfig() # Will load from .env file if available
10
+ summarizer = CodeSummarizer(config)
11
+
12
+ # Example usage with your parser
13
+ parser = CodeParser(".")
14
+ chunks = parser.parse_repository()
15
+
16
+ await summarizer.process_chunks(chunks)
17
+
18
+ asyncio.run(main())
src/know_lang_bot/code_parser/summarizer.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import asyncio
from pprint import pformat
from typing import List

import chromadb
from chromadb.errors import InvalidCollectionException
from pydantic import BaseModel, Field
from pydantic_ai import Agent

from know_lang_bot.code_parser.parser import CodeChunk
from know_lang_bot.config import AppConfig
from know_lang_bot.utils.fancy_log import FancyLogger
11
+
12
+ LOG = FancyLogger(__name__)
13
+
14
+ class ChunkMetadata(BaseModel):
15
+ """Model for chunk metadata stored in ChromaDB"""
16
+ file_path: str
17
+ start_line: int
18
+ end_line: int
19
+ type: str
20
+ name: str
21
+ docstring: str = Field(default='')
22
+
23
+ class CodeSummarizer:
24
+ def __init__(self, config: AppConfig):
25
+ self.config = config
26
+ self._init_agent()
27
+ self._init_db()
28
+
29
+ def _init_agent(self):
30
+ """Initialize the LLM agent with configuration"""
31
+ system_prompt = """
32
+ You are an expert code analyzer. Your task is to analyze code chunks and provide clear,
33
+ concise summaries. Focus on:
34
+ 1. The main purpose and functionality
35
+ 2. Key implementation details
36
+ 3. Important patterns or techniques used
37
+ 4. Any notable dependencies or requirements
38
+
39
+ Provide a concise summary and list key points separately.
40
+ """
41
+
42
+ self.agent = Agent(
43
+ f"{self.config.llm.model_provider}:{self.config.llm.model_name}",
44
+ system_prompt=system_prompt,
45
+ model_settings=self.config.llm.model_settings
46
+ )
47
+
48
+ def _init_db(self):
49
+ """Initialize ChromaDB with configuration"""
50
+ self.db_client = chromadb.PersistentClient(
51
+ path=str(self.config.db.persist_directory)
52
+ )
53
+
54
+ try:
55
+ self.collection = self.db_client.get_collection(
56
+ name=self.config.db.collection_name
57
+ )
58
+ except InvalidCollectionException:
59
+ LOG.debug(f"Collection {self.config.db.collection_name} not found, creating new collection")
60
+ self.collection = self.db_client.create_collection(
61
+ name=self.config.db.collection_name,
62
+ metadata={"hnsw:space": "cosine"}
63
+ )
64
+
65
+ async def summarize_chunk(self, chunk: CodeChunk) -> str:
66
+ """Summarize a single code chunk using the LLM"""
67
+ prompt = f"""
68
+ Analyze this {chunk.type.value} code chunk:
69
+
70
+ {chunk.content}
71
+
72
+ {f'Docstring: {chunk.docstring}' if chunk.docstring else ''}
73
+
74
+ Provide a concise summary and key points about its functionality and implementation.
75
+ """
76
+
77
+ result = await self.agent.run(prompt)
78
+ LOG.debug(f"Summary for chunk {chunk.file_path}:{chunk.start_line}-{chunk.end_line}:\n{pformat(result.data)}")
79
+
80
+ return result.data
81
+
82
+ async def process_and_store_chunk(self, chunk: CodeChunk):
83
+ """Process a chunk and store it in ChromaDB"""
84
+ summary = await self.summarize_chunk(chunk)
85
+
86
+ # Create a unique ID for the chunk
87
+ chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
88
+
89
+ # Create metadata using Pydantic model
90
+ metadata = ChunkMetadata(
91
+ file_path=chunk.file_path,
92
+ start_line=chunk.start_line,
93
+ end_line=chunk.end_line,
94
+ type=chunk.type.value,
95
+ name=chunk.name,
96
+ docstring=chunk.docstring if chunk.docstring else ''
97
+ )
98
+
99
+ # Store in ChromaDB
100
+ self.collection.add(
101
+ documents=[summary],
102
+ metadatas=[metadata.model_dump()],
103
+ ids=[chunk_id]
104
+ )
105
+
106
+ async def process_chunks(self, chunks: List[CodeChunk]):
107
+ """Process multiple chunks in parallel"""
108
+ for chunk in chunks:
109
+ await self.process_and_store_chunk(chunk)
src/know_lang_bot/config.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Dict, Any
2
+ from pydantic_settings import BaseSettings, SettingsConfigDict
3
+ from pydantic import Field
4
+ from pathlib import Path
5
+
6
+ class LLMConfig(BaseSettings):
7
+ model_name: str = Field(
8
+ default="llama3.2",
9
+ description="Name of the LLM model to use"
10
+ )
11
+ model_provider: str = Field(
12
+ default="ollama",
13
+ description="Model provider (anthropic, openai, ollama, etc)"
14
+ )
15
+ api_key: Optional[str] = Field(
16
+ default=None,
17
+ description="API key for the model provider"
18
+ )
19
+ model_settings: Dict[str, Any] = Field(
20
+ default_factory=dict,
21
+ description="Additional model settings"
22
+ )
23
+
24
+ class DBConfig(BaseSettings):
25
+ persist_directory: Path = Field(
26
+ default=Path("./chroma_db"),
27
+ description="Directory to store ChromaDB files"
28
+ )
29
+ collection_name: str = Field(
30
+ default="code_chunks",
31
+ description="Name of the ChromaDB collection"
32
+ )
33
+ embedding_model: str = Field(
34
+ default="sentence-transformers/all-mpnet-base-v2",
35
+ description="Embedding model to use"
36
+ )
37
+
38
+ class AppConfig(BaseSettings):
39
+ model_config = SettingsConfigDict(
40
+ env_file='.env',
41
+ env_file_encoding='utf-8',
42
+ env_nested_delimiter='__'
43
+ )
44
+
45
+ llm: LLMConfig = Field(default_factory=LLMConfig)
46
+ db: DBConfig = Field(default_factory=DBConfig)
47
+ chunk_max_size: int = Field(
48
+ default=1500,
49
+ description="Maximum size of code chunks before splitting"
50
+ )