Spaces:

gabykim
/

KnowLang_Transformers_Demo

Sleeping

App Files Community

gabykim committed on Feb 9

Commit

667a527

1 Parent(s): f6b1033

separate codebase dir from file absolute path

Browse files

Files changed (5) hide show

.env.example +0 -1
src/knowlang/chat_bot/chat_interface.py +6 -32
src/knowlang/configs/chat_config.py +0 -4
src/knowlang/summarizer/summarizer.py +5 -3
tests/test_summarizer.py +6 -5

.env.example CHANGED Viewed

@@ -38,7 +38,6 @@ CHAT__SIMILARITY_THRESHOLD=0.7
 CHAT__INTERFACE_TITLE='Code Repository Q&A Assistant'
 CHAT__INTERFACE_DESCRIPTION="Ask questions about the codebase and I'll help you understand it!"
 CHAT__MAX_LENGTH_PER_CHUNK=8000
-CHAT__CODE_PATH_PREFIX=''
 # Embedding Configuration
 # Settings for text embedding generation

 CHAT__INTERFACE_TITLE='Code Repository Q&A Assistant'
 CHAT__INTERFACE_DESCRIPTION="Ask questions about the codebase and I'll help you understand it!"
 CHAT__MAX_LENGTH_PER_CHUNK=8000
 # Embedding Configuration
 # Settings for text embedding generation

src/knowlang/chat_bot/chat_interface.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from dataclasses import dataclass
 import gradio as gr
-from knowlang.configs.chat_config import ChatConfig
 from knowlang.configs.config import AppConfig
 from knowlang.utils.fancy_log import FancyLogger
 from knowlang.utils.rate_limiter import RateLimiter
@@ -20,11 +19,9 @@ class CodeContext:
     start_line: int
     end_line: int
-    def to_title(self, config: ChatConfig) -> str:
         """Format code context as a title string"""
-        truncated_file_path = self.file_path[len(config.code_path_prefix):]
-        title = f"📄 {truncated_file_path} (lines {self.start_line}-{self.end_line})"
-        return title
     @classmethod
     def from_metadata(cls, metadata: Dict) -> "CodeContext":
@@ -39,7 +36,6 @@ class CodeQAChatInterface:
     def __init__(self, config: AppConfig):
         self.config = config
         self._init_chroma()
-        self.codebase_dir = Path(config.db.codebase_directory)
         self.rate_limiter = RateLimiter()
         self.chat_analytics = ChatAnalytics(config.chat_analytics)
@@ -51,33 +47,11 @@ class CodeQAChatInterface:
         self.collection = self.db_client.get_collection(
             name=self.config.db.collection_name
         )
-    def _get_code_block(self, file_path: str, start_line: int, end_line: int) -> str:
-        """Read the specified lines from a file and return as a code block"""
-        try:
-            full_path = self.codebase_dir / file_path[len(self.config.chat.code_path_prefix):]
-            print(f"Reading code block from {full_path}")
-            with open(full_path, 'r') as f:
-                lines = f.readlines()
-                code_lines = lines[start_line-1:end_line]
-                return ''.join(code_lines)
-        except Exception as e:
-            LOG.error(f"Error reading code block: {e}")
-            return "Error reading code"
-    def _format_code_block(self, metadata: Dict) -> str:
         """Format a single code block with metadata"""
         context = CodeContext.from_metadata(metadata)
-        code = self._get_code_block(
-            context.file_path,
-            context.start_line,
-            context.end_line
-        )
-        if not code:
-            return None
-        return f"<details><summary>{context.to_title(self.config.chat)}</summary>\n\n```python\n{code}\n```\n\n</details>"
     def _handle_feedback(self, like_data: gr.LikeData, history: List[ChatMessage], request: gr.Request):
          # Get the query and response pair
@@ -150,8 +124,8 @@ class CodeQAChatInterface:
             # Add code blocks before final answer if not added yet
             if not code_blocks_added and result.retrieved_context and result.retrieved_context.metadatas:
                 total_code_blocks = []
-                for metadata in result.retrieved_context.metadatas:
-                    code_block = self._format_code_block(metadata)
                     if code_block:
                         total_code_blocks.append(code_block)

 from dataclasses import dataclass
 import gradio as gr
 from knowlang.configs.config import AppConfig
 from knowlang.utils.fancy_log import FancyLogger
 from knowlang.utils.rate_limiter import RateLimiter
     start_line: int
     end_line: int
+    def to_title(self) -> str:
         """Format code context as a title string"""
+        return f"📄 {self.file_path} (lines {self.start_line}-{self.end_line})"
     @classmethod
     def from_metadata(cls, metadata: Dict) -> "CodeContext":
     def __init__(self, config: AppConfig):
         self.config = config
         self._init_chroma()
         self.rate_limiter = RateLimiter()
         self.chat_analytics = ChatAnalytics(config.chat_analytics)
         self.collection = self.db_client.get_collection(
             name=self.config.db.collection_name
         )
+    def _format_code_block(self, code : str,  metadata: Dict) -> str:
         """Format a single code block with metadata"""
         context = CodeContext.from_metadata(metadata)
+        return f"<details><summary>{context.to_title()}</summary>\n\n```python\n{code}\n```\n\n</details>"
     def _handle_feedback(self, like_data: gr.LikeData, history: List[ChatMessage], request: gr.Request):
          # Get the query and response pair
             # Add code blocks before final answer if not added yet
             if not code_blocks_added and result.retrieved_context and result.retrieved_context.metadatas:
                 total_code_blocks = []
+                for chunk, metadata in zip(result.retrieved_context.chunks, result.retrieved_context.metadatas):
+                    code_block = self._format_code_block(chunk, metadata)
                     if code_block:
                         total_code_blocks.append(code_block)

src/knowlang/configs/chat_config.py CHANGED Viewed

@@ -29,10 +29,6 @@ class ChatConfig(BaseSettings):
         default=8000,
         description="Maximum number of characters per chunk"
     )
-    code_path_prefix: str = Field(
-        default="",
-        description="Prefix of code paths in the chat interface"
-    )
 class AnalyticsProvider(str, Enum):
     MIXPANEL = "mixpanel"

         default=8000,
         description="Maximum number of characters per chunk"
     )
 class AnalyticsProvider(str, Enum):
     MIXPANEL = "mixpanel"

src/knowlang/summarizer/summarizer.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from typing import List
 import chromadb
 from chromadb.errors import InvalidCollectionException
@@ -99,11 +100,12 @@ Provide a clean, concise and focused summary. Don't include unnecessary nor gene
         summary = await self.summarize_chunk(chunk)
         # Create a unique ID for the chunk
-        chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
         # Create metadata using Pydantic model
-        metadata = ChunkMetadata(
-            file_path=chunk.file_path,
             start_line=chunk.start_line,
             end_line=chunk.end_line,
             type=chunk.type.value,

+from pathlib import Path
 from typing import List
 import chromadb
 from chromadb.errors import InvalidCollectionException
         summary = await self.summarize_chunk(chunk)
         # Create a unique ID for the chunk
+        relative_path = Path(chunk.file_path).relative_to(self.config.db.codebase_directory).as_posix()
+        chunk_id = f"{relative_path}:{chunk.start_line}-{chunk.end_line}"
         # Create metadata using Pydantic model
+        metadata = ChunkMetadata(
+            file_path=relative_path,
             start_line=chunk.start_line,
             end_line=chunk.end_line,
             type=chunk.type.value,

tests/test_summarizer.py CHANGED Viewed

@@ -17,7 +17,7 @@ def config():
         )
 @pytest.fixture
-def sample_chunks():
     """Create sample code chunks for testing"""
     return [
         CodeChunk(
@@ -25,7 +25,7 @@ def sample_chunks():
             content="def hello(): return 'world'",
             start_line=1,
             end_line=2,
-            file_path="test.py",
             name="hello",
             docstring="Says hello"
         ),
@@ -34,7 +34,7 @@ def sample_chunks():
             content="class TestClass:\n    def __init__(self):\n        pass",
             start_line=4,
             end_line=6,
-            file_path="test.py",
             name="TestClass",
             docstring="A test class"
         )
@@ -125,14 +125,15 @@ async def test_process_and_store_chunk_with_embedding(
     assert add_call is not None
     kwargs = add_call[1]
     assert len(kwargs['embeddings']) == 3
     assert kwargs['embeddings'] == mock_embedding
     assert kwargs['documents'][0] == code_summary
-    assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
     # Verify metadata
     metadata = kwargs['metadatas'][0]
-    assert metadata['file_path'] == sample_chunks[0].file_path
     assert metadata['start_line'] == sample_chunks[0].start_line
     assert metadata['end_line'] == sample_chunks[0].end_line
     assert metadata['type'] == sample_chunks[0].type.value

         )
 @pytest.fixture
+def sample_chunks(config: AppConfig):
     """Create sample code chunks for testing"""
     return [
         CodeChunk(
             content="def hello(): return 'world'",
             start_line=1,
             end_line=2,
+            file_path=str(config.db.codebase_directory / "test.py"),
             name="hello",
             docstring="Says hello"
         ),
             content="class TestClass:\n    def __init__(self):\n        pass",
             start_line=4,
             end_line=6,
+            file_path=str(config.db.codebase_directory / "test.py"),
             name="TestClass",
             docstring="A test class"
         )
     assert add_call is not None
     kwargs = add_call[1]
+    relative_path = Path(sample_chunks[0].file_path).relative_to(config.db.codebase_directory).as_posix()
     assert len(kwargs['embeddings']) == 3
     assert kwargs['embeddings'] == mock_embedding
     assert kwargs['documents'][0] == code_summary
+    assert kwargs['ids'][0] == f"{relative_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
     # Verify metadata
     metadata = kwargs['metadatas'][0]
+    assert metadata['file_path'] == relative_path, "File path must be relative"
     assert metadata['start_line'] == sample_chunks[0].start_line
     assert metadata['end_line'] == sample_chunks[0].end_line
     assert metadata['type'] == sample_chunks[0].type.value