gabykim committed on
Commit
667a527
·
1 Parent(s): f6b1033

Separate codebase dir from absolute file path

Browse files
.env.example CHANGED
@@ -38,7 +38,6 @@ CHAT__SIMILARITY_THRESHOLD=0.7
38
  CHAT__INTERFACE_TITLE='Code Repository Q&A Assistant'
39
  CHAT__INTERFACE_DESCRIPTION="Ask questions about the codebase and I'll help you understand it!"
40
  CHAT__MAX_LENGTH_PER_CHUNK=8000
41
- CHAT__CODE_PATH_PREFIX=''
42
 
43
  # Embedding Configuration
44
  # Settings for text embedding generation
 
38
  CHAT__INTERFACE_TITLE='Code Repository Q&A Assistant'
39
  CHAT__INTERFACE_DESCRIPTION="Ask questions about the codebase and I'll help you understand it!"
40
  CHAT__MAX_LENGTH_PER_CHUNK=8000
 
41
 
42
  # Embedding Configuration
43
  # Settings for text embedding generation
src/knowlang/chat_bot/chat_interface.py CHANGED
@@ -1,6 +1,5 @@
1
  from dataclasses import dataclass
2
  import gradio as gr
3
- from knowlang.configs.chat_config import ChatConfig
4
  from knowlang.configs.config import AppConfig
5
  from knowlang.utils.fancy_log import FancyLogger
6
  from knowlang.utils.rate_limiter import RateLimiter
@@ -20,11 +19,9 @@ class CodeContext:
20
  start_line: int
21
  end_line: int
22
 
23
- def to_title(self, config: ChatConfig) -> str:
24
  """Format code context as a title string"""
25
- truncated_file_path = self.file_path[len(config.code_path_prefix):]
26
- title = f"📄 {truncated_file_path} (lines {self.start_line}-{self.end_line})"
27
- return title
28
 
29
  @classmethod
30
  def from_metadata(cls, metadata: Dict) -> "CodeContext":
@@ -39,7 +36,6 @@ class CodeQAChatInterface:
39
  def __init__(self, config: AppConfig):
40
  self.config = config
41
  self._init_chroma()
42
- self.codebase_dir = Path(config.db.codebase_directory)
43
  self.rate_limiter = RateLimiter()
44
  self.chat_analytics = ChatAnalytics(config.chat_analytics)
45
 
@@ -51,33 +47,11 @@ class CodeQAChatInterface:
51
  self.collection = self.db_client.get_collection(
52
  name=self.config.db.collection_name
53
  )
54
-
55
- def _get_code_block(self, file_path: str, start_line: int, end_line: int) -> str:
56
- """Read the specified lines from a file and return as a code block"""
57
- try:
58
- full_path = self.codebase_dir / file_path[len(self.config.chat.code_path_prefix):]
59
-
60
- print(f"Reading code block from {full_path}")
61
- with open(full_path, 'r') as f:
62
- lines = f.readlines()
63
- code_lines = lines[start_line-1:end_line]
64
- return ''.join(code_lines)
65
- except Exception as e:
66
- LOG.error(f"Error reading code block: {e}")
67
- return "Error reading code"
68
-
69
- def _format_code_block(self, metadata: Dict) -> str:
70
  """Format a single code block with metadata"""
71
  context = CodeContext.from_metadata(metadata)
72
- code = self._get_code_block(
73
- context.file_path,
74
- context.start_line,
75
- context.end_line
76
- )
77
- if not code:
78
- return None
79
 
80
- return f"<details><summary>{context.to_title(self.config.chat)}</summary>\n\n```python\n{code}\n```\n\n</details>"
81
 
82
  def _handle_feedback(self, like_data: gr.LikeData, history: List[ChatMessage], request: gr.Request):
83
  # Get the query and response pair
@@ -150,8 +124,8 @@ class CodeQAChatInterface:
150
  # Add code blocks before final answer if not added yet
151
  if not code_blocks_added and result.retrieved_context and result.retrieved_context.metadatas:
152
  total_code_blocks = []
153
- for metadata in result.retrieved_context.metadatas:
154
- code_block = self._format_code_block(metadata)
155
  if code_block:
156
  total_code_blocks.append(code_block)
157
 
 
1
  from dataclasses import dataclass
2
  import gradio as gr
 
3
  from knowlang.configs.config import AppConfig
4
  from knowlang.utils.fancy_log import FancyLogger
5
  from knowlang.utils.rate_limiter import RateLimiter
 
19
  start_line: int
20
  end_line: int
21
 
22
+ def to_title(self) -> str:
23
  """Format code context as a title string"""
24
+ return f"📄 {self.file_path} (lines {self.start_line}-{self.end_line})"
 
 
25
 
26
  @classmethod
27
  def from_metadata(cls, metadata: Dict) -> "CodeContext":
 
36
  def __init__(self, config: AppConfig):
37
  self.config = config
38
  self._init_chroma()
 
39
  self.rate_limiter = RateLimiter()
40
  self.chat_analytics = ChatAnalytics(config.chat_analytics)
41
 
 
47
  self.collection = self.db_client.get_collection(
48
  name=self.config.db.collection_name
49
  )
50
+ def _format_code_block(self, code : str, metadata: Dict) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  """Format a single code block with metadata"""
52
  context = CodeContext.from_metadata(metadata)
 
 
 
 
 
 
 
53
 
54
+ return f"<details><summary>{context.to_title()}</summary>\n\n```python\n{code}\n```\n\n</details>"
55
 
56
  def _handle_feedback(self, like_data: gr.LikeData, history: List[ChatMessage], request: gr.Request):
57
  # Get the query and response pair
 
124
  # Add code blocks before final answer if not added yet
125
  if not code_blocks_added and result.retrieved_context and result.retrieved_context.metadatas:
126
  total_code_blocks = []
127
+ for chunk, metadata in zip(result.retrieved_context.chunks, result.retrieved_context.metadatas):
128
+ code_block = self._format_code_block(chunk, metadata)
129
  if code_block:
130
  total_code_blocks.append(code_block)
131
 
src/knowlang/configs/chat_config.py CHANGED
@@ -29,10 +29,6 @@ class ChatConfig(BaseSettings):
29
  default=8000,
30
  description="Maximum number of characters per chunk"
31
  )
32
- code_path_prefix: str = Field(
33
- default="",
34
- description="Prefix of code paths in the chat interface"
35
- )
36
 
37
  class AnalyticsProvider(str, Enum):
38
  MIXPANEL = "mixpanel"
 
29
  default=8000,
30
  description="Maximum number of characters per chunk"
31
  )
 
 
 
 
32
 
33
  class AnalyticsProvider(str, Enum):
34
  MIXPANEL = "mixpanel"
src/knowlang/summarizer/summarizer.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import List
2
  import chromadb
3
  from chromadb.errors import InvalidCollectionException
@@ -99,11 +100,12 @@ Provide a clean, concise and focused summary. Don't include unnecessary nor gene
99
  summary = await self.summarize_chunk(chunk)
100
 
101
  # Create a unique ID for the chunk
102
- chunk_id = f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
 
103
 
104
  # Create metadata using Pydantic model
105
- metadata = ChunkMetadata(
106
- file_path=chunk.file_path,
107
  start_line=chunk.start_line,
108
  end_line=chunk.end_line,
109
  type=chunk.type.value,
 
1
+ from pathlib import Path
2
  from typing import List
3
  import chromadb
4
  from chromadb.errors import InvalidCollectionException
 
100
  summary = await self.summarize_chunk(chunk)
101
 
102
  # Create a unique ID for the chunk
103
+ relative_path = Path(chunk.file_path).relative_to(self.config.db.codebase_directory).as_posix()
104
+ chunk_id = f"{relative_path}:{chunk.start_line}-{chunk.end_line}"
105
 
106
  # Create metadata using Pydantic model
107
+ metadata = ChunkMetadata(
108
+ file_path=relative_path,
109
  start_line=chunk.start_line,
110
  end_line=chunk.end_line,
111
  type=chunk.type.value,
tests/test_summarizer.py CHANGED
@@ -17,7 +17,7 @@ def config():
17
  )
18
 
19
  @pytest.fixture
20
- def sample_chunks():
21
  """Create sample code chunks for testing"""
22
  return [
23
  CodeChunk(
@@ -25,7 +25,7 @@ def sample_chunks():
25
  content="def hello(): return 'world'",
26
  start_line=1,
27
  end_line=2,
28
- file_path="test.py",
29
  name="hello",
30
  docstring="Says hello"
31
  ),
@@ -34,7 +34,7 @@ def sample_chunks():
34
  content="class TestClass:\n def __init__(self):\n pass",
35
  start_line=4,
36
  end_line=6,
37
- file_path="test.py",
38
  name="TestClass",
39
  docstring="A test class"
40
  )
@@ -125,14 +125,15 @@ async def test_process_and_store_chunk_with_embedding(
125
  assert add_call is not None
126
 
127
  kwargs = add_call[1]
 
128
  assert len(kwargs['embeddings']) == 3
129
  assert kwargs['embeddings'] == mock_embedding
130
  assert kwargs['documents'][0] == code_summary
131
- assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
132
 
133
  # Verify metadata
134
  metadata = kwargs['metadatas'][0]
135
- assert metadata['file_path'] == sample_chunks[0].file_path
136
  assert metadata['start_line'] == sample_chunks[0].start_line
137
  assert metadata['end_line'] == sample_chunks[0].end_line
138
  assert metadata['type'] == sample_chunks[0].type.value
 
17
  )
18
 
19
  @pytest.fixture
20
+ def sample_chunks(config: AppConfig):
21
  """Create sample code chunks for testing"""
22
  return [
23
  CodeChunk(
 
25
  content="def hello(): return 'world'",
26
  start_line=1,
27
  end_line=2,
28
+ file_path=str(config.db.codebase_directory / "test.py"),
29
  name="hello",
30
  docstring="Says hello"
31
  ),
 
34
  content="class TestClass:\n def __init__(self):\n pass",
35
  start_line=4,
36
  end_line=6,
37
+ file_path=str(config.db.codebase_directory / "test.py"),
38
  name="TestClass",
39
  docstring="A test class"
40
  )
 
125
  assert add_call is not None
126
 
127
  kwargs = add_call[1]
128
+ relative_path = Path(sample_chunks[0].file_path).relative_to(config.db.codebase_directory).as_posix()
129
  assert len(kwargs['embeddings']) == 3
130
  assert kwargs['embeddings'] == mock_embedding
131
  assert kwargs['documents'][0] == code_summary
132
+ assert kwargs['ids'][0] == f"{relative_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
133
 
134
  # Verify metadata
135
  metadata = kwargs['metadatas'][0]
136
+ assert metadata['file_path'] == relative_path, "File path must be relative"
137
  assert metadata['start_line'] == sample_chunks[0].start_line
138
  assert metadata['end_line'] == sample_chunks[0].end_line
139
  assert metadata['type'] == sample_chunks[0].type.value