Spaces:
Sleeping
Sleeping
separate codebase dir from file absolute path
Browse files
.env.example
CHANGED
@@ -38,7 +38,6 @@ CHAT__SIMILARITY_THRESHOLD=0.7
|
|
38 |
CHAT__INTERFACE_TITLE='Code Repository Q&A Assistant'
|
39 |
CHAT__INTERFACE_DESCRIPTION="Ask questions about the codebase and I'll help you understand it!"
|
40 |
CHAT__MAX_LENGTH_PER_CHUNK=8000
|
41 |
-
CHAT__CODE_PATH_PREFIX=''
|
42 |
|
43 |
# Embedding Configuration
|
44 |
# Settings for text embedding generation
|
|
|
38 |
CHAT__INTERFACE_TITLE='Code Repository Q&A Assistant'
|
39 |
CHAT__INTERFACE_DESCRIPTION="Ask questions about the codebase and I'll help you understand it!"
|
40 |
CHAT__MAX_LENGTH_PER_CHUNK=8000
|
|
|
41 |
|
42 |
# Embedding Configuration
|
43 |
# Settings for text embedding generation
|
src/knowlang/chat_bot/chat_interface.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
from dataclasses import dataclass
|
2 |
import gradio as gr
|
3 |
-
from knowlang.configs.chat_config import ChatConfig
|
4 |
from knowlang.configs.config import AppConfig
|
5 |
from knowlang.utils.fancy_log import FancyLogger
|
6 |
from knowlang.utils.rate_limiter import RateLimiter
|
@@ -20,11 +19,9 @@ class CodeContext:
|
|
20 |
start_line: int
|
21 |
end_line: int
|
22 |
|
23 |
-
def to_title(self
|
24 |
"""Format code context as a title string"""
|
25 |
-
|
26 |
-
title = f"📄 {truncated_file_path} (lines {self.start_line}-{self.end_line})"
|
27 |
-
return title
|
28 |
|
29 |
@classmethod
|
30 |
def from_metadata(cls, metadata: Dict) -> "CodeContext":
|
@@ -39,7 +36,6 @@ class CodeQAChatInterface:
|
|
39 |
def __init__(self, config: AppConfig):
|
40 |
self.config = config
|
41 |
self._init_chroma()
|
42 |
-
self.codebase_dir = Path(config.db.codebase_directory)
|
43 |
self.rate_limiter = RateLimiter()
|
44 |
self.chat_analytics = ChatAnalytics(config.chat_analytics)
|
45 |
|
@@ -51,33 +47,11 @@ class CodeQAChatInterface:
|
|
51 |
self.collection = self.db_client.get_collection(
|
52 |
name=self.config.db.collection_name
|
53 |
)
|
54 |
-
|
55 |
-
def _get_code_block(self, file_path: str, start_line: int, end_line: int) -> str:
|
56 |
-
"""Read the specified lines from a file and return as a code block"""
|
57 |
-
try:
|
58 |
-
full_path = self.codebase_dir / file_path[len(self.config.chat.code_path_prefix):]
|
59 |
-
|
60 |
-
print(f"Reading code block from {full_path}")
|
61 |
-
with open(full_path, 'r') as f:
|
62 |
-
lines = f.readlines()
|
63 |
-
code_lines = lines[start_line-1:end_line]
|
64 |
-
return ''.join(code_lines)
|
65 |
-
except Exception as e:
|
66 |
-
LOG.error(f"Error reading code block: {e}")
|
67 |
-
return "Error reading code"
|
68 |
-
|
69 |
-
def _format_code_block(self, metadata: Dict) -> str:
|
70 |
"""Format a single code block with metadata"""
|
71 |
context = CodeContext.from_metadata(metadata)
|
72 |
-
code = self._get_code_block(
|
73 |
-
context.file_path,
|
74 |
-
context.start_line,
|
75 |
-
context.end_line
|
76 |
-
)
|
77 |
-
if not code:
|
78 |
-
return None
|
79 |
|
80 |
-
return f"<details><summary>{context.to_title(
|
81 |
|
82 |
def _handle_feedback(self, like_data: gr.LikeData, history: List[ChatMessage], request: gr.Request):
|
83 |
# Get the query and response pair
|
@@ -150,8 +124,8 @@ class CodeQAChatInterface:
|
|
150 |
# Add code blocks before final answer if not added yet
|
151 |
if not code_blocks_added and result.retrieved_context and result.retrieved_context.metadatas:
|
152 |
total_code_blocks = []
|
153 |
-
for metadata in result.retrieved_context.metadatas:
|
154 |
-
code_block = self._format_code_block(metadata)
|
155 |
if code_block:
|
156 |
total_code_blocks.append(code_block)
|
157 |
|
|
|
1 |
from dataclasses import dataclass
|
2 |
import gradio as gr
|
|
|
3 |
from knowlang.configs.config import AppConfig
|
4 |
from knowlang.utils.fancy_log import FancyLogger
|
5 |
from knowlang.utils.rate_limiter import RateLimiter
|
|
|
19 |
start_line: int
|
20 |
end_line: int
|
21 |
|
22 |
+
def to_title(self) -> str:
|
23 |
"""Format code context as a title string"""
|
24 |
+
return f"📄 {self.file_path} (lines {self.start_line}-{self.end_line})"
|
|
|
|
|
25 |
|
26 |
@classmethod
|
27 |
def from_metadata(cls, metadata: Dict) -> "CodeContext":
|
|
|
36 |
def __init__(self, config: AppConfig):
|
37 |
self.config = config
|
38 |
self._init_chroma()
|
|
|
39 |
self.rate_limiter = RateLimiter()
|
40 |
self.chat_analytics = ChatAnalytics(config.chat_analytics)
|
41 |
|
|
|
47 |
self.collection = self.db_client.get_collection(
|
48 |
name=self.config.db.collection_name
|
49 |
)
|
50 |
+
def _format_code_block(self, code : str, metadata: Dict) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
"""Format a single code block with metadata"""
|
52 |
context = CodeContext.from_metadata(metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
+
return f"<details><summary>{context.to_title()}</summary>\n\n```python\n{code}\n```\n\n</details>"
|
55 |
|
56 |
def _handle_feedback(self, like_data: gr.LikeData, history: List[ChatMessage], request: gr.Request):
|
57 |
# Get the query and response pair
|
|
|
124 |
# Add code blocks before final answer if not added yet
|
125 |
if not code_blocks_added and result.retrieved_context and result.retrieved_context.metadatas:
|
126 |
total_code_blocks = []
|
127 |
+
for chunk, metadata in zip(result.retrieved_context.chunks, result.retrieved_context.metadatas):
|
128 |
+
code_block = self._format_code_block(chunk, metadata)
|
129 |
if code_block:
|
130 |
total_code_blocks.append(code_block)
|
131 |
|
src/knowlang/configs/chat_config.py
CHANGED
@@ -29,10 +29,6 @@ class ChatConfig(BaseSettings):
|
|
29 |
default=8000,
|
30 |
description="Maximum number of characters per chunk"
|
31 |
)
|
32 |
-
code_path_prefix: str = Field(
|
33 |
-
default="",
|
34 |
-
description="Prefix of code paths in the chat interface"
|
35 |
-
)
|
36 |
|
37 |
class AnalyticsProvider(str, Enum):
|
38 |
MIXPANEL = "mixpanel"
|
|
|
29 |
default=8000,
|
30 |
description="Maximum number of characters per chunk"
|
31 |
)
|
|
|
|
|
|
|
|
|
32 |
|
33 |
class AnalyticsProvider(str, Enum):
|
34 |
MIXPANEL = "mixpanel"
|
src/knowlang/summarizer/summarizer.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
from typing import List
|
2 |
import chromadb
|
3 |
from chromadb.errors import InvalidCollectionException
|
@@ -99,11 +100,12 @@ Provide a clean, concise and focused summary. Don't include unnecessary nor gene
|
|
99 |
summary = await self.summarize_chunk(chunk)
|
100 |
|
101 |
# Create a unique ID for the chunk
|
102 |
-
|
|
|
103 |
|
104 |
# Create metadata using Pydantic model
|
105 |
-
metadata = ChunkMetadata(
|
106 |
-
file_path=chunk.file_path,
|
107 |
start_line=chunk.start_line,
|
108 |
end_line=chunk.end_line,
|
109 |
type=chunk.type.value,
|
|
|
1 |
+
from pathlib import Path
|
2 |
from typing import List
|
3 |
import chromadb
|
4 |
from chromadb.errors import InvalidCollectionException
|
|
|
100 |
summary = await self.summarize_chunk(chunk)
|
101 |
|
102 |
# Create a unique ID for the chunk
|
103 |
+
relative_path = Path(chunk.file_path).relative_to(self.config.db.codebase_directory).as_posix()
|
104 |
+
chunk_id = f"{relative_path}:{chunk.start_line}-{chunk.end_line}"
|
105 |
|
106 |
# Create metadata using Pydantic model
|
107 |
+
metadata = ChunkMetadata(
|
108 |
+
file_path=relative_path,
|
109 |
start_line=chunk.start_line,
|
110 |
end_line=chunk.end_line,
|
111 |
type=chunk.type.value,
|
tests/test_summarizer.py
CHANGED
@@ -17,7 +17,7 @@ def config():
|
|
17 |
)
|
18 |
|
19 |
@pytest.fixture
|
20 |
-
def sample_chunks():
|
21 |
"""Create sample code chunks for testing"""
|
22 |
return [
|
23 |
CodeChunk(
|
@@ -25,7 +25,7 @@ def sample_chunks():
|
|
25 |
content="def hello(): return 'world'",
|
26 |
start_line=1,
|
27 |
end_line=2,
|
28 |
-
file_path="test.py",
|
29 |
name="hello",
|
30 |
docstring="Says hello"
|
31 |
),
|
@@ -34,7 +34,7 @@ def sample_chunks():
|
|
34 |
content="class TestClass:\n def __init__(self):\n pass",
|
35 |
start_line=4,
|
36 |
end_line=6,
|
37 |
-
file_path="test.py",
|
38 |
name="TestClass",
|
39 |
docstring="A test class"
|
40 |
)
|
@@ -125,14 +125,15 @@ async def test_process_and_store_chunk_with_embedding(
|
|
125 |
assert add_call is not None
|
126 |
|
127 |
kwargs = add_call[1]
|
|
|
128 |
assert len(kwargs['embeddings']) == 3
|
129 |
assert kwargs['embeddings'] == mock_embedding
|
130 |
assert kwargs['documents'][0] == code_summary
|
131 |
-
assert kwargs['ids'][0] == f"{sample_chunks[0].file_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
|
132 |
|
133 |
# Verify metadata
|
134 |
metadata = kwargs['metadatas'][0]
|
135 |
-
assert metadata['file_path'] == sample_chunks[0].file_path
|
136 |
assert metadata['start_line'] == sample_chunks[0].start_line
|
137 |
assert metadata['end_line'] == sample_chunks[0].end_line
|
138 |
assert metadata['type'] == sample_chunks[0].type.value
|
|
|
17 |
)
|
18 |
|
19 |
@pytest.fixture
|
20 |
+
def sample_chunks(config: AppConfig):
|
21 |
"""Create sample code chunks for testing"""
|
22 |
return [
|
23 |
CodeChunk(
|
|
|
25 |
content="def hello(): return 'world'",
|
26 |
start_line=1,
|
27 |
end_line=2,
|
28 |
+
file_path=str(config.db.codebase_directory / "test.py"),
|
29 |
name="hello",
|
30 |
docstring="Says hello"
|
31 |
),
|
|
|
34 |
content="class TestClass:\n def __init__(self):\n pass",
|
35 |
start_line=4,
|
36 |
end_line=6,
|
37 |
+
file_path=str(config.db.codebase_directory / "test.py"),
|
38 |
name="TestClass",
|
39 |
docstring="A test class"
|
40 |
)
|
|
|
125 |
assert add_call is not None
|
126 |
|
127 |
kwargs = add_call[1]
|
128 |
+
relative_path = Path(sample_chunks[0].file_path).relative_to(config.db.codebase_directory).as_posix()
|
129 |
assert len(kwargs['embeddings']) == 3
|
130 |
assert kwargs['embeddings'] == mock_embedding
|
131 |
assert kwargs['documents'][0] == code_summary
|
132 |
+
assert kwargs['ids'][0] == f"{relative_path}:{sample_chunks[0].start_line}-{sample_chunks[0].end_line}"
|
133 |
|
134 |
# Verify metadata
|
135 |
metadata = kwargs['metadatas'][0]
|
136 |
+
assert metadata['file_path'] == relative_path, "File path must be relative"
|
137 |
assert metadata['start_line'] == sample_chunks[0].start_line
|
138 |
assert metadata['end_line'] == sample_chunks[0].end_line
|
139 |
assert metadata['type'] == sample_chunks[0].type.value
|