chunk truncate utility
src/know_lang_bot/chat_bot/chat_graph.py
CHANGED
@@ -13,6 +13,7 @@ from pprint import pformat
 from enum import Enum
 from rich.console import Console
 from know_lang_bot.utils.model_provider import create_pydantic_model
+from know_lang_bot.utils.chunking_util import truncate_chunk
 from know_lang_bot.models.embeddings import generate_embedding
 import voyageai
 from voyageai.object.reranking import RerankingObject
@@ -324,16 +325,16 @@ Remember: Your primary goal is answering the user's specific question, not expla
         ))
 
         context = ctx.state.retrieved_context
+        for chunk in context.chunks:
+            chunk = truncate_chunk(chunk, ctx.deps.config.chat.max_length_per_chunk)
+
         prompt = f"""
 Question: {ctx.state.original_question}
 
 Relevant Code Context:
 {context.chunks}
 
-Provide a focused answer to the question
-1. Direct Answer: Start with a clear, concise answer to the question
-2. Supporting Evidence: Reference specific code with file locations
-3. Limitations (if any): Note any missing context or uncertainties
+Provide a focused answer to the question based on the provided context.
 
 Important: Stay focused on answering the specific question asked.
 """
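One caveat with the loop added above: rebinding the loop variable (`chunk = truncate_chunk(...)`) does not change the strings stored in the list, so `context.chunks` still holds the untruncated text when it is interpolated into the prompt. A minimal in-place rewrite, assuming `context.chunks` is a plain list of strings:

    # Rebuild the list so the truncated strings actually reach the prompt.
    context.chunks = [
        truncate_chunk(chunk, ctx.deps.config.chat.max_length_per_chunk)
        for chunk in context.chunks
    ]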
src/know_lang_bot/config.py
CHANGED
@@ -191,6 +191,10 @@ class ChatConfig(BaseSettings):
         default="Ask questions about the codebase and I'll help you understand it!",
         description="Description shown in the chat interface"
     )
+    max_length_per_chunk: int = Field(
+        default=8000,
+        description="Maximum number of characters per chunk"
+    )
 
 class AppConfig(BaseSettings):
     model_config = SettingsConfigDict(
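The new field lives on `ChatConfig`, which the chat graph reaches through `ctx.deps.config.chat`. A quick sketch of reading the setting, assuming `AppConfig` exposes the chat settings under a `chat` attribute as that call site suggests:

    from know_lang_bot.config import AppConfig

    config = AppConfig()
    print(config.chat.max_length_per_chunk)  # 8000 unless overridden

Because `ChatConfig` is a `BaseSettings` subclass, the default of 8000 can presumably also be overridden through pydantic-settings' usual environment-variable mechanism.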
src/know_lang_bot/utils/chunking_util.py
ADDED
@@ -0,0 +1,29 @@
+
+MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
+
+
+def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
+    """Truncate text to approximate token limit while preserving structure"""
+    if len(text) <= max_chars:
+        return text
+
+    # Split into CODE and SUMMARY sections
+    parts = text.split("\nSUMMARY:\n")
+    if len(parts) != 2:
+        # If structure not found, just truncate
+        return text[:max_chars]
+
+    code, summary = parts
+
+    # Calculate available space for each section (proportionally)
+    total_len = len(code) + len(summary)
+    code_ratio = len(code) / total_len
+
+    # Allocate characters proportionally
+    code_chars = int(max_chars * code_ratio)
+    summary_chars = max_chars - code_chars
+
+    truncated_code = code[:code_chars]
+    truncated_summary = summary[:summary_chars]
+
+    return f"{truncated_code}\nSUMMARY:\n{truncated_summary}"
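A short usage sketch of the new helper. With a deliberately small `max_chars`, the character budget is split between the code and summary sections in proportion to their original lengths:

    from know_lang_bot.utils.chunking_util import truncate_chunk

    text = ("x" * 600) + "\nSUMMARY:\n" + ("s" * 200)
    out = truncate_chunk(text, max_chars=400)
    # 600:200 is a 3:1 split, so the code section gets int(400 * 0.75) = 300
    # characters and the summary gets the remaining 100.
    assert out == ("x" * 300) + "\nSUMMARY:\n" + ("s" * 100)

Note that the `\nSUMMARY:\n` separator itself is not counted against the budget, so the result can exceed `max_chars` by the separator's length.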
src/know_lang_bot/utils/migration/embedding_migrations.py
CHANGED
@@ -5,18 +5,17 @@ import chromadb
 from chromadb.errors import InvalidCollectionException
 from rich.progress import Progress
 from rich.console import Console
-from typing import List, Dict,
-import openai
+from typing import List, Dict, Optional
 from openai import OpenAI
 from datetime import datetime
 from know_lang_bot.config import AppConfig
 from know_lang_bot.utils.fancy_log import FancyLogger
+from know_lang_bot.utils.chunking_util import truncate_chunk
 
 LOG = FancyLogger(__name__)
 console = Console()
 
 BATCH_SIZE = 2000 # Max items per batch
-MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
 
 class BatchState:
     """Class to track batch processing state"""
@@ -35,31 +34,7 @@ class BatchState:
         with open(self.metadata_dir / f"{batch_id}.json", "w") as f:
             json.dump(metadata, f, indent=2)
 
-def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
-    """Truncate text to approximate token limit while preserving structure"""
-    if len(text) <= max_chars:
-        return text
-
-    # Split into CODE and SUMMARY sections
-    parts = text.split("\nSUMMARY:\n")
-    if len(parts) != 2:
-        # If structure not found, just truncate
-        return text[:max_chars]
-
-    code, summary = parts
-
-    # Calculate available space for each section (proportionally)
-    total_len = len(code) + len(summary)
-    code_ratio = len(code) / total_len
-
-    # Allocate characters proportionally
-    code_chars = int(max_chars * code_ratio)
-    summary_chars = max_chars - code_chars
-
-    truncated_code = code[:code_chars]
-    truncated_summary = summary[:summary_chars]
-
-    return f"{truncated_code}\nSUMMARY:\n{truncated_summary}"
+
 
 async def prepare_batches(config: AppConfig, batch_state: BatchState) -> List[str]:
     """Prepare batch files from ChromaDB and return batch IDs"""