gabykim committed on
Commit
028eb6e
·
1 Parent(s): 484f007

chunk truncate utility

Browse files
src/know_lang_bot/chat_bot/chat_graph.py CHANGED
@@ -13,6 +13,7 @@ from pprint import pformat
13
  from enum import Enum
14
  from rich.console import Console
15
  from know_lang_bot.utils.model_provider import create_pydantic_model
 
16
  from know_lang_bot.models.embeddings import generate_embedding
17
  import voyageai
18
  from voyageai.object.reranking import RerankingObject
@@ -324,16 +325,16 @@ Remember: Your primary goal is answering the user's specific question, not expla
324
  ))
325
 
326
  context = ctx.state.retrieved_context
 
 
 
327
  prompt = f"""
328
  Question: {ctx.state.original_question}
329
 
330
  Relevant Code Context:
331
  {context.chunks}
332
 
333
- Provide a focused answer to the question above. Structure your response as:
334
- 1. Direct Answer: Start with a clear, concise answer to the question
335
- 2. Supporting Evidence: Reference specific code with file locations
336
- 3. Limitations (if any): Note any missing context or uncertainties
337
 
338
  Important: Stay focused on answering the specific question asked.
339
  """
 
13
  from enum import Enum
14
  from rich.console import Console
15
  from know_lang_bot.utils.model_provider import create_pydantic_model
16
+ from know_lang_bot.utils.chunking_util import truncate_chunk
17
  from know_lang_bot.models.embeddings import generate_embedding
18
  import voyageai
19
  from voyageai.object.reranking import RerankingObject
 
325
  ))
326
 
327
  context = ctx.state.retrieved_context
328
+ for chunk in context.chunks:
329
+ chunk = truncate_chunk(chunk, ctx.deps.config.chat.max_length_per_chunk)
330
+
331
  prompt = f"""
332
  Question: {ctx.state.original_question}
333
 
334
  Relevant Code Context:
335
  {context.chunks}
336
 
337
+ Provide a focused answer to the question based on the provided context.
 
 
 
338
 
339
  Important: Stay focused on answering the specific question asked.
340
  """
src/know_lang_bot/config.py CHANGED
@@ -191,6 +191,10 @@ class ChatConfig(BaseSettings):
191
  default="Ask questions about the codebase and I'll help you understand it!",
192
  description="Description shown in the chat interface"
193
  )
 
 
 
 
194
 
195
  class AppConfig(BaseSettings):
196
  model_config = SettingsConfigDict(
 
191
  default="Ask questions about the codebase and I'll help you understand it!",
192
  description="Description shown in the chat interface"
193
  )
194
+ max_length_per_chunk: int = Field(
195
+ default=8000,
196
+ description="Maximum number of characters per chunk"
197
+ )
198
 
199
  class AppConfig(BaseSettings):
200
  model_config = SettingsConfigDict(
src/know_lang_bot/utils/chunking_util.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
MAX_CHARS_PER_CHUNK = 10000  # Approximate 8k tokens limit (very rough estimate)

# Separator delimiting the CODE part from the SUMMARY part of a chunk.
_SUMMARY_SEPARATOR = "\nSUMMARY:\n"


def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
    """Truncate *text* to at most ``max_chars`` characters, preserving structure.

    Chunks are expected to look like ``<code>\\nSUMMARY:\\n<summary>``. When that
    structure is present, the character budget (minus the separator itself) is
    split proportionally between the code and summary sections so that neither
    section is dropped entirely. Otherwise the text is simply cut at
    ``max_chars``.

    Args:
        text: The chunk text to truncate.
        max_chars: Maximum number of characters in the returned string.

    Returns:
        A string of at most ``max_chars`` characters.
    """
    if len(text) <= max_chars:
        return text

    # Split into CODE and SUMMARY sections.
    parts = text.split(_SUMMARY_SEPARATOR)
    if len(parts) != 2:
        # Structure not found (or separator appears more than once): plain cut.
        return text[:max_chars]

    code, summary = parts
    total_len = len(code) + len(summary)

    # Reserve room for the separator we re-insert below; without this the
    # result would exceed max_chars by len(_SUMMARY_SEPARATOR) characters.
    available = max_chars - len(_SUMMARY_SEPARATOR)
    if total_len == 0 or available <= 0:
        # Degenerate cases: text is only the separator, or the budget is too
        # small to keep the structure at all — fall back to a plain cut.
        return text[:max_chars]

    # Allocate characters proportionally to each section's original size.
    code_chars = int(available * (len(code) / total_len))
    summary_chars = available - code_chars

    return f"{code[:code_chars]}{_SUMMARY_SEPARATOR}{summary[:summary_chars]}"
src/know_lang_bot/utils/migration/embedding_migrations.py CHANGED
@@ -5,18 +5,17 @@ import chromadb
5
  from chromadb.errors import InvalidCollectionException
6
  from rich.progress import Progress
7
  from rich.console import Console
8
- from typing import List, Dict, Any, Optional
9
- import openai
10
  from openai import OpenAI
11
  from datetime import datetime
12
  from know_lang_bot.config import AppConfig
13
  from know_lang_bot.utils.fancy_log import FancyLogger
 
14
 
15
  LOG = FancyLogger(__name__)
16
  console = Console()
17
 
18
  BATCH_SIZE = 2000 # Max items per batch
19
- MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
20
 
21
  class BatchState:
22
  """Class to track batch processing state"""
@@ -35,31 +34,7 @@ class BatchState:
35
  with open(self.metadata_dir / f"{batch_id}.json", "w") as f:
36
  json.dump(metadata, f, indent=2)
37
 
38
- def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
39
- """Truncate text to approximate token limit while preserving structure"""
40
- if len(text) <= max_chars:
41
- return text
42
-
43
- # Split into CODE and SUMMARY sections
44
- parts = text.split("\nSUMMARY:\n")
45
- if len(parts) != 2:
46
- # If structure not found, just truncate
47
- return text[:max_chars]
48
-
49
- code, summary = parts
50
-
51
- # Calculate available space for each section (proportionally)
52
- total_len = len(code) + len(summary)
53
- code_ratio = len(code) / total_len
54
-
55
- # Allocate characters proportionally
56
- code_chars = int(max_chars * code_ratio)
57
- summary_chars = max_chars - code_chars
58
-
59
- truncated_code = code[:code_chars]
60
- truncated_summary = summary[:summary_chars]
61
-
62
- return f"{truncated_code}\nSUMMARY:\n{truncated_summary}"
63
 
64
  async def prepare_batches(config: AppConfig, batch_state: BatchState) -> List[str]:
65
  """Prepare batch files from ChromaDB and return batch IDs"""
 
5
  from chromadb.errors import InvalidCollectionException
6
  from rich.progress import Progress
7
  from rich.console import Console
8
+ from typing import List, Dict, Optional
 
9
  from openai import OpenAI
10
  from datetime import datetime
11
  from know_lang_bot.config import AppConfig
12
  from know_lang_bot.utils.fancy_log import FancyLogger
13
+ from know_lang_bot.utils.chunking_util import truncate_chunk
14
 
15
  LOG = FancyLogger(__name__)
16
  console = Console()
17
 
18
  BATCH_SIZE = 2000 # Max items per batch
 
19
 
20
  class BatchState:
21
  """Class to track batch processing state"""
 
34
  with open(self.metadata_dir / f"{batch_id}.json", "w") as f:
35
  json.dump(metadata, f, indent=2)
36
 
37
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  async def prepare_batches(config: AppConfig, batch_state: BatchState) -> List[str]:
40
  """Prepare batch files from ChromaDB and return batch IDs"""