gabykim committed on
Commit
028eb6e
·
1 Parent(s): 484f007

chunk truncate utility

Browse files
src/know_lang_bot/chat_bot/chat_graph.py CHANGED
@@ -13,6 +13,7 @@ from pprint import pformat
13
  from enum import Enum
14
  from rich.console import Console
15
  from know_lang_bot.utils.model_provider import create_pydantic_model
 
16
  from know_lang_bot.models.embeddings import generate_embedding
17
  import voyageai
18
  from voyageai.object.reranking import RerankingObject
@@ -324,16 +325,16 @@ Remember: Your primary goal is answering the user's specific question, not expla
324
  ))
325
 
326
  context = ctx.state.retrieved_context
 
 
 
327
  prompt = f"""
328
  Question: {ctx.state.original_question}
329
 
330
  Relevant Code Context:
331
  {context.chunks}
332
 
333
- Provide a focused answer to the question above. Structure your response as:
334
- 1. Direct Answer: Start with a clear, concise answer to the question
335
- 2. Supporting Evidence: Reference specific code with file locations
336
- 3. Limitations (if any): Note any missing context or uncertainties
337
 
338
  Important: Stay focused on answering the specific question asked.
339
  """
 
13
  from enum import Enum
14
  from rich.console import Console
15
  from know_lang_bot.utils.model_provider import create_pydantic_model
16
+ from know_lang_bot.utils.chunking_util import truncate_chunk
17
  from know_lang_bot.models.embeddings import generate_embedding
18
  import voyageai
19
  from voyageai.object.reranking import RerankingObject
 
325
  ))
326
 
327
  context = ctx.state.retrieved_context
328
+ for chunk in context.chunks:
329
+ chunk = truncate_chunk(chunk, ctx.deps.config.chat.max_length_per_chunk)
330
+
331
  prompt = f"""
332
  Question: {ctx.state.original_question}
333
 
334
  Relevant Code Context:
335
  {context.chunks}
336
 
337
+ Provide a focused answer to the question based on the provided context.
 
 
 
338
 
339
  Important: Stay focused on answering the specific question asked.
340
  """
src/know_lang_bot/config.py CHANGED
@@ -191,6 +191,10 @@ class ChatConfig(BaseSettings):
191
  default="Ask questions about the codebase and I'll help you understand it!",
192
  description="Description shown in the chat interface"
193
  )
 
 
 
 
194
 
195
  class AppConfig(BaseSettings):
196
  model_config = SettingsConfigDict(
 
191
  default="Ask questions about the codebase and I'll help you understand it!",
192
  description="Description shown in the chat interface"
193
  )
194
+ max_length_per_chunk: int = Field(
195
+ default=8000,
196
+ description="Maximum number of characters per chunk"
197
+ )
198
 
199
  class AppConfig(BaseSettings):
200
  model_config = SettingsConfigDict(
src/know_lang_bot/utils/chunking_util.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
MAX_CHARS_PER_CHUNK = 10000  # Approximate 8k tokens limit (very rough estimate)

# Separator delimiting the CODE part from the SUMMARY part of a chunk.
_SUMMARY_SEPARATOR = "\nSUMMARY:\n"


def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
    """Truncate *text* to at most ``max_chars`` characters, preserving structure.

    Chunks are expected to look like ``<code>\\nSUMMARY:\\n<summary>``. When that
    structure is present, the character budget (minus the separator itself) is
    split proportionally between the code and summary sections so that neither
    section is dropped entirely. Otherwise the text is simply cut at
    ``max_chars``.

    Args:
        text: The chunk text to truncate.
        max_chars: Maximum number of characters in the returned string.

    Returns:
        A string of at most ``max_chars`` characters.
    """
    if len(text) <= max_chars:
        return text

    # Split into CODE and SUMMARY sections.
    parts = text.split(_SUMMARY_SEPARATOR)
    if len(parts) != 2:
        # Structure not found (or separator appears more than once): plain cut.
        return text[:max_chars]

    code, summary = parts
    total_len = len(code) + len(summary)

    # Reserve room for the separator we re-insert below; without this the
    # result would exceed max_chars by len(_SUMMARY_SEPARATOR) characters.
    available = max_chars - len(_SUMMARY_SEPARATOR)
    if total_len == 0 or available <= 0:
        # Degenerate cases: text is only the separator, or the budget is too
        # small to keep the structure at all — fall back to a plain cut.
        return text[:max_chars]

    # Allocate characters proportionally to each section's original size.
    code_chars = int(available * (len(code) / total_len))
    summary_chars = available - code_chars

    return f"{code[:code_chars]}{_SUMMARY_SEPARATOR}{summary[:summary_chars]}"
src/know_lang_bot/utils/migration/embedding_migrations.py CHANGED
@@ -5,18 +5,17 @@ import chromadb
5
  from chromadb.errors import InvalidCollectionException
6
  from rich.progress import Progress
7
  from rich.console import Console
8
- from typing import List, Dict, Any, Optional
9
- import openai
10
  from openai import OpenAI
11
  from datetime import datetime
12
  from know_lang_bot.config import AppConfig
13
  from know_lang_bot.utils.fancy_log import FancyLogger
 
14
 
15
  LOG = FancyLogger(__name__)
16
  console = Console()
17
 
18
  BATCH_SIZE = 2000 # Max items per batch
19
- MAX_CHARS_PER_CHUNK = 10000 # Approximate 8k tokens limit (very rough estimate)
20
 
21
  class BatchState:
22
  """Class to track batch processing state"""
@@ -35,31 +34,7 @@ class BatchState:
35
  with open(self.metadata_dir / f"{batch_id}.json", "w") as f:
36
  json.dump(metadata, f, indent=2)
37
 
38
- def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
39
- """Truncate text to approximate token limit while preserving structure"""
40
- if len(text) <= max_chars:
41
- return text
42
-
43
- # Split into CODE and SUMMARY sections
44
- parts = text.split("\nSUMMARY:\n")
45
- if len(parts) != 2:
46
- # If structure not found, just truncate
47
- return text[:max_chars]
48
-
49
- code, summary = parts
50
-
51
- # Calculate available space for each section (proportionally)
52
- total_len = len(code) + len(summary)
53
- code_ratio = len(code) / total_len
54
-
55
- # Allocate characters proportionally
56
- code_chars = int(max_chars * code_ratio)
57
- summary_chars = max_chars - code_chars
58
-
59
- truncated_code = code[:code_chars]
60
- truncated_summary = summary[:summary_chars]
61
-
62
- return f"{truncated_code}\nSUMMARY:\n{truncated_summary}"
63
 
64
  async def prepare_batches(config: AppConfig, batch_state: BatchState) -> List[str]:
65
  """Prepare batch files from ChromaDB and return batch IDs"""
 
5
  from chromadb.errors import InvalidCollectionException
6
  from rich.progress import Progress
7
  from rich.console import Console
8
+ from typing import List, Dict, Optional
 
9
  from openai import OpenAI
10
  from datetime import datetime
11
  from know_lang_bot.config import AppConfig
12
  from know_lang_bot.utils.fancy_log import FancyLogger
13
+ from know_lang_bot.utils.chunking_util import truncate_chunk
14
 
15
  LOG = FancyLogger(__name__)
16
  console = Console()
17
 
18
  BATCH_SIZE = 2000 # Max items per batch
 
19
 
20
  class BatchState:
21
  """Class to track batch processing state"""
 
34
  with open(self.metadata_dir / f"{batch_id}.json", "w") as f:
35
  json.dump(metadata, f, indent=2)
36
 
37
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  async def prepare_batches(config: AppConfig, batch_state: BatchState) -> List[str]:
40
  """Prepare batch files from ChromaDB and return batch IDs"""