gabykim's picture
fix summarization unittest
3a5efa8
raw
history blame
1.12 kB
# Default character budget for one chunk; truncate_chunk() uses it as its
# default ``max_chars``. It stands in for an approximate 8k-token limit, and
# the character-to-token conversion is only a very rough estimate.
MAX_CHARS_PER_CHUNK = 10000
def format_code_summary(code: str, summary: str) -> str:
    """Combine a code snippet and its summary into one labelled string.

    The code is placed under a ``CODE:`` heading and the summary under a
    ``SUMMARY:`` heading, with a single blank line between the two sections.

    Args:
        code: Source text to place in the ``CODE:`` section.
        summary: Summary text to place in the ``SUMMARY:`` section.

    Returns:
        The combined two-section string.
    """
    template = "CODE:\n{}\n\nSUMMARY:\n{}"
    return template.format(code, summary)
def truncate_chunk(text: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> str:
    """Truncate text to at most ``max_chars`` characters, preserving structure.

    Text that already fits is returned unchanged. Otherwise, if the text
    contains exactly one ``SUMMARY:`` separator line, the code part (before
    the separator) and the summary part (after it) are each cut to a share of
    the budget proportional to their original lengths, and the separator is
    kept between them. The separator's own length is deducted from the budget
    so the result never exceeds ``max_chars``.

    If the separator is missing, occurs more than once, or the budget is too
    small to hold the separator, the text is simply cut to its first
    ``max_chars`` characters.

    Args:
        text: The chunk to truncate.
        max_chars: Maximum length of the returned string. Values below zero
            are treated as zero.

    Returns:
        ``text`` itself when it fits, otherwise a truncated copy whose length
        is at most ``max_chars``.
    """
    if len(text) <= max_chars:
        return text

    separator = "\nSUMMARY:\n"
    parts = text.split(separator)

    # Characters left for the two sections once the separator is paid for.
    budget = max_chars - len(separator)

    if len(parts) != 2 or budget <= 0:
        # No usable structure (or no room for it): plain prefix truncation.
        # Clamp at zero so a negative limit cannot slice from the end.
        return text[:max(max_chars, 0)]

    code, summary = parts

    # len(text) > max_chars implies total_len > budget > 0 here, so the
    # division below is safe and each share fits inside its own section.
    total_len = len(code) + len(summary)

    # Integer arithmetic keeps the split exact (no float rounding drift).
    code_chars = budget * len(code) // total_len
    summary_chars = budget - code_chars

    return f"{code[:code_chars]}{separator}{summary[:summary_chars]}"