File size: 6,858 Bytes
71fa0c7
103c97e
070f7e7
103c97e
71fa0c7
070f7e7
c436c2a
71fa0c7
212ff4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71fa0c7
 
 
 
 
 
d403664
 
 
 
 
 
71fa0c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103c97e
070f7e7
 
 
 
 
 
212ff4c
070f7e7
 
 
 
 
 
 
 
 
 
 
8e1fac7
 
 
 
 
 
 
 
212ff4c
070f7e7
103c97e
 
 
 
 
 
070f7e7
103c97e
 
 
 
 
 
 
 
 
 
070f7e7
 
 
 
212ff4c
103c97e
 
 
 
 
 
 
 
 
 
aad4327
 
 
 
103c97e
212ff4c
f0bc02d
 
 
 
212ff4c
 
 
 
 
 
 
 
 
 
 
 
 
5f5f1b6
212ff4c
 
5f5f1b6
 
 
 
212ff4c
 
 
 
 
 
070f7e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103c97e
 
 
 
 
 
 
 
c436c2a
212ff4c
103c97e
070f7e7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
from typing import Optional, Dict, Any, List
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field, field_validator, ValidationInfo
from pathlib import Path
import fnmatch
from know_lang_bot.core.types import ModelProvider
import os

def _validate_api_key(v: Optional[str], info: ValidationInfo) -> Optional[str]:
    """Validate API key is present when required.

    Shared by the ``api_key`` field validators of the config classes in this
    module. For the OpenAI, Anthropic and Voyage providers a non-empty key is
    mandatory, and the key is also exported to the matching ``*_API_KEY``
    environment variable.

    Args:
        v: The API key being validated (may be ``None``).
        info: Pydantic validation context; ``info.data`` holds the fields
            validated so far.

    Returns:
        ``v`` unchanged.

    Raises:
        ValueError: If the provider requires an API key and ``v`` is empty.
    """
    # ``info.data`` only contains fields that validated successfully. When
    # ``model_provider`` itself was rejected it is absent; bail out so pydantic
    # reports that error instead of an unrelated KeyError raised from here.
    provider = info.data.get('model_provider')
    if provider is None:
        return v

    # Providers that need a key, paired with the environment variable the key
    # is exported to. Compared with ``==`` so matching works exactly like the
    # membership test it replaces.
    env_var_by_provider = (
        (ModelProvider.OPENAI, "OPENAI_API_KEY"),
        (ModelProvider.ANTHROPIC, "ANTHROPIC_API_KEY"),
        (ModelProvider.VOYAGE, "VOYAGE_API_KEY"),
    )
    for known_provider, env_var in env_var_by_provider:
        if provider == known_provider:
            if not v:
                raise ValueError(f"API key required for {provider}")
            os.environ[env_var] = v
            break

    return v

class PathPatterns(BaseSettings):
    """Include/exclude glob filters deciding which paths get processed."""

    include: List[str] = Field(
        ["**/*"],
        description="Glob patterns for paths to include",
    )
    exclude: List[str] = Field(
        ["**/venv/**", "**/.git/**", "**/__pycache__/**", "**/tests/**"],
        description="Glob patterns for paths to exclude",
    )

    def should_process_path(self, path: str) -> bool:
        """Check if a path should be processed based on include/exclude patterns.

        Patterns are matched with ``fnmatch.fnmatch`` against ``str(path)``.

        Args:
            path: Path to test; it is converted with ``str()`` before matching.

        Returns:
            ``False`` if any exclude pattern matches; otherwise ``True`` only
            if at least one include pattern matches.
        """
        candidate = str(path)

        def matches_any(patterns: List[str]) -> bool:
            return any(fnmatch.fnmatch(candidate, glob) for glob in patterns)

        # Exclusions take precedence over inclusions.
        if matches_any(self.exclude):
            return False
        return matches_any(self.include)

class LanguageConfig(BaseSettings):
    """Parsing options for a single source language."""

    # On by default.
    enabled: bool = True
    # Required; e.g. [".py"] in the default ParserConfig entry.
    file_extensions: List[str]
    # Required; presumably the tree-sitter grammar name -- confirm with the parser.
    tree_sitter_language: str
    # Required; e.g. "class_definition" / "function_definition" for Python.
    chunk_types: List[str]
    # Default is 1MB.
    max_file_size: int = Field(1_000_000, description="Maximum file size to process in bytes")

class ParserConfig(BaseSettings):
    """Parser settings: per-language options plus path filtering."""

    # Keyed by language name; only Python is configured out of the box.
    languages: Dict[str, LanguageConfig] = Field(
        {
            "python": LanguageConfig(
                file_extensions=[".py"],
                tree_sitter_language="python",
                chunk_types=["class_definition", "function_definition"],
            ),
        },
    )
    # Fresh PathPatterns (with its own defaults) per ParserConfig instance.
    path_patterns: PathPatterns = Field(default_factory=PathPatterns)



class EmbeddingConfig(BaseSettings):
    """Shared embedding configuration"""

    model_name: str = Field("mxbai-embed-large", description="Name of the embedding model")
    # Must stay declared before ``api_key``: the api_key validator reads this
    # field from the already-validated data.
    model_provider: ModelProvider = Field(ModelProvider.OLLAMA, description="Provider for embeddings")
    dimension: int = Field(768, description="Embedding dimension")
    settings: Dict[str, Any] = Field(default_factory=dict, description="Provider-specific settings")
    api_key: Optional[str] = Field(None, description="API key for the model provider")

    @field_validator('api_key', mode='after')
    @classmethod
    def validate_api_key(cls, v: Optional[str], info: ValidationInfo) -> Optional[str]:
        """Delegate the API-key check to the module-level ``_validate_api_key``."""
        return _validate_api_key(v, info)

class LLMConfig(BaseSettings):
    """Settings for a chat/completion model: name, provider, key and extra options."""

    model_name: str = Field("llama3.2", description="Name of the LLM model to use")
    # Must stay declared before ``api_key``: the api_key validator reads this
    # field from the already-validated data.
    # NOTE(review): typed ``str`` here but ``ModelProvider`` in EmbeddingConfig;
    # the shared key check compares against ModelProvider members, so it relies
    # on those comparing equal to plain strings -- confirm.
    model_provider: str = Field(
        ModelProvider.OLLAMA,
        description="Model provider (anthropic, openai, ollama, etc)",
    )
    api_key: Optional[str] = Field(None, description="API key for the model provider")
    model_settings: Dict[str, Any] = Field(default_factory=dict, description="Additional model settings")

    @field_validator('api_key', mode='after')
    @classmethod
    def validate_api_key(cls, v: Optional[str], info: ValidationInfo) -> Optional[str]:
        """Delegate the API-key check to the module-level ``_validate_api_key``."""
        return _validate_api_key(v, info)

class DBConfig(BaseSettings):
    """Storage locations: ChromaDB directory and collection, plus the codebase root."""

    persist_directory: Path = Field(Path("./chroma_db"), description="Directory to store ChromaDB files")
    collection_name: str = Field("code_chunks", description="Name of the ChromaDB collection")
    codebase_directory: Path = Field(Path("./"), description="Root directory of the codebase to analyze")

class RerankerConfig(BaseSettings):
    """Settings for the optional reranking step."""

    enabled: bool = Field(True, description="Enable reranking")
    model_name: str = Field("reranker", description="Name of the reranker model to use")
    # Must stay declared before ``api_key``: the api_key validator reads this
    # field from the already-validated data.
    # NOTE(review): typed ``str`` here but ``ModelProvider`` in EmbeddingConfig;
    # the shared key check compares against ModelProvider members, so it relies
    # on those comparing equal to plain strings -- confirm.
    model_provider: str = Field(
        ModelProvider.OLLAMA,
        description="Model provider (anthropic, openai, ollama, etc)",
    )
    api_key: Optional[str] = Field(None, description="API key for the model provider")
    top_k: int = Field(
        5,
        description="Number of most relevant documents to return from reranking",
    )
    relevance_threshold: float = Field(
        0.5,
        description="Minimum relevance score to include a document in reranking",
    )

    @field_validator('api_key', mode='after')
    @classmethod
    def validate_api_key(cls, v: Optional[str], info: ValidationInfo) -> Optional[str]:
        """Delegate the API-key check to the module-level ``_validate_api_key``."""
        return _validate_api_key(v, info)

class ChatConfig(BaseSettings):
    """Chat settings: context retrieval limits and interface texts."""

    max_context_chunks: int = Field(
        5,
        description="Maximum number of similar chunks to include in context",
    )
    similarity_threshold: float = Field(
        0.7,
        description="Minimum similarity score to include a chunk",
    )
    interface_title: str = Field(
        "Code Repository Q&A Assistant",
        description="Title shown in the chat interface",
    )
    interface_description: str = Field(
        "Ask questions about the codebase and I'll help you understand it!",
        description="Description shown in the chat interface",
    )

class AppConfig(BaseSettings):
    """Top-level application settings aggregating every sub-configuration."""

    # Values are also read from a UTF-8 ``.env`` file; ``__`` separates nesting
    # levels in environment variable names.
    model_config = SettingsConfigDict(env_file='.env', env_file_encoding='utf-8', env_nested_delimiter='__')

    # Each section is built from its own defaults when not supplied.
    llm: LLMConfig = Field(default_factory=LLMConfig)
    # The evaluator reuses the LLMConfig schema.
    evaluator: LLMConfig = Field(default_factory=LLMConfig)
    reranker: RerankerConfig = Field(default_factory=RerankerConfig)
    db: DBConfig = Field(default_factory=DBConfig)
    parser: ParserConfig = Field(default_factory=ParserConfig)
    chat: ChatConfig = Field(default_factory=ChatConfig)
    embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig)