gabykim committed on
Commit
71fa0c7
·
1 Parent(s): d141c1f

multi language & source file support draft

Browse files
src/know_lang_bot/config.py CHANGED
@@ -1,7 +1,61 @@
1
- from typing import Optional, Dict, Any
2
  from pydantic_settings import BaseSettings, SettingsConfigDict
3
  from pydantic import Field
4
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  class LLMConfig(BaseSettings):
7
  model_name: str = Field(
@@ -56,7 +110,4 @@ class AppConfig(BaseSettings):
56
 
57
  llm: LLMConfig = Field(default_factory=LLMConfig)
58
  db: DBConfig = Field(default_factory=DBConfig)
59
- chunk_max_size: int = Field(
60
- default=1500,
61
- description="Maximum size of code chunks before splitting"
62
- )
 
1
+ from typing import Optional, Dict, Any, List
2
  from pydantic_settings import BaseSettings, SettingsConfigDict
3
  from pydantic import Field
4
  from pathlib import Path
5
+ import fnmatch
6
+
7
class PathPatterns(BaseSettings):
    """Glob-pattern filters deciding which filesystem paths get processed."""

    include: List[str] = Field(
        default=["**/*"],
        description="Glob patterns for paths to include"
    )
    exclude: List[str] = Field(
        default=["**/venv/**", "**/.git/**", "**/__pycache__/**"],
        description="Glob patterns for paths to exclude"
    )

    def should_process_path(self, path: str) -> bool:
        """Check if a path should be processed based on include/exclude patterns.

        Exclusions take precedence: a path is processed only when it matches
        no exclude pattern and at least one include pattern.
        """
        candidate = str(path)
        if any(fnmatch.fnmatch(candidate, pattern) for pattern in self.exclude):
            return False
        return any(fnmatch.fnmatch(candidate, pattern) for pattern in self.include)
32
+
33
class LanguageConfig(BaseSettings):
    """Per-language parsing settings (extensions, grammar, chunking rules)."""

    # Whether parsing for this language is active at all.
    enabled: bool = True
    # File extensions handled by this language, including the dot (e.g. ".py").
    file_extensions: List[str]
    # Name of the tree-sitter grammar used for this language.
    tree_sitter_language: str
    # Tree-sitter node types that should be turned into code chunks
    # (e.g. "class_definition", "function_definition").
    chunk_types: List[str]
    max_file_size: int = Field(
        default=1_000_000,  # 1MB
        description="Maximum file size to process in bytes"
    )

    def supports_extension(self, ext: str) -> bool:
        """Check if the language supports a given file extension"""
        return ext in self.file_extensions
46
+
47
class ParserConfig(BaseSettings):
    """Top-level parser configuration: per-language settings plus path filters."""

    languages: Dict[str, LanguageConfig] = Field(
        # default_factory builds a fresh dict (and fresh LanguageConfig) per
        # ParserConfig instance instead of declaring one shared mutable default.
        default_factory=lambda: {
            "python": LanguageConfig(
                file_extensions=[".py"],
                tree_sitter_language="python",
                chunk_types=["class_definition", "function_definition"]
            )
        }
    )
    # Include/exclude glob patterns applied before language dispatch.
    path_patterns: PathPatterns = Field(default_factory=PathPatterns)
58
+
59
 
60
  class LLMConfig(BaseSettings):
61
  model_name: str = Field(
 
110
 
111
  llm: LLMConfig = Field(default_factory=LLMConfig)
112
  db: DBConfig = Field(default_factory=DBConfig)
113
+ parser: ParserConfig = Field(default_factory=ParserConfig)
 
 
 
src/know_lang_bot/core/types.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import Optional
3
+ from enum import Enum
4
+
5
class ChunkType(str, Enum):
    """Kind of code chunk extracted by a parser.

    Mixes in str so members compare and serialize as their plain string values.
    """
    CLASS = "class"
    FUNCTION = "function"
    OTHER = "other"
9
+
10
class CodeChunk(BaseModel):
    """Represents a chunk of code with its metadata"""
    # Kind of chunk (class / function / other).
    type: ChunkType
    # Raw source text of the chunk.
    content: str
    # Line span of the chunk; populated from tree-sitter points, which are
    # 0-based row indices.
    start_line: int
    end_line: int
    # Path of the file the chunk was extracted from (stored as str, not Path).
    file_path: str
    # Identifier of the class/function, when one could be extracted.
    name: Optional[str] = None
    parent_name: Optional[str] = None  # For nested classes/functions
    # Documentation text found adjacent to the definition, if any.
    docstring: Optional[str] = None
src/know_lang_bot/parser/base/parser.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import List
3
+ from pathlib import Path
4
+ from know_lang_bot.core.types import CodeChunk
5
+ from know_lang_bot.config import AppConfig, LanguageConfig
6
+ from tree_sitter import Language, Parser
7
+
8
class LanguageParser(ABC):
    """Abstract base class for language-specific parsers.

    Subclasses implement setup() to initialize their tree-sitter machinery
    and parse_file() to turn one source file into CodeChunk objects.
    """

    def __init__(self, config: AppConfig):
        # Concrete shared initializer. NOTE: this was previously decorated
        # with @abstractmethod, which left any subclass that inherited it
        # (e.g. PythonParser) abstract and therefore impossible to
        # instantiate — ABCMeta only clears an abstract member when the
        # subclass overrides it.
        self.config: AppConfig = config
        # The three fields below stay None until setup() is called.
        self.language: Language = None
        self.parser: Parser = None
        self.language_config: LanguageConfig = None

    @abstractmethod
    def setup(self) -> None:
        """Set up the parser (e.g., initialize tree-sitter)"""
        pass

    @abstractmethod
    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single file and return code chunks"""
        pass
src/know_lang_bot/parser/base/provider.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Generator
3
+ from pathlib import Path
4
+ from know_lang_bot.config import AppConfig
5
+
6
class CodeProvider(ABC):
    """Abstract base class for code source providers"""

    def __init__(self, source_path: Path, config: AppConfig):
        # Concrete shared initializer. Previously marked @abstractmethod
        # despite having a real body; that forces every subclass to define
        # a redundant __init__ override or remain uninstantiable.
        self.source_path = source_path
        self.config = config

    @abstractmethod
    def get_files(self) -> Generator[Path, None, None]:
        """Yield paths to code files that should be processed"""
        pass
src/know_lang_bot/parser/factory.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Type, Optional
2
+ from pathlib import Path
3
+
4
+ from know_lang_bot.parser.base.parser import LanguageParser
5
+ from know_lang_bot.parser.languages.python.parser import PythonParser
6
+ from know_lang_bot.config import AppConfig
7
+
8
class CodeParserFactory:
    """Concrete implementation of parser factory.

    Maps file extensions to language parsers; parsers are constructed and
    set up lazily on first use, then cached per language.
    """

    def __init__(self, config: AppConfig):
        self.config = config
        # Cache of initialized parsers, keyed by language name.
        self._parsers: Dict[str, LanguageParser] = {}
        self._parser_classes = self._register_parsers()

    def _register_parsers(self) -> Dict[str, Type[LanguageParser]]:
        """Register available parser implementations"""
        return {
            "python": PythonParser,
            # Add more languages here
        }

    def get_parser(self, file_path: Path) -> Optional[LanguageParser]:
        """Get appropriate parser for a file, or None when unsupported.

        Fixes over the previous version:
        - matches on the configured file_extensions instead of calling
          parser.supported_extensions(), which no parser defined;
        - uses .get() so a language registered in code but absent from the
          config is skipped instead of raising KeyError;
        - checks the cache before constructing a parser, instead of building
          a throwaway instance on every call.
        """
        extension = file_path.suffix

        for lang, parser_class in self._parser_classes.items():
            language_config = self.config.parser.languages.get(lang)
            if language_config is None or not language_config.enabled:
                continue

            if extension not in language_config.file_extensions:
                continue

            if lang not in self._parsers:
                parser = parser_class(self.config)
                parser.setup()
                self._parsers[lang] = parser
            return self._parsers[lang]

        return None
src/know_lang_bot/parser/languages/python/parser.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ from pathlib import Path
3
+ import tree_sitter_python
4
+ from tree_sitter import Language, Parser, Node
5
+
6
+ from know_lang_bot.parser.base.parser import LanguageParser
7
+ from know_lang_bot.core.types import CodeChunk, ChunkType
8
+ from know_lang_bot.utils.fancy_log import FancyLogger
9
+
10
LOG = FancyLogger(__name__)  # module-level logger for this parser


class PythonParser(LanguageParser):
    """Python-specific implementation of LanguageParser.

    Uses tree-sitter to parse a Python file and extract its top-level
    classes and functions as CodeChunk objects.
    """

    def setup(self) -> None:
        """Initialize tree-sitter with Python language support"""
        self.language = Language(tree_sitter_python.language())
        self.parser = Parser(self.language)
        self.language_config = self.config.parser.languages["python"]

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser handles.

        Reads straight from the app config so it works before setup() has
        been called — the factory probes parsers before initializing them.
        """
        return self.config.parser.languages["python"].file_extensions

    @staticmethod
    def _node_name(node: Node) -> Optional[str]:
        """Return the text of *node*'s first `identifier` child, or None.

        Shared helper for class/function name extraction (previously
        duplicated inline in _process_class and _process_function).
        """
        return next(
            (child.text.decode('utf-8')
             for child in node.children
             if child.type == "identifier"),
            None
        )

    def _get_preceding_docstring(self, node: Node, source_code: bytes) -> Optional[str]:
        """Collect documentation text appearing immediately before *node*.

        Walks backwards over preceding siblings, gathering consecutive
        comments; a preceding string expression statement is included and
        terminates the walk. Returns the parts joined by newlines, or None.
        """
        docstring_parts = []
        current_node = node.prev_sibling

        while current_node:
            if current_node.type == "comment":
                comment = source_code[current_node.start_byte:current_node.end_byte].decode('utf-8')
                docstring_parts.insert(0, comment)
            elif current_node.type == "expression_statement":
                string_node = current_node.children[0] if current_node.children else None
                if string_node and string_node.type in ("string", "string_literal"):
                    docstring = source_code[string_node.start_byte:string_node.end_byte].decode('utf-8')
                    docstring_parts.insert(0, docstring)
                break
            elif current_node.type not in ("empty_statement", "newline"):
                # Any other statement ends the run of adjacent documentation.
                break
            current_node = current_node.prev_sibling

        return '\n'.join(docstring_parts) if docstring_parts else None

    def _has_syntax_error(self, node: Node) -> bool:
        """Check if the node or its children contain syntax errors"""
        if node.type == "ERROR":
            return True
        if node.has_error:
            return True
        return any(self._has_syntax_error(child) for child in node.children)

    def _process_class(self, node: Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a class node and return a CodeChunk.

        Raises:
            ValueError: if the class definition has no identifier child.
        """
        name = self._node_name(node)
        if not name:
            raise ValueError(f"Could not find class name in node: {node.text}")

        return CodeChunk(
            type=ChunkType.CLASS,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],  # tree-sitter rows are 0-based
            end_line=node.end_point[0],
            file_path=str(file_path),
            docstring=self._get_preceding_docstring(node, source_code)
        )

    def _process_function(self, node: Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a function node and return a CodeChunk.

        Raises:
            ValueError: if the function definition has no identifier child.
        """
        name = self._node_name(node)
        if not name:
            raise ValueError(f"Could not find function name in node: {node.text}")

        # Determine if this is a method within a class; if so, record the
        # enclosing class's name as parent_name.
        parent_node = node.parent
        parent_name = None
        if parent_node and parent_node.type == "class_definition":
            parent_name = self._node_name(parent_node)

        return CodeChunk(
            type=ChunkType.FUNCTION,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],
            end_line=node.end_point[0],
            file_path=str(file_path),
            parent_name=parent_name,
            docstring=self._get_preceding_docstring(node, source_code)
        )

    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single Python file and return list of code chunks"""
        if not self.language_config.supports_extension(file_path.suffix):
            return []

        # Check file size limit
        if file_path.stat().st_size > self.language_config.max_file_size:
            LOG.warning(f"Skipping file {file_path}: exceeds size limit of {self.language_config.max_file_size} bytes")
            return []

        try:
            with open(file_path, 'rb') as f:
                source_code = f.read()

            if not self.parser:
                raise RuntimeError("Parser not initialized. Call setup() first.")

            tree = self.parser.parse(source_code)

            # Check for overall syntax validity
            if self._has_syntax_error(tree.root_node):
                LOG.warning(f"Syntax errors found in {file_path}")
                return []

            chunks: List[CodeChunk] = []

            # Only top-level classes and functions become chunks for now;
            # other node types are skipped.
            for node in tree.root_node.children:
                if node.type == "class_definition":
                    chunks.append(self._process_class(node, source_code, file_path))
                elif node.type == "function_definition":
                    chunks.append(self._process_function(node, source_code, file_path))

            return chunks

        except Exception as e:
            # Best-effort: log and skip unparseable files rather than abort
            # the whole run (also catches the RuntimeError raised above).
            LOG.error(f"Error parsing file {file_path}: {str(e)}")
            return []