Spaces:
Sleeping
Sleeping
Multi-language & source-file support draft
Browse files
src/know_lang_bot/config.py
CHANGED
@@ -1,7 +1,61 @@
|
|
1 |
-
from typing import Optional, Dict, Any
|
2 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
3 |
from pydantic import Field
|
4 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
class LLMConfig(BaseSettings):
|
7 |
model_name: str = Field(
|
@@ -56,7 +110,4 @@ class AppConfig(BaseSettings):
|
|
56 |
|
57 |
llm: LLMConfig = Field(default_factory=LLMConfig)
|
58 |
db: DBConfig = Field(default_factory=DBConfig)
|
59 |
-
|
60 |
-
default=1500,
|
61 |
-
description="Maximum size of code chunks before splitting"
|
62 |
-
)
|
|
|
1 |
+
from typing import Optional, Dict, Any, List
|
2 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
3 |
from pydantic import Field
|
4 |
from pathlib import Path
|
5 |
+
import fnmatch
|
6 |
+
|
7 |
+
class PathPatterns(BaseSettings):
    """Include/exclude glob patterns controlling which paths are processed."""

    include: List[str] = Field(
        default=["**/*"],
        description="Glob patterns for paths to include"
    )
    exclude: List[str] = Field(
        default=["**/venv/**", "**/.git/**", "**/__pycache__/**"],
        description="Glob patterns for paths to exclude"
    )

    @staticmethod
    def _matches_any(path_str: str, patterns: List[str]) -> bool:
        """Return True if *path_str* matches any pattern in *patterns*.

        fnmatch has no special '**' semantics ('*' already crosses '/'),
        so a pattern like '**/*' can never match a bare top-level name
        such as 'setup.py' (the translated regex requires a '/').  Each
        pattern is therefore also tried with a leading '**/' stripped so
        top-level paths behave consistently with nested ones.
        """
        for pattern in patterns:
            if fnmatch.fnmatch(path_str, pattern):
                return True
            if pattern.startswith("**/") and fnmatch.fnmatch(path_str, pattern[3:]):
                return True
        return False

    def should_process_path(self, path: str) -> bool:
        """Check if a path should be processed based on include/exclude patterns.

        Exclusions take precedence over inclusions.
        """
        # Normalize to POSIX separators: str(Path) yields '\\' on Windows,
        # which would never match the '/'-style patterns above.
        path_str = Path(path).as_posix()

        # First check exclusions, then inclusions.
        if self._matches_any(path_str, self.exclude):
            return False
        return self._matches_any(path_str, self.include)
|
32 |
+
|
33 |
+
class LanguageConfig(BaseSettings):
    """Per-language parsing settings: extensions, grammar name, chunking limits."""
    enabled: bool = True  # set False to skip this language entirely
    file_extensions: List[str]  # e.g. [".py"], compared against Path.suffix
    tree_sitter_language: str  # tree-sitter grammar identifier (NOTE(review): not consumed by PythonParser.setup, which hardcodes tree_sitter_python — confirm intended)
    chunk_types: List[str]  # tree-sitter node types to extract as chunks
    max_file_size: int = Field(
        default=1_000_000,  # 1MB
        description="Maximum file size to process in bytes"
    )

    def supports_extension(self, ext: str) -> bool:
        """Check if the language supports a given file extension"""
        return ext in self.file_extensions
|
46 |
+
|
47 |
+
class ParserConfig(BaseSettings):
    """Parser settings: enabled languages and path include/exclude filters."""

    languages: Dict[str, LanguageConfig] = Field(
        # default_factory (instead of a literal default dict) defers the
        # LanguageConfig construction from class-definition/import time to
        # instantiation time, and guarantees each ParserConfig instance gets
        # its own dict rather than sharing one mutable default.
        default_factory=lambda: {
            "python": LanguageConfig(
                file_extensions=[".py"],
                tree_sitter_language="python",
                chunk_types=["class_definition", "function_definition"]
            )
        }
    )
    path_patterns: PathPatterns = Field(default_factory=PathPatterns)
|
58 |
+
|
59 |
|
60 |
class LLMConfig(BaseSettings):
|
61 |
model_name: str = Field(
|
|
|
110 |
|
111 |
llm: LLMConfig = Field(default_factory=LLMConfig)
|
112 |
db: DBConfig = Field(default_factory=DBConfig)
|
113 |
+
parser: ParserConfig = Field(default_factory=ParserConfig)
|
|
|
|
|
|
src/know_lang_bot/core/types.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydantic import BaseModel
|
2 |
+
from typing import Optional
|
3 |
+
from enum import Enum
|
4 |
+
|
5 |
+
class ChunkType(str, Enum):
    """Closed set of code-chunk kinds.

    Inherits from str so members compare equal to (and serialize as)
    their plain string values.
    """

    CLASS = "class"
    FUNCTION = "function"
    OTHER = "other"
|
9 |
+
|
10 |
+
class CodeChunk(BaseModel):
    """Represents a chunk of code with its metadata"""
    type: ChunkType  # kind of definition this chunk holds (class/function/other)
    content: str  # raw source text of the chunk
    start_line: int  # NOTE(review): filled from tree-sitter start_point[0], i.e. 0-based — confirm callers expect that
    end_line: int  # last line of the chunk (same 0-based convention)
    file_path: str  # path of the file the chunk was extracted from
    name: Optional[str] = None  # identifier of the class/function, when found
    parent_name: Optional[str] = None  # For nested classes/functions
    docstring: Optional[str] = None  # documentation text attached to the chunk, if any
|
src/know_lang_bot/parser/base/parser.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from typing import List
|
3 |
+
from pathlib import Path
|
4 |
+
from know_lang_bot.core.types import CodeChunk
|
5 |
+
from know_lang_bot.config import AppConfig, LanguageConfig
|
6 |
+
from tree_sitter import Language, Parser
|
7 |
+
|
8 |
+
class LanguageParser(ABC):
    """Abstract base class for language-specific parsers.

    Subclasses must implement setup() (grammar initialization) and
    parse_file() (file -> CodeChunk list).
    """

    def __init__(self, config: AppConfig):
        # Deliberately NOT @abstractmethod: marking a concrete __init__
        # abstract left subclasses that only override setup()/parse_file()
        # (e.g. PythonParser) abstract themselves, so instantiating them
        # raised TypeError.
        self.config : AppConfig = config
        self.language : Language = None  # set by setup()
        self.parser : Parser = None  # set by setup()
        self.language_config : LanguageConfig = None  # set by setup()

    @abstractmethod
    def setup(self) -> None:
        """Set up the parser (e.g., initialize tree-sitter)"""
        pass

    @abstractmethod
    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single file and return code chunks"""
        pass
|
src/know_lang_bot/parser/base/provider.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from typing import Generator
|
3 |
+
from pathlib import Path
|
4 |
+
from know_lang_bot.config import AppConfig
|
5 |
+
|
6 |
+
class CodeProvider(ABC):
    """Abstract base class for code source providers"""

    def __init__(self, source_path: Path, config: AppConfig):
        # Concrete shared initialization; deliberately NOT @abstractmethod.
        # An abstract __init__ forces every subclass to reimplement it
        # merely to become instantiable, even when this body is sufficient.
        self.source_path = source_path
        self.config = config

    @abstractmethod
    def get_files(self) -> Generator[Path, None, None]:
        """Yield paths to code files that should be processed"""
        pass
|
src/know_lang_bot/parser/factory.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, Type, Optional
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
from know_lang_bot.parser.base.parser import LanguageParser
|
5 |
+
from know_lang_bot.parser.languages.python.parser import PythonParser
|
6 |
+
from know_lang_bot.config import AppConfig
|
7 |
+
|
8 |
+
class CodeParserFactory():
    """Concrete implementation of parser factory.

    Creates one parser per language on first use and caches it.
    """

    def __init__(self, config: AppConfig):
        self.config = config
        self._parsers: Dict[str, LanguageParser] = {}  # language name -> initialized parser
        self._parser_classes = self._register_parsers()

    def _register_parsers(self) -> Dict[str, Type[LanguageParser]]:
        """Register available parser implementations"""
        return {
            "python": PythonParser,
            # Add more languages here
        }

    def get_parser(self, file_path: Path) -> Optional[LanguageParser]:
        """Get appropriate parser for a file, or None if unsupported.

        Extension support is decided from the language configuration
        (LanguageConfig.supports_extension) instead of instantiating a
        throwaway parser per call and calling supported_extensions(),
        a method no parser defines.
        """
        extension = file_path.suffix

        for lang, parser_class in self._parser_classes.items():
            # Guard against registered parsers without a config entry.
            lang_config = self.config.parser.languages.get(lang)
            if lang_config is None or not lang_config.enabled:
                continue

            if not lang_config.supports_extension(extension):
                continue

            # Lazily construct and set up at most one parser per language.
            if lang not in self._parsers:
                parser = parser_class(self.config)
                parser.setup()
                self._parsers[lang] = parser
            return self._parsers[lang]

        return None
|
src/know_lang_bot/parser/languages/python/parser.py
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Optional
|
2 |
+
from pathlib import Path
|
3 |
+
import tree_sitter_python
|
4 |
+
from tree_sitter import Language, Parser, Node
|
5 |
+
|
6 |
+
from know_lang_bot.parser.base.parser import LanguageParser
|
7 |
+
from know_lang_bot.core.types import CodeChunk, ChunkType
|
8 |
+
from know_lang_bot.utils.fancy_log import FancyLogger
|
9 |
+
|
10 |
+
LOG = FancyLogger(__name__)
|
11 |
+
|
12 |
+
class PythonParser(LanguageParser):
    """Python-specific implementation of LanguageParser"""

    def setup(self) -> None:
        """Initialize tree-sitter with Python language support"""
        self.language = Language(tree_sitter_python.language())
        self.parser = Parser(self.language)
        self.language_config = self.config.parser.languages["python"]

    def _get_identifier_name(self, node: Node) -> Optional[str]:
        """Return the text of the first 'identifier' child of *node*, or None.

        Shared by class and function processing, which previously duplicated
        this extraction inline.
        """
        return next(
            (child.text.decode('utf-8')
             for child in node.children
             if child.type == "identifier"),
            None
        )

    def _get_preceding_docstring(self, node: Node, source_code: bytes) -> Optional[str]:
        """Extract documentation that textually precedes *node*.

        Walks preceding siblings, collecting '#' comments until a string
        expression statement (treated as documentation) or any other
        construct is reached.

        NOTE(review): this captures text *above* a definition, not the
        conventional docstring inside its body — confirm that is intended.
        """
        docstring_parts = []
        current_node = node.prev_sibling

        while current_node:
            if current_node.type == "comment":
                comment = source_code[current_node.start_byte:current_node.end_byte].decode('utf-8')
                docstring_parts.insert(0, comment)
            elif current_node.type == "expression_statement":
                string_node = current_node.children[0] if current_node.children else None
                if string_node and string_node.type in ("string", "string_literal"):
                    docstring = source_code[string_node.start_byte:string_node.end_byte].decode('utf-8')
                    docstring_parts.insert(0, docstring)
                    break
            elif current_node.type not in ("empty_statement", "newline"):
                # Any other construct ends the contiguous documentation run.
                break
            current_node = current_node.prev_sibling

        return '\n'.join(docstring_parts) if docstring_parts else None

    def _has_syntax_error(self, node: Node) -> bool:
        """Check if the node or its children contain syntax errors.

        tree-sitter's Node.has_error is already True when any descendant
        contains an error, so the original manual recursion over all
        children was redundant (and repeated work at every level).
        """
        return node.type == "ERROR" or node.has_error

    def _process_class(self, node: Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a class node and return a CodeChunk"""
        name = self._get_identifier_name(node)
        if not name:
            raise ValueError(f"Could not find class name in node: {node.text}")

        return CodeChunk(
            type=ChunkType.CLASS,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],  # tree-sitter rows are 0-based
            end_line=node.end_point[0],
            file_path=str(file_path),
            docstring=self._get_preceding_docstring(node, source_code)
        )

    def _process_function(self, node: Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a function node and return a CodeChunk"""
        name = self._get_identifier_name(node)
        if not name:
            raise ValueError(f"Could not find function name in node: {node.text}")

        # Determine if this is a method within a class
        parent_node = node.parent
        parent_name = None
        if parent_node and parent_node.type == "class_definition":
            parent_name = self._get_identifier_name(parent_node)

        return CodeChunk(
            type=ChunkType.FUNCTION,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],  # tree-sitter rows are 0-based
            end_line=node.end_point[0],
            file_path=str(file_path),
            parent_name=parent_name,
            docstring=self._get_preceding_docstring(node, source_code)
        )

    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single Python file and return list of code chunks.

        Returns an empty list for unsupported extensions, oversized files,
        files with syntax errors, or on any parsing failure (errors are
        logged, never raised to the caller).
        """
        if not self.language_config.supports_extension(file_path.suffix):
            return []

        # Check file size limit
        if file_path.stat().st_size > self.language_config.max_file_size:
            LOG.warning(f"Skipping file {file_path}: exceeds size limit of {self.language_config.max_file_size} bytes")
            return []

        try:
            with open(file_path, 'rb') as f:
                source_code = f.read()

            if not self.parser:
                raise RuntimeError("Parser not initialized. Call setup() first.")

            tree = self.parser.parse(source_code)

            # Check for overall syntax validity
            if self._has_syntax_error(tree.root_node):
                LOG.warning(f"Syntax errors found in {file_path}")
                return []

            chunks: List[CodeChunk] = []

            # Process only top-level definitions of the syntax tree.
            for node in tree.root_node.children:
                if node.type == "class_definition":
                    chunks.append(self._process_class(node, source_code, file_path))
                elif node.type == "function_definition":
                    chunks.append(self._process_function(node, source_code, file_path))
                else:
                    # Skip other node types for now
                    pass

            return chunks

        except Exception as e:
            # Boundary handler: any per-file failure is logged and the file
            # skipped, so one bad file cannot abort a whole-repo scan.
            LOG.error(f"Error parsing file {file_path}: {str(e)}")
            return []
|