gabykim committed on
Commit
2d4f211
·
1 Parent(s): f22ea30

python code parsing draft

Browse files
full_requirements.txt CHANGED
@@ -47,6 +47,7 @@ sniffio==1.3.1
47
  tokenizers==0.21.0
48
  tqdm==4.67.1
49
  tree-sitter==0.24.0
 
50
  types-requests==2.32.0.20241016
51
  typing-inspect==0.9.0
52
  typing_extensions==4.12.2
 
47
  tokenizers==0.21.0
48
  tqdm==4.67.1
49
  tree-sitter==0.24.0
50
+ tree-sitter-languages==1.10.2
51
  types-requests==2.32.0.20241016
52
  typing-inspect==0.9.0
53
  typing_extensions==4.12.2
parser/parse.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Dict
2
+ from enum import Enum
3
+ from pathlib import Path
4
+ import tree_sitter
5
+ from tree_sitter_languages import get_language, get_parser
6
+ from pydantic import BaseModel, Field
7
+ from git import Repo
8
+ from utils.fancy_log import FancyLogger
9
+
10
+ LOG = FancyLogger(__name__)
11
+
12
+
13
class ChunkType(str, Enum):
    """Category assigned to a parsed code chunk.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    CLASS = "class"        # a class definition
    FUNCTION = "function"  # a function definition
    MODULE = "module"      # a whole module
    OTHER = "other"        # anything that is none of the above
18
+
19
class CodeChunk(BaseModel):
    """A span of source code together with the metadata describing it."""

    # What kind of construct the chunk holds.
    type: ChunkType
    # The chunk's source text.
    content: str
    # First and last row of the chunk, as reported by the parser.
    start_line: int
    end_line: int
    # Path of the file the chunk was read from.
    file_path: str
    # Identifier of the class/function; unset for anonymous chunks.
    name: Optional[str] = None
    # Enclosing class/function name, for nested definitions.
    parent_name: Optional[str] = None
    # Docstring of the class/function, when one was found.
    docstring: Optional[str] = None
29
+
30
class CodeParser:
    """Split the Python files of a Git working tree into ``CodeChunk`` records.

    Every top-level class or function (decorated or not) becomes one chunk.
    Any other top-level statement, apart from comments and empty statements,
    becomes a chunk of type ``ChunkType.OTHER``.
    """

    parser: tree_sitter.Parser = None
    language: tree_sitter.Language = None
    # Misspelled name kept only so existing references to ``laguage`` resolve.
    laguage: tree_sitter.Language = None

    def __init__(self, repo_path: str):
        """Initialize the parser with a repository path"""
        self.repo_path = Path(repo_path)
        self._init_tree_sitter()

    def _init_tree_sitter(self):
        """Load the tree-sitter parser and grammar for Python."""
        # NOTE(review): tree_sitter_languages is believed to require an older
        # tree-sitter core than the 0.24.0 pinned in full_requirements.txt;
        # confirm get_parser() works with the installed pair.
        self.parser = get_parser('python')
        self.language = get_language('python')

    def _extract_docstring(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
        """Return the docstring literal of a class or function node.

        Args:
            node: A ``class_definition`` or ``function_definition`` node.
            source_code: The bytes the node was parsed from.

        Returns:
            The source text of the docstring literal (quotes included), or
            ``None`` when the first statement of the body is not a string.
        """
        # The docstring lives inside the definition's ``body`` block, not among
        # the definition's direct children; fall back to the node itself when
        # no body field is available.
        body = node.child_by_field_name("body")
        container = body if body is not None else node
        for child in container.children:
            if child.type == "comment":
                continue
            if child.type != "expression_statement":
                if body is None:
                    # Direct children also include keywords, name, etc.
                    continue
                return None
            if not child.children:
                return None
            string_node = child.children[0]
            if string_node.type in ("string", "string_literal"):
                return source_code[string_node.start_byte:string_node.end_byte].decode('utf-8')
            # Only the first statement of a body can be a docstring.
            return None
        return None

    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single file and return list of code chunks

        Args:
            file_path: Path of the file to parse; files without a ``.py``
                suffix are skipped with a warning.

        Returns:
            One chunk per top-level construct, or an empty list when the file
            is skipped or cannot be read/parsed (the error is logged).
        """
        file_path = Path(file_path)
        if file_path.suffix != '.py':
            LOG.warning(f"Skipping non-Python file: {file_path}")
            return []

        try:
            source_code = file_path.read_bytes()
            tree = self.parser.parse(source_code)
            chunks: List[CodeChunk] = []

            for node in tree.root_node.children:
                # A decorated class/function is wrapped in a
                # ``decorated_definition``; classify it by the wrapped
                # definition but keep the decorators in the chunk's span.
                target = node
                if node.type == "decorated_definition":
                    inner = node.child_by_field_name("definition")
                    if inner is not None:
                        target = inner

                if target.type == "class_definition":
                    chunks.append(self._process_class(target, source_code, file_path, span_node=node))
                elif target.type == "function_definition":
                    chunks.append(self._process_function(target, source_code, file_path, span_node=node))
                elif node.type not in ("comment", "empty_statement"):
                    # Store other top-level code as separate chunks
                    chunks.append(CodeChunk(
                        type=ChunkType.OTHER,
                        content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
                        start_line=node.start_point[0],
                        end_line=node.end_point[0],
                        file_path=str(file_path)
                    ))

            return chunks
        except Exception as e:
            # Best-effort boundary: one unreadable file must not abort a
            # repository-wide run.
            LOG.error(f"Error parsing file {file_path}: {str(e)}")
            return []

    def _definition_chunk(
        self,
        chunk_type: ChunkType,
        node: tree_sitter.Node,
        source_code: bytes,
        file_path: Path,
        span_node: Optional[tree_sitter.Node] = None,
    ) -> CodeChunk:
        """Build the chunk for a class/function definition node.

        ``span_node`` (defaulting to ``node``) supplies the content and line
        range, so a decorated definition can include its decorators.
        """
        span = span_node if span_node is not None else node
        # A syntactically broken definition may have no name; record ``None``
        # rather than raising and losing every chunk of the file.
        name_node = node.child_by_field_name("name")
        name = None
        if name_node is not None:
            name = source_code[name_node.start_byte:name_node.end_byte].decode('utf-8')

        return CodeChunk(
            type=chunk_type,
            name=name,
            content=source_code[span.start_byte:span.end_byte].decode('utf-8'),
            start_line=span.start_point[0],
            end_line=span.end_point[0],
            file_path=str(file_path),
            docstring=self._extract_docstring(node, source_code)
        )

    def _process_class(
        self,
        node: tree_sitter.Node,
        source_code: bytes,
        file_path: Path,
        span_node: Optional[tree_sitter.Node] = None,
    ) -> CodeChunk:
        """Process a class node and return a CodeChunk"""
        return self._definition_chunk(ChunkType.CLASS, node, source_code, file_path, span_node)

    def _process_function(
        self,
        node: tree_sitter.Node,
        source_code: bytes,
        file_path: Path,
        span_node: Optional[tree_sitter.Node] = None,
    ) -> CodeChunk:
        """Process a function node and return a CodeChunk"""
        return self._definition_chunk(ChunkType.FUNCTION, node, source_code, file_path, span_node)

    def parse_repository(self) -> List[CodeChunk]:
        """Parse all Python files in the repository

        Returns:
            The chunks of every ``.py`` file found under the repository's
            working tree, in sorted path order. Errors are logged and the
            chunks collected so far are returned.
        """
        chunks: List[CodeChunk] = []

        try:
            # Opening the Repo validates that repo_path is a Git repository.
            repo = Repo(self.repo_path)
            if repo.working_tree_dir is None:
                LOG.error("Error processing repository: repository has no working tree")
                return chunks

            root = Path(repo.working_tree_dir)
            # NOTE: this walks the directory, so untracked and ignored files
            # are included as well.
            for file_path in sorted(root.rglob("*.py")):
                if ".git" in file_path.relative_to(root).parts:
                    continue
                if file_path.is_file():
                    chunks.extend(self.parse_file(file_path))
        except Exception as e:
            LOG.error(f"Error processing repository: {str(e)}")

        return chunks
137
+
138
+
139
# Usage example: parse a repository and print a one-line summary per chunk.
if __name__ == "__main__":
    repo_parser = CodeParser("path/to/repo")
    for chunk in repo_parser.parse_repository():
        print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")
simple_requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  pydantic-ai==0.0.20
2
- tree-sitter==0.24.0
3
  GitPython==3.1.44
 
1
  pydantic-ai==0.0.20
2
+ tree-sitter-languages==1.10.2
3
  GitPython==3.1.44
utils/fancy_log.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import logging.config
4
+ import logging.handlers
5
+ import os
6
+ import queue
7
+
8
# Console output switches to JSON when the JSON_LOGGING environment variable
# is "true" (case-insensitive); any other value keeps the coloured format.
JSON_LOGGING = os.environ.get("JSON_LOGGING", "false").lower() == "true"

# Custom level registered under the name "CHAT", one below logging.WARNING (30).
CHAT = 29
logging.addLevelName(CHAT, "CHAT")

# ANSI escape sequences.
RESET_SEQ: str = "\033[0m"
COLOR_SEQ: str = "\033[1;%dm"
BOLD_SEQ: str = "\033[1m"
UNDERLINE_SEQ: str = "\033[04m"

ORANGE: str = "\033[33m"
YELLOW: str = "\033[93m"
WHITE: str = "\033[37m"
BLUE: str = "\033[34m"
LIGHT_BLUE: str = "\033[94m"
RED: str = "\033[91m"
GREY: str = "\033[90m"
GREEN: str = "\033[92m"

# Emoji prefixed to the message, keyed by level name.
EMOJIS: dict[str, str] = {
    "DEBUG": "🐛",
    "INFO": "📝",
    "CHAT": "💬",
    "WARNING": "⚠️",
    "ERROR": "❌",
    "CRITICAL": "💥",
}

# Colour applied to the level name and message, keyed by level name.
KEYWORD_COLORS: dict[str, str] = {
    "DEBUG": WHITE,
    "INFO": LIGHT_BLUE,
    "CHAT": GREEN,
    "WARNING": YELLOW,
    "ERROR": ORANGE,
    "CRITICAL": RED,
}
44
+
45
+
46
+ class JsonFormatter(logging.Formatter):
47
+ def format(self, record):
48
+ return json.dumps(record.__dict__)
49
+
50
+
51
+ def formatter_message(message: str, use_color: bool = True) -> str:
52
+ """
53
+ Syntax highlight certain keywords
54
+ """
55
+ if use_color:
56
+ message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
57
+ else:
58
+ message = message.replace("$RESET", "").replace("$BOLD", "")
59
+ return message
60
+
61
+
62
+ def format_word(
63
+ message: str, word: str, color_seq: str, bold: bool = False, underline: bool = False
64
+ ) -> str:
65
+ """
66
+ Surround the fiven word with a sequence
67
+ """
68
+ replacer = color_seq + word + RESET_SEQ
69
+ if underline:
70
+ replacer = UNDERLINE_SEQ + replacer
71
+ if bold:
72
+ replacer = BOLD_SEQ + replacer
73
+ return message.replace(word, replacer)
74
+
75
+
76
+ class ConsoleFormatter(logging.Formatter):
77
+ """
78
+ This Formatted simply colors in the levelname i.e 'INFO', 'DEBUG'
79
+ """
80
+
81
+ def __init__(
82
+ self, fmt: str, datefmt: str = None, style: str = "%", use_color: bool = True
83
+ ):
84
+ super().__init__(fmt, datefmt, style)
85
+ self.use_color = use_color
86
+
87
+ def format(self, record: logging.LogRecord) -> str:
88
+ """
89
+ Format and highlight certain keywords
90
+ """
91
+ rec = record
92
+ levelname = rec.levelname
93
+ if self.use_color and levelname in KEYWORD_COLORS:
94
+ levelname_color = KEYWORD_COLORS[levelname] + levelname + RESET_SEQ
95
+ rec.levelname = levelname_color
96
+ rec.name = f"{GREY}{rec.name:<15}{RESET_SEQ}"
97
+ rec.msg = (
98
+ KEYWORD_COLORS[levelname] + EMOJIS[levelname] + " " + rec.msg + RESET_SEQ
99
+ )
100
+ return logging.Formatter.format(self, rec)
101
+
102
+
103
+ class FancyLogger(logging.Logger):
104
+ """
105
+ This adds extra logging functions such as logger.trade and also
106
+ sets the logger to use the custom formatter
107
+ """
108
+
109
+ CONSOLE_FORMAT: str = (
110
+ "[%(asctime)s] [$BOLD%(name)-15s$RESET] [%(levelname)-8s]\t%(message)s"
111
+ )
112
+ FORMAT: str = "%(asctime)s %(name)-15s %(levelname)-8s %(message)s"
113
+ COLOR_FORMAT: str = formatter_message(CONSOLE_FORMAT, True)
114
+ JSON_FORMAT: str = '{"time": "%(asctime)s", "name": "%(name)s", "level": "%(levelname)s", "message": "%(message)s"}'
115
+
116
+ def __init__(self, name: str, logLevel: str = "DEBUG"):
117
+ logging.Logger.__init__(self, name, logLevel)
118
+
119
+ # Queue Handler
120
+ queue_handler = logging.handlers.QueueHandler(queue.Queue(-1))
121
+ json_formatter = logging.Formatter(self.JSON_FORMAT)
122
+ queue_handler.setFormatter(json_formatter)
123
+ self.addHandler(queue_handler)
124
+
125
+ if JSON_LOGGING:
126
+ console_formatter = JsonFormatter()
127
+ else:
128
+ console_formatter = ConsoleFormatter(self.COLOR_FORMAT)
129
+ console = logging.StreamHandler()
130
+ console.setFormatter(console_formatter)
131
+ self.addHandler(console)
132
+
133
+ def chat(self, role: str, openai_repsonse: dict, messages=None, *args, **kws):
134
+ """
135
+ Parse the content, log the message and extract the usage into prometheus metrics
136
+ """
137
+ role_emojis = {
138
+ "system": "πŸ–₯️",
139
+ "user": "πŸ‘€",
140
+ "assistant": "πŸ€–",
141
+ "function": "βš™οΈ",
142
+ }
143
+ if self.isEnabledFor(CHAT):
144
+ if messages:
145
+ for message in messages:
146
+ self._log(
147
+ CHAT,
148
+ f"{role_emojis.get(message['role'], 'πŸ”΅')}: {message['content']}",
149
+ )
150
+ else:
151
+ response = json.loads(openai_repsonse)
152
+
153
+ self._log(
154
+ CHAT,
155
+ f"{role_emojis.get(role, 'πŸ”΅')}: {response['choices'][0]['message']['content']}",
156
+ )
157
+
158
+
159
+ class QueueLogger(logging.Logger):
160
+ """
161
+ Custom logger class with queue
162
+ """
163
+
164
+ def __init__(self, name: str, level: int = logging.NOTSET):
165
+ super().__init__(name, level)
166
+ queue_handler = logging.handlers.QueueHandler(queue.Queue(-1))
167
+ self.addHandler(queue_handler)
168
+
169
+
170
+ logging_config: dict = dict(
171
+ version=1,
172
+ formatters={
173
+ "console": {
174
+ "()": ConsoleFormatter,
175
+ "format": FancyLogger.COLOR_FORMAT,
176
+ },
177
+ },
178
+ handlers={
179
+ "h": {
180
+ "class": "logging.StreamHandler",
181
+ "formatter": "console",
182
+ "level": logging.INFO,
183
+ },
184
+ },
185
+ root={
186
+ "handlers": ["h"],
187
+ "level": logging.INFO,
188
+ },
189
+ loggers={
190
+ "autogpt": {
191
+ "handlers": ["h"],
192
+ "level": logging.INFO,
193
+ "propagate": False,
194
+ },
195
+ },
196
+ )
197
+
198
+
199
+ def setup_logger():
200
+ """
201
+ Setup the logger with the specified format
202
+ """
203
+ logging.config.dictConfig(logging_config)