python code parsing draft
- full_requirements.txt +1 -0
- parser/parse.py +144 -0
- simple_requirements.txt +1 -1
- utils/fancy_log.py +203 -0
full_requirements.txt
CHANGED
@@ -47,6 +47,7 @@ sniffio==1.3.1
 tokenizers==0.21.0
 tqdm==4.67.1
 tree-sitter==0.24.0
+tree-sitter-languages==1.10.2
 types-requests==2.32.0.20241016
 typing-inspect==0.9.0
 typing_extensions==4.12.2
parser/parse.py
ADDED
@@ -0,0 +1,144 @@
from typing import List, Optional
from enum import Enum
from pathlib import Path
import tree_sitter
from tree_sitter_languages import get_language, get_parser
from pydantic import BaseModel
from git import Repo
from utils.fancy_log import FancyLogger

LOG = FancyLogger(__name__)


class ChunkType(str, Enum):
    CLASS = "class"
    FUNCTION = "function"
    MODULE = "module"
    OTHER = "other"


class CodeChunk(BaseModel):
    """Represents a chunk of code with its metadata"""
    type: ChunkType
    content: str
    start_line: int
    end_line: int
    file_path: str
    name: Optional[str] = None
    parent_name: Optional[str] = None  # For nested classes/functions
    docstring: Optional[str] = None


class CodeParser:
    parser: Optional[tree_sitter.Parser] = None
    language: Optional[tree_sitter.Language] = None

    def __init__(self, repo_path: str):
        """Initialize the parser with a repository path"""
        self.repo_path = Path(repo_path)
        self._init_tree_sitter()

    def _init_tree_sitter(self):
        """Initialize tree-sitter with Python language support"""
        # In a real implementation we'd need to handle language loading more robustly;
        # for the MVP we assume the prebuilt Python parser is available.
        self.parser = get_parser('python')
        self.language = get_language('python')

    def _extract_docstring(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
        """Extract the docstring from a class or function node"""
        # The statements of a class/function live in the node's "body" block.
        body = node.child_by_field_name("body")
        if body is None:
            return None
        for child in body.children:
            if child.type == "expression_statement" and child.children:
                string_node = child.children[0]
                if string_node.type in ("string", "string_literal"):
                    return source_code[string_node.start_byte:string_node.end_byte].decode('utf-8')
        return None

    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single file and return a list of code chunks"""
        if file_path.suffix != '.py':
            LOG.warning(f"Skipping non-Python file: {file_path}")
            return []

        try:
            with open(file_path, 'rb') as f:
                source_code = f.read()

            tree = self.parser.parse(source_code)
            chunks: List[CodeChunk] = []

            # Process the top level of the syntax tree
            for node in tree.root_node.children:
                if node.type == "class_definition":
                    chunks.append(self._process_class(node, source_code, file_path))
                elif node.type == "function_definition":
                    chunks.append(self._process_function(node, source_code, file_path))
                elif node.type not in ("comment", "empty_statement"):
                    # Store other top-level code as separate chunks
                    chunks.append(CodeChunk(
                        type=ChunkType.OTHER,
                        content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
                        start_line=node.start_point[0],
                        end_line=node.end_point[0],
                        file_path=str(file_path)
                    ))

            return chunks
        except Exception as e:
            LOG.error(f"Error parsing file {file_path}: {e}")
            return []

    def _process_class(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a class node and return a CodeChunk"""
        name = next(child.text.decode('utf-8')
                    for child in node.children
                    if child.type == "identifier")

        return CodeChunk(
            type=ChunkType.CLASS,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],
            end_line=node.end_point[0],
            file_path=str(file_path),
            docstring=self._extract_docstring(node, source_code)
        )

    def _process_function(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a function node and return a CodeChunk"""
        name = next(child.text.decode('utf-8')
                    for child in node.children
                    if child.type == "identifier")

        return CodeChunk(
            type=ChunkType.FUNCTION,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],
            end_line=node.end_point[0],
            file_path=str(file_path),
            docstring=self._extract_docstring(node, source_code)
        )

    def parse_repository(self) -> List[CodeChunk]:
        """Parse all Python files tracked in the repository"""
        chunks: List[CodeChunk] = []

        try:
            repo = Repo(self.repo_path)
            # Walk every blob in the repository tree and parse the Python files
            for blob in repo.tree().traverse():
                if blob.type == "blob" and blob.path.endswith('.py'):
                    chunks.extend(self.parse_file(self.repo_path / blob.path))
        except Exception as e:
            LOG.error(f"Error processing repository: {e}")

        return chunks


# Usage example:
if __name__ == "__main__":
    parser = CodeParser("path/to/repo")
    chunks = parser.parse_repository()
    for chunk in chunks:
        print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")
simple_requirements.txt
CHANGED
@@ -1,3 +1,3 @@
 pydantic-ai==0.0.20
-tree-sitter==
+tree-sitter-languages==1.10.2
 GitPython==3.1.44
utils/fancy_log.py
ADDED
@@ -0,0 +1,203 @@
import json
import logging
import logging.config
import logging.handlers
import os
import queue

JSON_LOGGING = os.environ.get("JSON_LOGGING", "false").lower() == "true"

CHAT = 29
logging.addLevelName(CHAT, "CHAT")

RESET_SEQ: str = "\033[0m"
COLOR_SEQ: str = "\033[1;%dm"
BOLD_SEQ: str = "\033[1m"
UNDERLINE_SEQ: str = "\033[04m"

ORANGE: str = "\033[33m"
YELLOW: str = "\033[93m"
WHITE: str = "\033[37m"
BLUE: str = "\033[34m"
LIGHT_BLUE: str = "\033[94m"
RED: str = "\033[91m"
GREY: str = "\033[90m"
GREEN: str = "\033[92m"

EMOJIS: dict[str, str] = {
    "DEBUG": "🐛",
    "INFO": "📝",
    "CHAT": "💬",
    "WARNING": "⚠️",
    "ERROR": "❌",
    "CRITICAL": "🔥",
}

KEYWORD_COLORS: dict[str, str] = {
    "DEBUG": WHITE,
    "INFO": LIGHT_BLUE,
    "CHAT": GREEN,
    "WARNING": YELLOW,
    "ERROR": ORANGE,
    "CRITICAL": RED,
}


class JsonFormatter(logging.Formatter):
    def format(self, record):
        return json.dumps(record.__dict__)


def formatter_message(message: str, use_color: bool = True) -> str:
    """
    Syntax highlight certain keywords
    """
    if use_color:
        message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
    else:
        message = message.replace("$RESET", "").replace("$BOLD", "")
    return message


def format_word(
    message: str, word: str, color_seq: str, bold: bool = False, underline: bool = False
) -> str:
    """
    Surround the given word with a color sequence
    """
    replacer = color_seq + word + RESET_SEQ
    if underline:
        replacer = UNDERLINE_SEQ + replacer
    if bold:
        replacer = BOLD_SEQ + replacer
    return message.replace(word, replacer)


class ConsoleFormatter(logging.Formatter):
    """
    This formatter simply colors in the levelname, i.e. 'INFO', 'DEBUG'
    """

    def __init__(
        self, fmt: str, datefmt: str = None, style: str = "%", use_color: bool = True
    ):
        super().__init__(fmt, datefmt, style)
        self.use_color = use_color

    def format(self, record: logging.LogRecord) -> str:
        """
        Format and highlight certain keywords
        """
        rec = record
        levelname = rec.levelname
        if self.use_color and levelname in KEYWORD_COLORS:
            levelname_color = KEYWORD_COLORS[levelname] + levelname + RESET_SEQ
            rec.levelname = levelname_color
            rec.name = f"{GREY}{rec.name:<15}{RESET_SEQ}"
            rec.msg = (
                KEYWORD_COLORS[levelname] + EMOJIS[levelname] + " " + rec.msg + RESET_SEQ
            )
        return logging.Formatter.format(self, rec)


class FancyLogger(logging.Logger):
    """
    This adds extra logging functions such as logger.chat and also
    sets the logger to use the custom formatter
    """

    CONSOLE_FORMAT: str = (
        "[%(asctime)s] [$BOLD%(name)-15s$RESET] [%(levelname)-8s]\t%(message)s"
    )
    FORMAT: str = "%(asctime)s %(name)-15s %(levelname)-8s %(message)s"
    COLOR_FORMAT: str = formatter_message(CONSOLE_FORMAT, True)
    JSON_FORMAT: str = '{"time": "%(asctime)s", "name": "%(name)s", "level": "%(levelname)s", "message": "%(message)s"}'

    def __init__(self, name: str, logLevel: str = "DEBUG"):
        logging.Logger.__init__(self, name, logLevel)

        # Queue handler
        queue_handler = logging.handlers.QueueHandler(queue.Queue(-1))
        json_formatter = logging.Formatter(self.JSON_FORMAT)
        queue_handler.setFormatter(json_formatter)
        self.addHandler(queue_handler)

        if JSON_LOGGING:
            console_formatter = JsonFormatter()
        else:
            console_formatter = ConsoleFormatter(self.COLOR_FORMAT)
        console = logging.StreamHandler()
        console.setFormatter(console_formatter)
        self.addHandler(console)

    def chat(self, role: str, openai_response: str, messages=None, *args, **kws):
        """
        Parse the content and log the chat message with a role emoji
        """
        role_emojis = {
            "system": "🖥️",
            "user": "👤",
            "assistant": "🤖",
            "function": "⚙️",
        }
        if self.isEnabledFor(CHAT):
            if messages:
                for message in messages:
                    self._log(
                        CHAT,
                        f"{role_emojis.get(message['role'], '🔵')}: {message['content']}",
                        (),
                    )
            else:
                response = json.loads(openai_response)

                self._log(
                    CHAT,
                    f"{role_emojis.get(role, '🔵')}: {response['choices'][0]['message']['content']}",
                    (),
                )


class QueueLogger(logging.Logger):
    """
    Custom logger class with queue
    """

    def __init__(self, name: str, level: int = logging.NOTSET):
        super().__init__(name, level)
        queue_handler = logging.handlers.QueueHandler(queue.Queue(-1))
        self.addHandler(queue_handler)


logging_config: dict = dict(
    version=1,
    formatters={
        "console": {
            "()": ConsoleFormatter,
            # Passed as a keyword argument to ConsoleFormatter
            "fmt": FancyLogger.COLOR_FORMAT,
        },
    },
    handlers={
        "h": {
            "class": "logging.StreamHandler",
            "formatter": "console",
            "level": logging.INFO,
        },
    },
    root={
        "handlers": ["h"],
        "level": logging.INFO,
    },
    loggers={
        "autogpt": {
            "handlers": ["h"],
            "level": logging.INFO,
            "propagate": False,
        },
    },
)


def setup_logger():
    """
    Setup the logger with the specified format
    """
    logging.config.dictConfig(logging_config)
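
A minimal sketch of how the two new modules might be exercised together (not part of this commit), assuming `parser` and `utils` are importable as packages from the repository root and that "path/to/repo" is replaced with a real git checkout:

# Sketch only: summarise a parsed repository with FancyLogger.
from parser.parse import CodeParser, ChunkType
from utils.fancy_log import FancyLogger

LOG = FancyLogger("demo")

code_parser = CodeParser("path/to/repo")  # hypothetical checkout path
chunks = code_parser.parse_repository()

# Count chunks per type for a quick overview of the repository.
counts = {chunk_type: 0 for chunk_type in ChunkType}
for chunk in chunks:
    counts[chunk.type] += 1

for chunk_type, count in counts.items():
    LOG.info(f"{chunk_type.value}: {count} chunk(s)")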