from typing import List, Optional
from enum import Enum
from pathlib import Path
import tree_sitter
from tree_sitter_languages import get_language, get_parser
from pydantic import BaseModel
from git import Repo
from utils.fancy_log import FancyLogger

LOG = FancyLogger(__name__)


class ChunkType(str, Enum):
    CLASS = "class"
    FUNCTION = "function"
    MODULE = "module"
    OTHER = "other"


class CodeChunk(BaseModel):
    """Represents a chunk of code with its metadata"""

    type: ChunkType
    content: str
    start_line: int
    end_line: int
    file_path: str
    name: Optional[str] = None
    parent_name: Optional[str] = None  # For nested classes/functions
    docstring: Optional[str] = None
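
# Example (illustrative only; the values below are hypothetical): a chunk for a
# top-level function spanning rows 10-20 of pkg/mod.py would be represented as
#   CodeChunk(type=ChunkType.FUNCTION, name="foo", content="def foo(): ...",
#             start_line=10, end_line=20, file_path="pkg/mod.py")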


class CodeParser:
    parser: Optional[tree_sitter.Parser] = None
    language: Optional[tree_sitter.Language] = None

    def __init__(self, repo_path: str):
        """Initialize the parser with a repository path"""
        self.repo_path = Path(repo_path)
        self._init_tree_sitter()

    def _init_tree_sitter(self):
        """Initialize tree-sitter with Python language support"""
        # In a real implementation we'd need to handle language loading more robustly;
        # for the MVP we assume the Python parser is available.
        self.parser = get_parser('python')
        self.language = get_language('python')
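
    # Illustrative sketch, not part of the original MVP: a more defensive variant
    # of the grammar loading hinted at above. The method name is hypothetical; it
    # wraps the same tree_sitter_languages calls and is never called by this class.
    def _init_tree_sitter_defensive(self):
        """Fail with a clear error if the Python grammar is unavailable"""
        try:
            self.parser = get_parser('python')
            self.language = get_language('python')
        except Exception as exc:
            raise RuntimeError("tree-sitter Python grammar is unavailable") from exc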

    def _extract_docstring(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
        """Extract docstring from a class or function node"""
        # The docstring lives inside the definition's body block, not directly
        # under the class/function node itself.
        body = node.child_by_field_name("body")
        if body is None:
            return None
        for child in body.children:
            if child.type == "expression_statement" and child.children:
                string_node = child.children[0]
                if string_node.type in ("string", "string_literal"):
                    return source_code[string_node.start_byte:string_node.end_byte].decode('utf-8')
        return None

    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single file and return list of code chunks"""
        if file_path.suffix != '.py':
            LOG.warning(f"Skipping non-Python file: {file_path}")
            return []
        try:
            with open(file_path, 'rb') as f:
                source_code = f.read()
            tree = self.parser.parse(source_code)
            chunks: List[CodeChunk] = []
            # Process the top-level nodes of the syntax tree
            for node in tree.root_node.children:
                if node.type == "class_definition":
                    chunks.append(self._process_class(node, source_code, file_path))
                elif node.type == "function_definition":
                    chunks.append(self._process_function(node, source_code, file_path))
                elif node.type not in ("comment", "empty_statement"):
                    # Store other top-level code as separate chunks
                    # (tree-sitter rows are 0-based)
                    chunks.append(CodeChunk(
                        type=ChunkType.OTHER,
                        content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
                        start_line=node.start_point[0],
                        end_line=node.end_point[0],
                        file_path=str(file_path)
                    ))
            return chunks
        except Exception as e:
            LOG.error(f"Error parsing file {file_path}: {e}")
            return []

    def _process_class(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a class node and return a CodeChunk"""
        name = next(child.text.decode('utf-8')
                    for child in node.children
                    if child.type == "identifier")
        return CodeChunk(
            type=ChunkType.CLASS,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],
            end_line=node.end_point[0],
            file_path=str(file_path),
            docstring=self._extract_docstring(node, source_code)
        )

    def _process_function(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
        """Process a function node and return a CodeChunk"""
        name = next(child.text.decode('utf-8')
                    for child in node.children
                    if child.type == "identifier")
        return CodeChunk(
            type=ChunkType.FUNCTION,
            name=name,
            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
            start_line=node.start_point[0],
            end_line=node.end_point[0],
            file_path=str(file_path),
            docstring=self._extract_docstring(node, source_code)
        )

    def parse_repository(self) -> List[CodeChunk]:
        """Parse all Python files tracked in the repository"""
        chunks: List[CodeChunk] = []
        try:
            repo = Repo(self.repo_path)
            # GitPython's Repo has no working-tree walker, so traverse the tree at
            # HEAD and resolve each tracked blob against the working directory.
            for blob in repo.tree().traverse():
                if blob.type == "blob" and blob.path.endswith('.py'):
                    file_path = self.repo_path / blob.path
                    chunks.extend(self.parse_file(file_path))
        except Exception as e:
            LOG.error(f"Error processing repository: {e}")
        return chunks


# Usage example:
if __name__ == "__main__":
    parser = CodeParser("path/to/repo")
    chunks = parser.parse_repository()
    for chunk in chunks:
        print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")