# File size: 5,562 Bytes
# 2d4f211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from typing import List, Optional, Dict
from enum import Enum
from pathlib import Path
import tree_sitter
from tree_sitter_languages import get_language, get_parser
from pydantic import BaseModel, Field
from git import Repo
from utils.fancy_log import FancyLogger

LOG = FancyLogger(__name__)


class ChunkType(str, Enum):
    """Category assigned to a parsed code chunk.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (e.g. ``ChunkType.CLASS == "class"``).
    """
    # A class definition.
    CLASS = "class"
    # A function definition.
    FUNCTION = "function"
    # NOTE(review): not assigned anywhere in the visible code — presumably
    # reserved for whole-module chunks; confirm before relying on it.
    MODULE = "module"
    # Any other top-level statement that is neither a class nor a function.
    OTHER = "other"

class CodeChunk(BaseModel):
    """Represents a chunk of code with its metadata.

    A chunk is one contiguous span of a source file together with the
    information needed to locate and describe it.
    """
    # Category of the chunk (class, function, ...); see ChunkType.
    type: ChunkType
    # Source text of the chunk, decoded from the file's raw bytes.
    content: str
    # Row of the chunk's first line, taken from tree-sitter's start_point.
    # tree-sitter rows are zero-based, so the first line of a file is 0.
    start_line: int
    # Row of the chunk's last line, taken from tree-sitter's end_point
    # (zero-based, like start_line).
    end_line: int
    # Path of the file the chunk was read from, stored as a string.
    file_path: str
    # Identifier of the class or function; left as None for other chunks.
    name: Optional[str] = None
    # NOTE(review): nothing in the visible code sets this field.
    parent_name: Optional[str] = None  # For nested classes/functions
    # Raw source text of the definition's docstring literal, when one was
    # found; None otherwise.
    docstring: Optional[str] = None

class CodeParser:
    """Split the Python files of a git repository into top-level code chunks.

    Each file is parsed with tree-sitter and every top-level statement becomes
    one ``CodeChunk``: classes and functions (decorated or not) are tagged and
    named, comments are skipped, and everything else is stored as ``OTHER``.

    Line numbers stored on the chunks are tree-sitter row indices, which are
    zero-based.
    """

    # Both are populated per instance by _init_tree_sitter(); None until then.
    parser: Optional[tree_sitter.Parser] = None
    language: Optional[tree_sitter.Language] = None

    # Top-level node types that never become a chunk of their own.
    _SKIPPED_NODE_TYPES = ("comment", "empty_statement")

    def __init__(self, repo_path: str):
        """Initialize the parser with a repository path.

        Args:
            repo_path: Location of the git repository to parse.
        """
        self.repo_path = Path(repo_path)
        self._init_tree_sitter()

    def _init_tree_sitter(self):
        """Initialize tree-sitter with Python language support."""
        # In real implementation, we'd need to handle language loading more robustly
        # For MVP, we'll assume Python parser is available
        self.parser = get_parser('python')
        self.language = get_language('python')

    @staticmethod
    def _node_text(node: tree_sitter.Node, source_code: bytes) -> str:
        """Return the source text covered by *node*.

        Undecodable bytes are replaced rather than raised on, so one stray
        byte cannot cause a whole file to be dropped.
        """
        return source_code[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

    def _extract_docstring(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
        """Extract the docstring from a class or function node.

        Args:
            node: A ``class_definition`` or ``function_definition`` node.
            source_code: Raw bytes of the file the node was parsed from.

        Returns:
            The raw source text of the docstring literal (quotes included),
            or ``None`` when the body does not start with a string literal.
        """
        # The statements of a definition live inside its ``body`` block, not
        # directly under the definition node itself.
        body = node.child_by_field_name("body")
        if body is None:
            return None

        for statement in body.children:
            # Comments may precede the docstring without displacing it.
            if statement.type == "comment":
                continue
            # Only the first real statement can be a docstring.
            if statement.type != "expression_statement" or not statement.children:
                return None
            literal = statement.children[0]
            if literal.type in ("string", "string_literal"):
                return self._node_text(literal, source_code)
            return None
        return None

    def parse_file(self, file_path: Path) -> List[CodeChunk]:
        """Parse a single file and return its list of code chunks.

        Args:
            file_path: Path of the file to parse; only ``.py`` files are
                processed.

        Returns:
            One chunk per top-level statement (comments excluded). An empty
            list is returned for non-Python files and when reading or parsing
            fails; the failure is logged rather than raised.
        """
        file_path = Path(file_path)  # tolerate callers passing a plain string
        if file_path.suffix != '.py':
            LOG.warning(f"Skipping non-Python file: {file_path}")
            return []

        try:
            source_code = file_path.read_bytes()
            tree = self.parser.parse(source_code)

            chunks: List[CodeChunk] = []
            for node in tree.root_node.children:
                chunk = self._chunk_for_node(node, source_code, file_path)
                if chunk is not None:
                    chunks.append(chunk)
            return chunks
        except Exception as e:
            # Best-effort boundary: one unreadable file must not abort a
            # whole-repository run.
            LOG.error(f"Error parsing file {file_path}: {str(e)}")
            return []

    def _chunk_for_node(self, node: tree_sitter.Node, source_code: bytes,
                        file_path: Path) -> Optional[CodeChunk]:
        """Turn one top-level node into a chunk, or ``None`` if it is skipped."""
        # A decorated class/function is wrapped in a ``decorated_definition``
        # node; classify it by the definition it wraps.
        definition = node
        if node.type == "decorated_definition":
            wrapped = node.child_by_field_name("definition")
            if wrapped is not None:
                definition = wrapped

        if definition.type == "class_definition":
            return self._process_class(definition, source_code, file_path, span_node=node)
        if definition.type == "function_definition":
            return self._process_function(definition, source_code, file_path, span_node=node)
        if node.type in self._SKIPPED_NODE_TYPES:
            return None

        # Store other top-level code as separate chunks
        return CodeChunk(
            type=ChunkType.OTHER,
            content=self._node_text(node, source_code),
            start_line=node.start_point[0],
            end_line=node.end_point[0],
            file_path=str(file_path)
        )

    def _definition_chunk(self, chunk_type: ChunkType, node: tree_sitter.Node,
                          source_code: bytes, file_path: Path,
                          span_node: Optional[tree_sitter.Node] = None) -> CodeChunk:
        """Build the chunk for a class or function definition.

        ``span_node`` is the node whose extent is stored as the chunk's
        content and line range; it defaults to ``node`` and is the enclosing
        ``decorated_definition`` when decorators should be included.
        """
        span = node if span_node is None else span_node
        # A malformed definition may lack a name; record None instead of
        # raising and losing every chunk of the file.
        name_node = node.child_by_field_name("name")
        name = None if name_node is None else self._node_text(name_node, source_code)

        return CodeChunk(
            type=chunk_type,
            name=name,
            content=self._node_text(span, source_code),
            start_line=span.start_point[0],
            end_line=span.end_point[0],
            file_path=str(file_path),
            docstring=self._extract_docstring(node, source_code)
        )

    def _process_class(self, node: tree_sitter.Node, source_code: bytes, file_path: Path,
                       span_node: Optional[tree_sitter.Node] = None) -> CodeChunk:
        """Process a class node and return a CodeChunk.

        Args:
            node: The ``class_definition`` node.
            source_code: Raw bytes of the file being parsed.
            file_path: Path of the file being parsed.
            span_node: Optional enclosing node (e.g. a decorated definition)
                whose extent should be used for content and line numbers.
        """
        return self._definition_chunk(ChunkType.CLASS, node, source_code, file_path, span_node)

    def _process_function(self, node: tree_sitter.Node, source_code: bytes, file_path: Path,
                          span_node: Optional[tree_sitter.Node] = None) -> CodeChunk:
        """Process a function node and return a CodeChunk.

        Args:
            node: The ``function_definition`` node.
            source_code: Raw bytes of the file being parsed.
            file_path: Path of the file being parsed.
            span_node: Optional enclosing node (e.g. a decorated definition)
                whose extent should be used for content and line numbers.
        """
        return self._definition_chunk(ChunkType.FUNCTION, node, source_code, file_path, span_node)

    def parse_repository(self) -> List[CodeChunk]:
        """Parse all Python files in the repository's working tree.

        Returns:
            The chunks of every ``.py`` file found under the working tree, in
            sorted path order. Errors are logged and whatever was collected
            before the failure is returned.
        """
        chunks: List[CodeChunk] = []

        try:
            repo = Repo(self.repo_path)
            work_tree = repo.working_tree_dir
            if work_tree is None:
                # Bare repositories have no checked-out files to walk.
                LOG.error(f"Error processing repository: {self.repo_path} has no working tree")
                return chunks

            root = Path(work_tree)
            for file_path in sorted(root.rglob("*.py")):
                # Never descend into git's own metadata directory.
                if ".git" in file_path.relative_to(root).parts:
                    continue
                if not file_path.is_file():
                    continue
                chunks.extend(self.parse_file(file_path))
        except Exception as e:
            LOG.error(f"Error processing repository: {str(e)}")

        return chunks


# Usage example: parse a repository and print one summary line per chunk.
if __name__ == "__main__":
    for chunk in CodeParser("path/to/repo").parse_repository():
        print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")