gabykim committed on
Commit
0e9e5fc
·
1 Parent(s): 2182a08

refactor summarizer file position

Browse files
src/know_lang_bot/code_parser/parser.py DELETED
@@ -1,176 +0,0 @@
1
- import os
2
- from typing import List, Optional, Dict
3
- from enum import Enum
4
- from pathlib import Path
5
- from tree_sitter import Language, Parser, Node
6
- import tree_sitter_python
7
- from pydantic import BaseModel
8
- from git import Repo
9
- from know_lang_bot.utils.fancy_log import FancyLogger
10
-
11
- LOG = FancyLogger(__name__)
12
-
13
-
14
- class ChunkType(str, Enum):
15
- CLASS = "class"
16
- FUNCTION = "function"
17
- OTHER = "other"
18
-
19
- class CodeChunk(BaseModel):
20
- """Represents a chunk of code with its metadata"""
21
- type: ChunkType
22
- content: str
23
- start_line: int
24
- end_line: int
25
- file_path: str
26
- name: Optional[str] = None
27
- parent_name: Optional[str] = None # For nested classes/functions
28
- docstring: Optional[str] = None
29
-
30
- class CodeParser:
31
- parser : Parser = None
32
- laguage: Language = None
33
-
34
- def __init__(self, repo_path: str):
35
- """Initialize the parser with a repository path"""
36
- self.repo_path = Path(repo_path)
37
- self._init_tree_sitter()
38
-
39
- def _init_tree_sitter(self):
40
- """Initialize tree-sitter with Python language support"""
41
- # In real implementation, we'd need to handle language loading more robustly
42
- # For MVP, we'll assume Python parser is available
43
- self.language = Language(tree_sitter_python.language())
44
- self.parser = Parser(self.language)
45
-
46
- def _get_preceding_docstring(self, node: Node, source_code: bytes) -> Optional[str]:
47
- """Extract docstring from comments"""
48
- docstring_parts = []
49
- current_node : Node = node.prev_sibling
50
-
51
- while current_node:
52
- print(current_node.text)
53
- if current_node.type == "comment":
54
- comment = source_code[current_node.start_byte:current_node.end_byte].decode('utf-8')
55
- docstring_parts.insert(0, comment)
56
- elif current_node.type == "expression_statement":
57
- string_node = current_node.children[0] if current_node.children else None
58
- if string_node and string_node.type in ("string", "string_literal"):
59
- docstring = source_code[string_node.start_byte:string_node.end_byte].decode('utf-8')
60
- docstring_parts.insert(0, docstring)
61
-
62
- break
63
- elif current_node.type not in ("empty_statement", "newline"):
64
- break
65
- current_node = current_node.prev_sibling
66
-
67
- return '\n'.join(docstring_parts) if docstring_parts else None
68
-
69
- def _has_syntax_error(self, node: Node) -> bool:
70
- """Check if the node or its children contain syntax errors"""
71
- if node.type == "ERROR":
72
- return True
73
- if node.has_error:
74
- return True
75
- return any(self._has_syntax_error(child) for child in node.children)
76
-
77
- def parse_file(self, file_path: Path) -> List[CodeChunk]:
78
- """Parse a single file and return list of code chunks"""
79
- if not file_path.suffix == '.py':
80
- LOG.warning(f"Skipping non-Python file: {file_path}")
81
- return []
82
-
83
- try:
84
- with open(file_path, 'rb') as f:
85
- source_code = f.read()
86
-
87
- tree = self.parser.parse(source_code)
88
-
89
- # Check for overall syntax validity
90
- if self._has_syntax_error(tree.root_node):
91
- LOG.warning(f"Syntax errors found in {file_path}")
92
- return []
93
-
94
- chunks: List[CodeChunk] = []
95
-
96
- # Process the syntax tree
97
- for node in tree.root_node.children:
98
- if node.type == "class_definition":
99
- chunks.append(self._process_class(node, source_code, file_path))
100
- elif node.type == "function_definition":
101
- chunks.append(self._process_function(node, source_code, file_path))
102
- else:
103
- # Skip other node types for now
104
- pass
105
-
106
- return chunks
107
- except Exception as e:
108
- LOG.error(f"Error parsing file {file_path}: {str(e)}")
109
- return []
110
-
111
- def _process_class(self, node: Node, source_code: bytes, file_path: Path) -> CodeChunk:
112
- """Process a class node and return a CodeChunk"""
113
- name = next(child.text.decode('utf-8')
114
- for child in node.children
115
- if child.type == "identifier")
116
-
117
- if not name:
118
- raise ValueError(f"Could not find class name in node: {node.text}")
119
-
120
- return CodeChunk(
121
- type=ChunkType.CLASS,
122
- name=name,
123
- content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
124
- start_line=node.start_point[0],
125
- end_line=node.end_point[0],
126
- file_path=str(file_path),
127
- docstring=self._get_preceding_docstring(node, source_code)
128
- )
129
-
130
- def _process_function(self, node: Node, source_code: bytes, file_path: Path) -> CodeChunk:
131
- """Process a function node and return a CodeChunk"""
132
- name = next(child.text.decode('utf-8')
133
- for child in node.children
134
- if child.type == "identifier")
135
-
136
- if not name:
137
- raise ValueError(f"Could not find function name in node: {node.text}")
138
-
139
- return CodeChunk(
140
- type=ChunkType.FUNCTION,
141
- name=name,
142
- content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
143
- start_line=node.start_point[0],
144
- end_line=node.end_point[0],
145
- file_path=str(file_path),
146
- docstring=self._get_preceding_docstring(node, source_code)
147
- )
148
-
149
- def parse_repository(self) -> List[CodeChunk]:
150
- """Parse all Python files in the repository"""
151
- chunks: List[CodeChunk] = []
152
-
153
- try:
154
- repo = Repo(self.repo_path)
155
-
156
- if repo.bare:
157
- raise ValueError(f"Repository {self.repo_path} is bare and has no working directory")
158
-
159
- for dirpath, _, filenames in os.walk(repo.working_tree_dir):
160
- if repo.ignored(dirpath):
161
- LOG.debug(f"Skipping ignored directory: {dirpath}")
162
- continue
163
-
164
- for file in filenames:
165
- file_path = Path(dirpath) / file
166
-
167
- if repo.ignored(file_path):
168
- LOG.debug(f"Skipping ignored file: {file_path}")
169
- continue
170
-
171
- if file.endswith('.py'):
172
- chunks.extend(self.parse_file(file_path))
173
- except Exception as e:
174
- LOG.error(f"Error processing repository: {str(e)}")
175
-
176
- return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/know_lang_bot/{code_parser → summarizer}/summarizer.py RENAMED
File without changes
tests/test_summarizer.py CHANGED
@@ -2,8 +2,8 @@ import pytest
2
  import tempfile
3
  from unittest.mock import Mock, patch, AsyncMock
4
  from pathlib import Path
5
- from know_lang_bot.code_parser.summarizer import CodeSummarizer
6
- from know_lang_bot.code_parser.parser import CodeChunk, ChunkType
7
  from know_lang_bot.config import AppConfig
8
 
9
  @pytest.fixture
@@ -52,7 +52,7 @@ def mock_run_result(mock_summary):
52
  return mock_result
53
 
54
  @pytest.mark.asyncio
55
- @patch('know_lang_bot.code_parser.summarizer.Agent')
56
  async def test_summarize_chunk(mock_agent_class, config: AppConfig, sample_chunks: list[CodeChunk], mock_run_result: Mock):
57
  """Test summarizing a single chunk"""
58
  # Setup the mock agent instance
@@ -71,7 +71,7 @@ async def test_summarize_chunk(mock_agent_class, config: AppConfig, sample_chunk
71
  assert "def hello()" in call_args
72
  assert "Says hello" in call_args
73
 
74
- @patch('know_lang_bot.code_parser.summarizer.Agent')
75
  def test_chromadb_initialization(mock_agent_class, config: AppConfig):
76
  """Test ChromaDB initialization"""
77
  mock_agent = mock_agent_class.return_value
@@ -85,8 +85,8 @@ def test_chromadb_initialization(mock_agent_class, config: AppConfig):
85
  assert new_summarizer.collection is not None
86
 
87
  @pytest.mark.asyncio
88
- @patch('know_lang_bot.code_parser.summarizer.ollama')
89
- @patch('know_lang_bot.code_parser.summarizer.Agent')
90
  async def test_process_and_store_chunk_with_embedding(
91
  mock_agent_class,
92
  mock_ollama,
 
2
  import tempfile
3
  from unittest.mock import Mock, patch, AsyncMock
4
  from pathlib import Path
5
+ from know_lang_bot.summarizer.summarizer import CodeSummarizer
6
+ from know_lang_bot.core.types import CodeChunk, ChunkType
7
  from know_lang_bot.config import AppConfig
8
 
9
  @pytest.fixture
 
52
  return mock_result
53
 
54
  @pytest.mark.asyncio
55
+ @patch('know_lang_bot.summarizer.summarizer.Agent')
56
  async def test_summarize_chunk(mock_agent_class, config: AppConfig, sample_chunks: list[CodeChunk], mock_run_result: Mock):
57
  """Test summarizing a single chunk"""
58
  # Setup the mock agent instance
 
71
  assert "def hello()" in call_args
72
  assert "Says hello" in call_args
73
 
74
+ @patch('know_lang_bot.summarizer.summarizer.Agent')
75
  def test_chromadb_initialization(mock_agent_class, config: AppConfig):
76
  """Test ChromaDB initialization"""
77
  mock_agent = mock_agent_class.return_value
 
85
  assert new_summarizer.collection is not None
86
 
87
  @pytest.mark.asyncio
88
+ @patch('know_lang_bot.summarizer.summarizer.ollama')
89
+ @patch('know_lang_bot.summarizer.summarizer.Agent')
90
  async def test_process_and_store_chunk_with_embedding(
91
  mock_agent_class,
92
  mock_ollama,