Spaces:

gabykim
/

KnowLang_Transformers_Demo

Sleeping

App Files Files Community

KnowLang_Transformers_Demo / parser /parse.py

gabykim

python code parsing draft

2d4f211 5 months ago

raw

history blame

5.56 kB

	from typing import List, Optional, Dict
	from enum import Enum
	from pathlib import Path
	import tree_sitter
	from tree_sitter_languages import get_language, get_parser
	from pydantic import BaseModel, Field
	from git import Repo
	from utils.fancy_log import FancyLogger

	# Module-wide logger built from the project's FancyLogger helper and keyed by
	# this module's name; the code below only calls its .warning() and .error().
	LOG = FancyLogger(__name__)


	class ChunkType(str, Enum):
	    """Closed set of labels describing what kind of code a chunk holds.

	    Members also subclass ``str``, so each one compares equal to its
	    lowercase value (``ChunkType.CLASS == "class"``).
	    """

	    # Top-level class definition.
	    CLASS = "class"
	    # Top-level function definition.
	    FUNCTION = "function"
	    # Declared for module-level chunks; the parser in this file does not emit it.
	    MODULE = "module"
	    # Any other top-level statement (imports, assignments, ...).
	    OTHER = "other"

	class CodeChunk(BaseModel):
	    """One contiguous piece of source code plus the metadata describing it."""

	    # --- required fields -------------------------------------------------
	    # Kind of code held by this chunk.
	    type: ChunkType
	    # Source text of the chunk.
	    content: str
	    # First and last source row covered by ``content``; the parser in this
	    # file stores the row index reported by tree-sitter as-is.
	    start_line: int
	    end_line: int
	    # Path of the file the chunk was read from, as a string.
	    file_path: str

	    # --- optional fields (default to None) -------------------------------
	    # Identifier of the class/function, when the chunk has one.
	    name: Optional[str] = None
	    # Enclosing class/function name, intended for nested definitions.
	    parent_name: Optional[str] = None
	    # Docstring text of the class/function, when one was found.
	    docstring: Optional[str] = None

	class CodeParser:
	    """Split the Python files of a Git repository into ``CodeChunk`` objects.

	    Each file is parsed with tree-sitter and every top-level statement becomes
	    one chunk: classes and functions (decorated or not) get their own chunk
	    type, name and docstring; every other non-comment statement is stored as
	    an ``OTHER`` chunk.
	    """

	    # Class-level placeholders; the real objects are assigned per instance by
	    # _init_tree_sitter().
	    parser: Optional[tree_sitter.Parser] = None
	    language: Optional[tree_sitter.Language] = None

	    def __init__(self, repo_path: str):
	        """Remember the repository location and set up the tree-sitter parser.

	        Args:
	            repo_path: Path to the root of the repository to parse.
	        """
	        self.repo_path = Path(repo_path)
	        self._init_tree_sitter()

	    def _init_tree_sitter(self):
	        """Load the tree-sitter parser and language objects for Python."""
	        # In a real implementation language loading should be handled more
	        # robustly; for the MVP the Python grammar is assumed to be available.
	        self.parser = get_parser('python')
	        self.language = get_language('python')

	    def _unwrap_definition(self, node: tree_sitter.Node) -> tree_sitter.Node:
	        """Return the class/function node wrapped by a ``decorated_definition``.

	        Any other node is returned unchanged.
	        """
	        if node.type == "decorated_definition":
	            inner = node.child_by_field_name("definition")
	            if inner is not None:
	                return inner
	        return node

	    def _extract_docstring(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
	        """Extract the docstring of a class or function node.

	        Args:
	            node: A class/function definition node. A node without a ``body``
	                field (e.g. a block or module node) is scanned directly.
	            source_code: Raw bytes of the file ``node`` was parsed from.

	        Returns:
	            The docstring exactly as written in the source (quotes and any
	            string prefix included), or ``None`` when the first statement of
	            the body is not a string literal.
	        """
	        # The statements of a definition live inside its ``body`` block, not
	        # directly under the definition node.
	        body = node.child_by_field_name("body")
	        if body is None:
	            body = node

	        for statement in body.children:
	            if statement.type == "comment":
	                # Comments may precede the docstring without displacing it.
	                continue
	            # Only the first real statement can be a docstring.
	            if statement.type != "expression_statement" or not statement.children:
	                return None
	            literal = statement.children[0]
	            if literal.type in ("string", "string_literal"):
	                return source_code[literal.start_byte:literal.end_byte].decode('utf-8')
	            return None
	        return None

	    def parse_file(self, file_path: Path) -> List[CodeChunk]:
	        """Parse a single file and return its top-level code chunks.

	        Args:
	            file_path: Path of the file to parse; anything without a ``.py``
	                suffix is skipped with a warning.

	        Returns:
	            One chunk per top-level statement, in source order. An empty list
	            is returned for skipped files and when reading or parsing fails
	            (the error is logged, not raised).
	        """
	        if file_path.suffix != '.py':
	            LOG.warning(f"Skipping non-Python file: {file_path}")
	            return []

	        try:
	            with open(file_path, 'rb') as f:
	                source_code = f.read()

	            tree = self.parser.parse(source_code)
	            chunks: List[CodeChunk] = []

	            for node in tree.root_node.children:
	                # Classify decorated definitions by what they decorate.
	                kind = self._unwrap_definition(node).type
	                if kind == "class_definition":
	                    chunks.append(self._process_class(node, source_code, file_path))
	                elif kind == "function_definition":
	                    chunks.append(self._process_function(node, source_code, file_path))
	                elif node.type not in ("comment", "empty_statement"):
	                    # Store other top-level code as separate chunks.
	                    chunks.append(CodeChunk(
	                        type=ChunkType.OTHER,
	                        content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
	                        start_line=node.start_point[0],
	                        end_line=node.end_point[0],
	                        file_path=str(file_path)
	                    ))

	            return chunks
	        except Exception as e:
	            # Best-effort boundary: one unreadable file must not abort a
	            # whole-repository run.
	            LOG.error(f"Error parsing file {file_path}: {str(e)}")
	            return []

	    def _build_chunk(self, node: tree_sitter.Node, source_code: bytes,
	                     file_path: Path, chunk_type: ChunkType) -> CodeChunk:
	        """Build the chunk for a class/function node (optionally decorated).

	        The chunk text and line span cover ``node`` itself, so decorators are
	        included; name and docstring come from the wrapped definition.
	        """
	        definition = self._unwrap_definition(node)

	        # A definition without a usable identifier (e.g. broken syntax) gets
	        # name=None instead of raising and losing the whole file.
	        name_node = definition.child_by_field_name("name")
	        name = None
	        if name_node is not None:
	            name = source_code[name_node.start_byte:name_node.end_byte].decode('utf-8')

	        return CodeChunk(
	            type=chunk_type,
	            name=name,
	            content=source_code[node.start_byte:node.end_byte].decode('utf-8'),
	            # Row indices are stored exactly as tree-sitter reports them.
	            start_line=node.start_point[0],
	            end_line=node.end_point[0],
	            file_path=str(file_path),
	            docstring=self._extract_docstring(definition, source_code)
	        )

	    def _process_class(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
	        """Process a class node and return a ``CLASS`` chunk."""
	        return self._build_chunk(node, source_code, file_path, ChunkType.CLASS)

	    def _process_function(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
	        """Process a function node and return a ``FUNCTION`` chunk."""
	        return self._build_chunk(node, source_code, file_path, ChunkType.FUNCTION)

	    def parse_repository(self) -> List[CodeChunk]:
	        """Parse all Python files in the repository's working tree.

	        Returns:
	            The chunks of every ``*.py`` file below the working tree, visited
	            in sorted path order. Errors opening the repository or walking it
	            are logged and end the walk; chunks collected so far are returned.
	        """
	        chunks: List[CodeChunk] = []

	        try:
	            # Opening the Repo validates that repo_path is a Git repository.
	            repo = Repo(self.repo_path)
	            worktree = repo.working_tree_dir
	            if worktree is None:
	                LOG.error(f"Error processing repository: {self.repo_path} has no working tree")
	                return chunks

	            root = Path(worktree)
	            for file_path in sorted(root.rglob('*.py')):
	                # Skip Git's own metadata directory and anything that merely
	                # looks like a file name (e.g. a directory called "x.py").
	                if '.git' in file_path.relative_to(root).parts:
	                    continue
	                if not file_path.is_file():
	                    continue
	                chunks.extend(self.parse_file(file_path))
	        except Exception as e:
	            LOG.error(f"Error processing repository: {str(e)}")

	        return chunks


	# Usage example: python parse.py [path/to/repo]
	if __name__ == "__main__":
	    import sys

	    # Use the repository given on the command line; fall back to the
	    # placeholder path when no argument is supplied.
	    repo_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/repo"

	    parser = CodeParser(repo_path)
	    chunks = parser.parse_repository()
	    for chunk in chunks:
	        # Print the enum's value explicitly: formatting a str-mixin Enum member
	        # directly gives "class" on older Python releases but
	        # "ChunkType.CLASS" on newer ones.
	        print(f"{chunk.type.value}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")