Spaces:

gabykim
/

KnowLang_Transformers_Demo

Running

App Files Files Community

gabykim committed on Jan 27

Commit

2d4f211

1 Parent(s): f22ea30

python code parsing draft

Browse files

Files changed (4) hide show

full_requirements.txt +1 -0
parser/parse.py +144 -0
simple_requirements.txt +1 -1
utils/fancy_log.py +203 -0

full_requirements.txt CHANGED Viewed

@@ -47,6 +47,7 @@ sniffio==1.3.1
 tokenizers==0.21.0
 tqdm==4.67.1
 tree-sitter==0.24.0
 types-requests==2.32.0.20241016
 typing-inspect==0.9.0
 typing_extensions==4.12.2

 tokenizers==0.21.0
 tqdm==4.67.1
 tree-sitter==0.24.0
+tree-sitter-languages==1.10.2
 types-requests==2.32.0.20241016
 typing-inspect==0.9.0
 typing_extensions==4.12.2

parser/parse.py ADDED Viewed

	@@ -0,0 +1,144 @@

+from typing import List, Optional, Dict
+from enum import Enum
+from pathlib import Path
+import tree_sitter
+from tree_sitter_languages import get_language, get_parser
+from pydantic import BaseModel, Field
+from git import Repo
+from utils.fancy_log import FancyLogger
+LOG = FancyLogger(__name__)
+class ChunkType(str, Enum):
+    """Kinds of code chunk; each member is also a plain string equal to its value."""
+    # Chunks built from class_definition nodes.
+    CLASS = "class"
+    # Chunks built from function_definition nodes.
+    FUNCTION = "function"
+    # Declared for module-level chunks; no code visible in this file produces it yet.
+    MODULE = "module"
+    # Any other top-level statement.
+    OTHER = "other"
+class CodeChunk(BaseModel):
+    """A piece of source code together with the metadata describing where it came from."""
+    # Kind of construct held by this chunk.
+    type: ChunkType
+    # Source text of the chunk.
+    content: str
+    # First and last row of the chunk as reported by the parser.
+    start_line: int
+    end_line: int
+    # Path of the file the chunk was read from.
+    file_path: str
+    # Identifier of the class/function, when the chunk has one.
+    name: Optional[str] = Field(default=None)
+    # For nested classes/functions
+    parent_name: Optional[str] = Field(default=None)
+    # Docstring text, when one was found.
+    docstring: Optional[str] = Field(default=None)
+class CodeParser:
+    """Split the Python files of a git repository into top-level code chunks with tree-sitter.
+    Line numbers stored on the chunks are the rows tree-sitter reports (0-based).
+    """
+    # Both are set per instance by _init_tree_sitter(); None until then.
+    parser: Optional[tree_sitter.Parser] = None
+    language: Optional[tree_sitter.Language] = None
+    def __init__(self, repo_path: str):
+        """Remember the repository location and load the Python grammar.
+        Args:
+            repo_path: Path to the root of the git repository to parse.
+        """
+        self.repo_path = Path(repo_path)
+        self._init_tree_sitter()
+    def _init_tree_sitter(self):
+        """Initialize tree-sitter with Python language support"""
+        # In real implementation, we'd need to handle language loading more robustly
+        # For MVP, we'll assume Python parser is available
+        self.parser = get_parser('python')
+        self.language = get_language('python')
+    @staticmethod
+    def _node_text(node: tree_sitter.Node, source_code: bytes) -> str:
+        """Return the UTF-8 decoded source text covered by ``node``."""
+        return source_code[node.start_byte:node.end_byte].decode('utf-8')
+    @staticmethod
+    def _definition_of(node: tree_sitter.Node) -> tree_sitter.Node:
+        """Return the class/function wrapped by a decorated_definition node, else ``node`` itself."""
+        if node.type == "decorated_definition":
+            inner = node.child_by_field_name("definition")
+            if inner is not None:
+                return inner
+        return node
+    def _node_name(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
+        """Return the identifier naming a class/function node, or None when it has none."""
+        name_node = node.child_by_field_name("name")
+        if name_node is None:
+            # Fall back to the first identifier child, as the original lookup did.
+            name_node = next((c for c in node.children if c.type == "identifier"), None)
+        if name_node is None:
+            return None
+        return self._node_text(name_node, source_code)
+    def _extract_docstring(self, node: tree_sitter.Node, source_code: bytes) -> Optional[str]:
+        """Extract the docstring of a class or function node.
+        Returns:
+            The raw string literal (quotes included) when the first statement of the
+            definition's body is a bare string expression, otherwise None.
+        """
+        # Statements live inside the "body" block, not directly under the definition.
+        # Nodes without a body field (e.g. a block or module node) are scanned directly.
+        body = node.child_by_field_name("body")
+        if body is None:
+            body = node
+        for child in body.children:
+            if child.type == "comment":
+                continue  # comments may precede the docstring
+            # Only the first real statement can be a docstring.
+            if child.type != "expression_statement" or not child.children:
+                return None
+            string_node = child.children[0]
+            if string_node.type in ("string", "string_literal"):
+                return self._node_text(string_node, source_code)
+            return None
+        return None
+    def _build_chunk(self, chunk_type: ChunkType, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
+        """Build the chunk for a class/function node, which may be wrapped in decorators.
+        Content and line range come from ``node`` (decorators included); name and
+        docstring come from the wrapped definition.
+        """
+        definition = self._definition_of(node)
+        return CodeChunk(
+            type=chunk_type,
+            name=self._node_name(definition, source_code),
+            content=self._node_text(node, source_code),
+            start_line=node.start_point[0],
+            end_line=node.end_point[0],
+            file_path=str(file_path),
+            docstring=self._extract_docstring(definition, source_code)
+        )
+    def parse_file(self, file_path: Path) -> List[CodeChunk]:
+        """Parse a single file and return its top-level code chunks.
+        Args:
+            file_path: Location of the file; anything without a ``.py`` suffix is skipped.
+        Returns:
+            One chunk per top-level class, function or other statement (comments and
+            empty statements are dropped). An empty list when the file is skipped or
+            when reading/parsing fails; the failure is logged.
+        """
+        file_path = Path(file_path)
+        if file_path.suffix != '.py':
+            LOG.warning(f"Skipping non-Python file: {file_path}")
+            return []
+        try:
+            source_code = file_path.read_bytes()
+            tree = self.parser.parse(source_code)
+            chunks: List[CodeChunk] = []
+            # Process the syntax tree
+            for node in tree.root_node.children:
+                if node.type in ("comment", "empty_statement"):
+                    continue
+                # Decorated classes/functions are classified by the definition they wrap.
+                kind = self._definition_of(node).type
+                if kind == "class_definition":
+                    chunks.append(self._process_class(node, source_code, file_path))
+                elif kind == "function_definition":
+                    chunks.append(self._process_function(node, source_code, file_path))
+                else:
+                    # Store other top-level code as separate chunks
+                    chunks.append(CodeChunk(
+                        type=ChunkType.OTHER,
+                        content=self._node_text(node, source_code),
+                        start_line=node.start_point[0],
+                        end_line=node.end_point[0],
+                        file_path=str(file_path)
+                    ))
+            return chunks
+        except Exception as e:
+            LOG.error(f"Error parsing file {file_path}: {str(e)}")
+            return []
+    def _process_class(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
+        """Process a class node and return a CodeChunk"""
+        return self._build_chunk(ChunkType.CLASS, node, source_code, file_path)
+    def _process_function(self, node: tree_sitter.Node, source_code: bytes, file_path: Path) -> CodeChunk:
+        """Process a function node and return a CodeChunk"""
+        return self._build_chunk(ChunkType.FUNCTION, node, source_code, file_path)
+    def parse_repository(self) -> List[CodeChunk]:
+        """Parse every ``.py`` file tracked in the repository's HEAD commit.
+        Returns:
+            The chunks of all parsed files. An empty or partial list when the
+            repository cannot be read; the failure is logged.
+        """
+        chunks: List[CodeChunk] = []
+        try:
+            repo = Repo(self.repo_path)
+            # A bare repository has no working tree; fall back to the configured path.
+            root = Path(repo.working_tree_dir or self.repo_path)
+            # Repo has no os.walk-style traversal; walk the HEAD tree and keep file entries.
+            for item in repo.tree().traverse():
+                if item.type == "blob" and item.path.endswith('.py'):
+                    chunks.extend(self.parse_file(root / item.path))
+        except Exception as e:
+            LOG.error(f"Error processing repository: {str(e)}")
+        return chunks
+# Usage example: print a one-line summary of every chunk found in a repository.
+if __name__ == "__main__":
+    code_parser = CodeParser("path/to/repo")
+    for chunk in code_parser.parse_repository():
+        print(f"{chunk.type}: {chunk.name} ({chunk.start_line}-{chunk.end_line})")

simple_requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 pydantic-ai==0.0.20
-tree-sitter==0.24.0
 GitPython==3.1.44

 pydantic-ai==0.0.20
+tree-sitter-languages==1.10.2
 GitPython==3.1.44

utils/fancy_log.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import json
+import logging
+import logging.config
+import logging.handlers
+import os
+import queue
+# True only when the JSON_LOGGING environment variable is "true" (case-insensitive).
+JSON_LOGGING = os.getenv("JSON_LOGGING", "false").lower() == "true"
+# Custom level one step below logging.WARNING (30).
+CHAT = 29
+logging.addLevelName(CHAT, "CHAT")
+# ANSI escape sequences: text attributes.
+RESET_SEQ: str = "\033[0m"
+COLOR_SEQ: str = "\033[1;%dm"
+BOLD_SEQ: str = "\033[1m"
+UNDERLINE_SEQ: str = "\033[04m"
+# ANSI escape sequences: foreground colors.
+ORANGE: str = "\033[33m"
+YELLOW: str = "\033[93m"
+WHITE: str = "\033[37m"
+BLUE: str = "\033[34m"
+LIGHT_BLUE: str = "\033[94m"
+RED: str = "\033[91m"
+GREY: str = "\033[90m"
+GREEN: str = "\033[92m"
+# Emoji shown in front of a message, keyed by level name.
+EMOJIS: dict[str, str] = {
+    "DEBUG": "🐛",
+    "INFO": "📝",
+    "CHAT": "💬",
+    "WARNING": "⚠️",
+    "ERROR": "❌",
+    "CRITICAL": "💥",
+}
+# Color applied to a level name and its message, keyed by level name.
+KEYWORD_COLORS: dict[str, str] = {
+    "DEBUG": WHITE,
+    "INFO": LIGHT_BLUE,
+    "CHAT": GREEN,
+    "WARNING": YELLOW,
+    "ERROR": ORANGE,
+    "CRITICAL": RED,
+}
+class JsonFormatter(logging.Formatter):
+    """Formatter that serialises a whole log record as one JSON object."""
+    def format(self, record: logging.LogRecord) -> str:
+        """
+        Return the record's attributes as a JSON string.
+        The rendered text is added under "message" and exception info is replaced by
+        its formatted traceback. Values json cannot encode are written as str(value),
+        so arbitrary log arguments never make formatting fail.
+        """
+        payload = dict(record.__dict__)
+        payload["message"] = record.getMessage()
+        if isinstance(record.exc_info, tuple):
+            payload["exc_info"] = self.formatException(record.exc_info)
+        # default=str is deliberate: a log line must not raise because of an odd argument.
+        return json.dumps(payload, default=str)
+def formatter_message(message: str, use_color: bool = True) -> str:
+    """
+    Expand the $RESET and $BOLD placeholders of a format string
+    With use_color they become the matching escape sequences, otherwise they are removed
+    """
+    reset, bold = (RESET_SEQ, BOLD_SEQ) if use_color else ("", "")
+    return message.replace("$RESET", reset).replace("$BOLD", bold)
+def format_word(
+    message: str, word: str, color_seq: str, bold: bool = False, underline: bool = False
+) -> str:
+    """
+    Surround every occurrence of the given word with a sequence
+    Each occurrence becomes color_seq + word + RESET_SEQ, prefixed with UNDERLINE_SEQ
+    when underline is set and with BOLD_SEQ (outermost) when bold is set.
+    An empty word leaves the message unchanged.
+    """
+    # str.replace("", x) would insert x between every character of the message.
+    if not word:
+        return message
+    replacer = color_seq + word + RESET_SEQ
+    if underline:
+        replacer = UNDERLINE_SEQ + replacer
+    if bold:
+        replacer = BOLD_SEQ + replacer
+    return message.replace(word, replacer)
+class ConsoleFormatter(logging.Formatter):
+    """
+    This Formatter prefixes the message with a per-level emoji and, when use_color is
+    set, colors the levelname (i.e. 'INFO', 'DEBUG'), the logger name and the message
+    """
+    def __init__(
+        self, fmt: str, datefmt: str = None, style: str = "%", use_color: bool = True
+    ):
+        """
+        fmt, datefmt and style are passed to logging.Formatter unchanged;
+        use_color=False produces output without any ANSI escape sequences
+        """
+        super().__init__(fmt, datefmt, style)
+        self.use_color = use_color
+    def format(self, record: logging.LogRecord) -> str:
+        """
+        Format and highlight certain keywords
+        The record passed in is left untouched; decoration happens on a copy.
+        """
+        # The same record object is handed to every handler, so decorating it in place
+        # would leak escape codes into other outputs and stack them on re-formatting.
+        rec = logging.makeLogRecord(record.__dict__)
+        levelname = rec.levelname
+        # Level names without an entry (custom or numeric levels) get no emoji or color.
+        emoji = EMOJIS.get(levelname)
+        text = str(rec.msg) if emoji is None else emoji + "  " + str(rec.msg)
+        if self.use_color:
+            color = KEYWORD_COLORS.get(levelname)
+            if color is not None:
+                rec.levelname = color + levelname + RESET_SEQ
+                text = color + text + RESET_SEQ
+            rec.name = f"{GREY}{rec.name:<15}{RESET_SEQ}"
+        rec.msg = text
+        return logging.Formatter.format(self, rec)
+class FancyLogger(logging.Logger):
+    """
+    Logger that installs its own handlers on construction (a queue handler and a
+    console handler using the custom formatter) and adds the extra logging
+    function logger.chat for the CHAT level
+    """
+    CONSOLE_FORMAT: str = (
+        "[%(asctime)s] [$BOLD%(name)-15s$RESET] [%(levelname)-8s]\t%(message)s"
+    )
+    FORMAT: str = "%(asctime)s %(name)-15s %(levelname)-8s %(message)s"
+    COLOR_FORMAT: str = formatter_message(CONSOLE_FORMAT, True)
+    JSON_FORMAT: str = '{"time": "%(asctime)s", "name": "%(name)s", "level": "%(levelname)s", "message": "%(message)s"}'
+    def __init__(self, name: str, logLevel: str = "DEBUG"):
+        """
+        Create the logger and attach a queue handler and a console handler
+        The console handler emits JSON when JSON_LOGGING is set, colored text otherwise
+        """
+        logging.Logger.__init__(self, name, logLevel)
+        # Queue Handler
+        # NOTE(review): nothing visible in this file drains this unbounded queue, so
+        # records accumulate for the life of the logger - confirm a consumer exists.
+        queue_handler = logging.handlers.QueueHandler(queue.Queue(-1))
+        json_formatter = logging.Formatter(self.JSON_FORMAT)
+        queue_handler.setFormatter(json_formatter)
+        self.addHandler(queue_handler)
+        if JSON_LOGGING:
+            console_formatter = JsonFormatter()
+        else:
+            console_formatter = ConsoleFormatter(self.COLOR_FORMAT)
+        console = logging.StreamHandler()
+        console.setFormatter(console_formatter)
+        self.addHandler(console)
+    def chat(self, role: str, openai_repsonse: dict, messages=None, *args, **kws):
+        """
+        Log chat content at the CHAT level
+        When messages is given, every entry's 'content' is logged with the emoji of its
+        'role'. Otherwise the content of the first choice in openai_repsonse is logged
+        with the emoji of role; openai_repsonse may be a JSON string or an already
+        decoded mapping. Extra positional and keyword arguments are accepted and ignored.
+        """
+        if not self.isEnabledFor(CHAT):
+            return
+        role_emojis = {
+            "system": "🖥️",
+            "user": "👤",
+            "assistant": "🤖",
+            "function": "⚙️",
+        }
+        # Logger._log requires the args tuple; an empty one keeps any '%' in the chat
+        # text from being treated as a format specifier.
+        if messages:
+            for message in messages:
+                self._log(
+                    CHAT,
+                    f"{role_emojis.get(message['role'], '🔵')}: {message['content']}",
+                    (),
+                )
+            return
+        response = openai_repsonse
+        if isinstance(response, (str, bytes, bytearray)):
+            response = json.loads(response)
+        self._log(
+            CHAT,
+            f"{role_emojis.get(role, '🔵')}: {response['choices'][0]['message']['content']}",
+            (),
+        )
+class QueueLogger(logging.Logger):
+    """
+    Logger that attaches a QueueHandler backed by an unbounded in-memory queue
+    """
+    def __init__(self, name: str, level: int = logging.NOTSET):
+        logging.Logger.__init__(self, name, level)
+        record_queue = queue.Queue(-1)
+        self.addHandler(logging.handlers.QueueHandler(record_queue))
+# dictConfig schema: one console handler at INFO, shared by the root logger and
+# the non-propagating "autogpt" logger.
+logging_config: dict = {
+    "version": 1,
+    "formatters": {
+        "console": {
+            "()": ConsoleFormatter,
+            "format": FancyLogger.COLOR_FORMAT,
+        },
+    },
+    "handlers": {
+        "h": {
+            "class": "logging.StreamHandler",
+            "formatter": "console",
+            "level": logging.INFO,
+        },
+    },
+    "root": {
+        "handlers": ["h"],
+        "level": logging.INFO,
+    },
+    "loggers": {
+        "autogpt": {
+            "handlers": ["h"],
+            "level": logging.INFO,
+            "propagate": False,
+        },
+    },
+}
+def setup_logger() -> None:
+    """
+    Apply the module-level logging_config dictionary to the logging system
+    """
+    logging.config.dictConfig(config=logging_config)