Spaces:
Running
Running
| import re | |
| from pathlib import Path | |
| from langchain_community.document_loaders.git import GitLoader | |
| from langflow.custom import Component | |
| from langflow.io import MessageTextInput, Output | |
| from langflow.schema import Data | |
class GitLoaderComponent(Component):
    """Langflow component that loads files from a Git repository.

    The component wraps LangChain's ``GitLoader``. It forwards the repository
    path, clone URL and branch inputs unchanged and supplies a file filter
    built from the optional ``file_filter`` (glob patterns on the path) and
    ``content_filter`` (regex on the file text) inputs. Files that look binary
    are always skipped.
    """

    # Metadata describing the component.
    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=False,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Check if a file is binary by looking for null bytes.

        This is necessary because when searches are performed using
        the content_filter, binary files need to be ignored.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            True if a null byte occurs within the first 1024 bytes of the file.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        # Declared as a staticmethod because it is invoked as
        # ``self.is_binary(path)``; without the decorator the instance would
        # be passed as an unexpected extra positional argument.
        with Path(file_path).open("rb") as file:
            return b"\x00" in file.read(1024)

    @staticmethod
    def _split_patterns(raw_patterns: str) -> tuple[list[str], list[str]]:
        """Split a comma-separated pattern string into include and exclude globs.

        Tokens are stripped of surrounding whitespace. A token starting with
        ``!`` is an exclusion (stored without the ``!``); any other non-empty
        token is an inclusion. Empty tokens (e.g. from a trailing comma or a
        bare ``!``) are dropped because ``Path.match`` rejects empty patterns.

        Args:
            raw_patterns: The raw ``file_filter`` input value.

        Returns:
            A ``(include, exclude)`` tuple of glob pattern lists.
        """
        include: list[str] = []
        exclude: list[str] = []
        for token in raw_patterns.split(","):
            pattern = token.strip()
            if pattern.startswith("!"):
                negated = pattern[1:].strip()
                if negated:
                    exclude.append(negated)
            elif pattern:
                include.append(pattern)
        return include, exclude

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from this component's inputs.

        Returns:
            A ``GitLoader`` for ``repo_path`` / ``clone_url`` / ``branch`` whose
            file filter accepts a file only if it passes the glob patterns,
            is not binary, and (when a content filter is set) its text
            matches the regex.

        Raises:
            re.error: If ``content_filter`` is not a valid regular expression.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        include_globs: list[str] = []
        exclude_globs: list[str] = []
        if file_filter_patterns:
            include_globs, exclude_globs = self._split_patterns(file_filter_patterns)

        # Compile once here so the regex is not rebuilt for every file.
        content_regex = re.compile(content_filter_pattern) if content_filter_pattern else None

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)

            # Name-based checks come first: they need no file I/O.
            if any(path.match(pattern) for pattern in exclude_globs):
                return False
            # With no include pattern every non-excluded file is eligible;
            # otherwise an exclusion-only filter would reject everything.
            if include_globs and not any(path.match(pattern) for pattern in include_globs):
                return False

            try:
                if self.is_binary(file_path):
                    return False
                if content_regex is None:
                    return True
                content = path.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                # An unreadable entry (e.g. a broken symlink) is skipped
                # rather than aborting the whole repository load.
                return False
            return bool(content_regex.search(content))

        return GitLoader(
            repo_path=self.repo_path,
            clone_url=self.clone_url,
            branch=self.branch,
            file_filter=combined_filter,
        )

    def load_documents(self) -> list[Data]:
        """Load the repository's files and convert them to ``Data`` objects.

        The resulting list is also stored in ``self.status``.

        Returns:
            One ``Data`` object per document yielded by the loader.
        """
        gitloader = self.build_gitloader()
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data