Spaces:
Running
Running
import re | |
from pathlib import Path | |
from langchain_community.document_loaders.git import GitLoader | |
from langflow.custom import Component | |
from langflow.io import MessageTextInput, Output | |
from langflow.schema import Data | |
class GitLoaderComponent(Component):
    """Langflow component that loads files from a Git repository.

    The component wraps LangChain's ``GitLoader`` and converts every loaded
    document into a ``Data`` record. Files can optionally be narrowed down by
    name (glob patterns) and by content (a regular expression); binary files
    are always skipped.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=False,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    # Must be a staticmethod: it takes no ``self`` but is invoked through the
    # instance (``self.is_binary(path)``); without the decorator that call
    # raises TypeError because the instance is bound to ``file_path``.
    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Check if a file is binary by looking for null bytes.

        This is necessary because when searches are performed using
        the content_filter, binary files need to be ignored.

        Only the first 1024 bytes of the file are inspected.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with Path(file_path).open("rb") as file:
            return b"\x00" in file.read(1024)

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from the component inputs.

        The loader receives a single combined filter that accepts a file only
        when all of the following hold:

        * its path matches the ``file_filter`` patterns (if any were given),
        * it is not binary (see ``is_binary``),
        * its text matches the ``content_filter`` regex (if one was given).

        ``file_filter`` is a comma-separated list of glob patterns. A pattern
        prefixed with ``!`` excludes matching files; other patterns include
        them. When only exclusion patterns are supplied, every file that is
        not excluded is accepted. Empty patterns are ignored.

        Raises:
            re.error: If ``content_filter`` is not a valid regular expression.
        """
        raw_patterns = getattr(self, "file_filter", None)
        raw_content_pattern = getattr(self, "content_filter", None)

        include_patterns: list[str] = []
        exclude_patterns: list[str] = []
        if raw_patterns:
            for raw in raw_patterns.split(","):
                pattern = raw.strip()
                if pattern.startswith("!"):
                    pattern = pattern[1:].strip()
                    target = exclude_patterns
                else:
                    target = include_patterns
                # Skip empty entries (trailing comma, bare "!"):
                # Path.match("") raises ValueError("empty pattern").
                if pattern:
                    target.append(pattern)

        content_regex = re.compile(raw_content_pattern) if raw_content_pattern else None

        def name_matches(path: Path) -> bool:
            """Apply the include/exclude glob patterns to *path*."""
            if any(path.match(pattern) for pattern in exclude_patterns):
                return False
            # No include patterns means "everything that was not excluded".
            return not include_patterns or any(path.match(pattern) for pattern in include_patterns)

        def combined_filter(file_path: str) -> bool:
            """Return True when *file_path* should be loaded."""
            path = Path(file_path)
            # Cheap path-only check first so rejected files are never opened.
            if not name_matches(path):
                return False
            try:
                if self.is_binary(file_path):
                    return False
                if content_regex is None:
                    return True
                content = path.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                # An unreadable entry (e.g. a broken symlink in the checkout)
                # is skipped rather than aborting the whole load.
                return False
            return bool(content_regex.search(content))

        return GitLoader(
            repo_path=self.repo_path,
            clone_url=self.clone_url,
            branch=self.branch,
            file_filter=combined_filter,
        )

    def load_documents(self) -> list[Data]:
        """Load the repository files and return them as ``Data`` records.

        The resulting list is also stored in ``self.status``.
        """
        gitloader = self.build_gitloader()
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data