Spaces:
Running
Running
from langchain_text_splitters import CharacterTextSplitter | |
from langflow.custom import Component | |
from langflow.io import HandleInput, IntInput, MessageTextInput, Output | |
from langflow.schema import Data | |
from langflow.utils.util import unescape_string | |
class SplitTextComponent(Component):
    """Component that splits incoming ``Data`` records into smaller text chunks.

    Each input is converted to a LangChain document, split with
    ``CharacterTextSplitter`` using the configured separator, chunk size and
    chunk overlap, and the resulting pieces are returned as ``Data`` objects.
    """

    display_name: str = "Split Text"
    description: str = "Split text into chunks based on specified criteria."
    icon = "scissors-line-dashed"
    name = "SplitText"

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="The data to split.",
            input_types=["Data"],
            is_list=True,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="Number of characters to overlap between chunks.",
            value=200,
        ),
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk.",
            value=1000,
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info="The character to split on. Defaults to newline.",
            value="\n",
        ),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs) -> list[Data]:
        """Wrap each document's ``page_content`` and ``metadata`` in a ``Data``."""
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

    def split_text(self) -> list[Data]:
        """Split every ``Data`` input into chunks and return them as ``Data``.

        Entries of ``data_inputs`` that are not ``Data`` instances are
        skipped. The result is also stored on ``self.status``.

        Returns:
            One ``Data`` per chunk produced by the splitter; an empty list
            when there is nothing to split.
        """
        # The separator is passed through unescape_string before use
        # (presumably so a typed "\\n" becomes a real newline - confirm
        # against langflow.utils.util).
        separator = unescape_string(self.separator)

        # Normalise the input to a list: a missing value would otherwise
        # raise on iteration, and a lone Data object would be iterated
        # itself instead of being treated as a single record.
        raw_inputs = self.data_inputs
        if raw_inputs is None:
            raw_inputs = []
        elif isinstance(raw_inputs, Data):
            raw_inputs = [raw_inputs]

        documents = [
            item.to_lc_document() for item in raw_inputs if isinstance(item, Data)
        ]

        splitter = CharacterTextSplitter(
            chunk_overlap=self.chunk_overlap,
            chunk_size=self.chunk_size,
            separator=separator,
        )
        chunks = self._docs_to_data(splitter.split_documents(documents))
        self.status = chunks
        return chunks