from langchain_text_splitters import CharacterTextSplitter

from langflow.custom import Component
from langflow.io import HandleInput, IntInput, MessageTextInput, Output
from langflow.schema import Data
from langflow.utils.util import unescape_string


class SplitTextComponent(Component):
    display_name: str = "Split Text"
    description: str = "Split text into chunks based on specified criteria."
    icon = "scissors-line-dashed"
    name = "SplitText"

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="The data to split.",
            input_types=["Data"],
            is_list=True,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="Number of characters to overlap between chunks.",
            value=200,
        ),
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk.",
            value=1000,
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info="The character to split on. Defaults to newline.",
            value="\n",
        ),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs):
        # Convert LangChain Documents back into Langflow Data objects,
        # preserving each document's metadata.
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

    def split_text(self) -> list[Data]:
        # Unescape the separator so UI input such as "\n" is treated as a newline.
        separator = unescape_string(self.separator)

        # Only Data inputs can be converted to LangChain Documents for splitting.
        documents = [_input.to_lc_document() for _input in self.data_inputs if isinstance(_input, Data)]

        splitter = CharacterTextSplitter(
            chunk_overlap=self.chunk_overlap,
            chunk_size=self.chunk_size,
            separator=separator,
        )
        docs = splitter.split_documents(documents)
        data = self._docs_to_data(docs)
        self.status = data
        return data
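

# --- Illustrative sketch, not part of the component ---
# The component's output depends entirely on LangChain's CharacterTextSplitter;
# the standalone example below shows how separator, chunk_size, and chunk_overlap
# interact. The sample text and parameter values are hypothetical.
if __name__ == "__main__":
    demo_splitter = CharacterTextSplitter(chunk_size=40, chunk_overlap=0, separator="\n")
    for piece in demo_splitter.split_text("First paragraph.\nSecond paragraph.\nThird paragraph."):
        print(repr(piece))
    # Expected: the first two paragraphs merge into one chunk under 40 characters,
    # and the third paragraph becomes its own chunk.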