from typing import Any

from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter

from langflow.base.textsplitters.model import LCTextSplitterComponent
from langflow.inputs.inputs import DataInput, IntInput, MessageTextInput
from langflow.utils.util import unescape_string


class RecursiveCharacterTextSplitterComponent(LCTextSplitterComponent):
    """Component that builds a LangChain ``RecursiveCharacterTextSplitter``.

    The splitter is configured from the component inputs declared below:
    ``chunk_size``, ``chunk_overlap`` and an optional list of ``separators``.
    """

    # Metadata describing this component.
    display_name: str = "Recursive Character Text Splitter"
    description: str = "Split text trying to keep all related text together."
    documentation: str = "https://docs.langflow.org/components/text-splitters#recursivecharactertextsplitter"
    name = "RecursiveCharacterTextSplitter"
    icon = "LangChain"

    # Input declarations; each ``name`` is read back as ``self.<name>`` in the
    # methods below.
    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum length of each chunk.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The amount of overlap between chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The texts to split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separators",
            display_name="Separators",
            info='The characters to split on.\nIf left empty defaults to ["\\n\\n", "\\n", " ", ""].',
            is_list=True,
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value supplied through the ``data_input`` input."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Create the recursive character splitter from the component inputs.

        Returns:
            A ``RecursiveCharacterTextSplitter`` built with the configured
            ``chunk_size`` and ``chunk_overlap``. ``separators`` is passed as
            ``None`` when the input is empty; otherwise each entry is run
            through ``unescape_string`` first.
        """
        # Separators typed by the user may contain escaped characters
        # (e.g. a literal backslash followed by "n"); unescape each one.
        # An empty/missing list becomes None.
        separators: list[str] | None = (
            [unescape_string(raw) for raw in self.separators] if self.separators else None
        )

        return RecursiveCharacterTextSplitter(
            separators=separators,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )