# Tai Truong - fix readme (d202ada)
from langchain_text_splitters import CharacterTextSplitter
from langflow.custom import Component
from langflow.io import HandleInput, IntInput, MessageTextInput, Output
from langflow.schema import Data
from langflow.utils.util import unescape_string
class SplitTextComponent(Component):
    """Langflow component that splits ``Data`` inputs into text chunks.

    Every incoming ``Data`` item is converted to a LangChain document and
    split with ``CharacterTextSplitter`` using the configured separator,
    chunk size and chunk overlap. Each resulting chunk is returned as a new
    ``Data`` object built from the chunk's text and metadata.
    """

    display_name: str = "Split Text"
    description: str = "Split text into chunks based on specified criteria."
    icon = "scissors-line-dashed"
    name = "SplitText"

    inputs = [
        HandleInput(
            name="data_inputs",
            display_name="Data Inputs",
            info="The data to split.",
            input_types=["Data"],
            is_list=True,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="Number of characters to overlap between chunks.",
            value=200,
        ),
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk.",
            value=1000,
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info="The character to split on. Defaults to newline.",
            value="\n",
        ),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

    def _docs_to_data(self, docs) -> list[Data]:
        """Convert LangChain documents into ``Data`` objects.

        Args:
            docs: Iterable of objects exposing ``page_content`` and
                ``metadata`` attributes (the documents produced by the
                text splitter).

        Returns:
            One ``Data`` per document, built from the document's
            ``page_content`` (as ``text``) and ``metadata`` (as ``data``).
        """
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

    def split_text(self) -> list[Data]:
        """Split the configured data inputs into chunks.

        Reads ``self.data_inputs``, ``self.separator``, ``self.chunk_size``
        and ``self.chunk_overlap``. Items in ``data_inputs`` that are not
        ``Data`` instances are ignored.

        Returns:
            The chunks as a list of ``Data`` objects; empty when there is
            nothing to split. The same list is also stored on
            ``self.status``.
        """
        # NOTE(review): unescape_string presumably turns escaped sequences
        # typed by the user (e.g. a literal backslash-n) into the real
        # character - confirm against langflow.utils.util.
        separator = unescape_string(self.separator)

        # The input is declared as a list, but be tolerant of a missing value
        # or a single ``Data`` object: iterating either directly would raise
        # or silently produce no documents.
        raw_inputs = self.data_inputs
        if raw_inputs is None:
            raw_inputs = []
        elif isinstance(raw_inputs, Data):
            raw_inputs = [raw_inputs]

        documents = [
            item.to_lc_document() for item in raw_inputs if isinstance(item, Data)
        ]

        splitter = CharacterTextSplitter(
            chunk_overlap=self.chunk_overlap,
            chunk_size=self.chunk_size,
            separator=separator,
        )
        chunks = self._docs_to_data(splitter.split_documents(documents))

        # Keep the result on the component as well as returning it.
        self.status = chunks
        return chunks