from langchain.docstore.document import Document
from langchain_experimental.text_splitter import SemanticChunker
from langflow.base.textsplitters.model import LCTextSplitterComponent
from langflow.io import (
DropdownInput,
FloatInput,
HandleInput,
IntInput,
MessageTextInput,
Output,
)
from langflow.schema import Data


class SemanticTextSplitterComponent(LCTextSplitterComponent):
"""Split text into semantically meaningful chunks using semantic similarity."""
display_name: str = "Semantic Text Splitter"
name: str = "SemanticTextSplitter"
description: str = "Split text into semantically meaningful chunks using semantic similarity."
documentation = "https://python.langchain.com/docs/how_to/semantic-chunker/"
beta = True # this component is beta because it is imported from langchain_experimental
icon = "LangChain"
inputs = [
HandleInput(
name="data_inputs",
display_name="Data Inputs",
info="List of Data objects containing text and metadata to split.",
input_types=["Data"],
is_list=True,
),
HandleInput(
name="embeddings",
display_name="Embeddings",
info="Embeddings model to use for semantic similarity. Required.",
input_types=["Embeddings"],
is_list=False,
),
DropdownInput(
name="breakpoint_threshold_type",
display_name="Breakpoint Threshold Type",
info=(
"Method to determine breakpoints. Options: 'percentile', "
"'standard_deviation', 'interquartile'. Defaults to 'percentile'."
),
value="percentile",
options=["percentile", "standard_deviation", "interquartile"],
),
FloatInput(
name="breakpoint_threshold_amount",
display_name="Breakpoint Threshold Amount",
info="Numerical amount for the breakpoint threshold.",
value=0.5,
),
IntInput(
name="number_of_chunks",
display_name="Number of Chunks",
info="Number of chunks to split the text into.",
value=5,
),
MessageTextInput(
name="sentence_split_regex",
display_name="Sentence Split Regex",
info="Regular expression to split sentences. Optional.",
value="",
advanced=True,
),
IntInput(
name="buffer_size",
display_name="Buffer Size",
info="Size of the buffer.",
value=0,
advanced=True,
),
    ]

    outputs = [
        Output(display_name="Chunks", name="chunks", method="split_text"),
    ]

def _docs_to_data(self, docs: list[Document]) -> list[Data]:
"""Convert a list of Document objects to Data objects."""
        return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]

def split_text(self) -> list[Data]:
"""Split the input data into semantically meaningful chunks."""
try:
embeddings = getattr(self, "embeddings", None)
if embeddings is None:
error_msg = "An embeddings model is required for SemanticTextSplitter."
raise ValueError(error_msg)
if not self.data_inputs:
error_msg = "Data inputs cannot be empty."
raise ValueError(error_msg)
documents = []
for _input in self.data_inputs:
if isinstance(_input, Data):
documents.append(_input.to_lc_document())
else:
error_msg = f"Invalid data input type: {_input}"
raise TypeError(error_msg)
if not documents:
error_msg = "No valid Data objects found in data_inputs."
raise ValueError(error_msg)
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
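            # Assemble the SemanticChunker arguments. When number_of_chunks is
            # set, the chunker derives its breakpoint threshold from it,
            # overriding breakpoint_threshold_type/amount.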
splitter_params = {
"embeddings": embeddings,
"breakpoint_threshold_type": self.breakpoint_threshold_type or "percentile",
"breakpoint_threshold_amount": self.breakpoint_threshold_amount,
"number_of_chunks": self.number_of_chunks,
"buffer_size": self.buffer_size,
}
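            # Pass the regex only when one was provided, so SemanticChunker
            # keeps its default sentence-splitting pattern otherwise.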
if self.sentence_split_regex:
splitter_params["sentence_split_regex"] = self.sentence_split_regex
splitter = SemanticChunker(**splitter_params)
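            # create_documents splits each text and copies the matching
            # metadata dict onto every chunk produced from that text.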
docs = splitter.create_documents(texts, metadatas=metadatas)
data = self._docs_to_data(docs)
self.status = data
except Exception as e:
error_msg = f"An error occurred during semantic splitting: {e}"
raise RuntimeError(error_msg) from e
else:
return data
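

# --- Illustrative usage sketch (not part of the component) ---
# This exercises the underlying SemanticChunker directly, since the component's
# split_text() forwards its inputs to the same API. FakeEmbeddings (random
# vectors) from langchain_community is a stand-in; substitute a real embeddings
# model such as OpenAIEmbeddings to get meaningful semantic splits.
if __name__ == "__main__":
    from langchain_community.embeddings import FakeEmbeddings

    chunker = SemanticChunker(
        embeddings=FakeEmbeddings(size=256),
        breakpoint_threshold_type="percentile",
    )
    sample = (
        "Semantic chunking groups sentences by meaning. "
        "Similar sentences end up in the same chunk. "
        "Unrelated topics, like cooking pasta, start a new chunk."
    )
    for doc in chunker.create_documents([sample]):
        print(repr(doc.page_content))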