Tai Truong
fix readme
d202ada
from langchain_unstructured import UnstructuredLoader
from langflow.base.data import BaseFileComponent
from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput
from langflow.schema import Data
class UnstructuredComponent(BaseFileComponent):
    """File component that loads documents through the Unstructured.io API.

    The component passes the resolved file paths to
    ``langchain_unstructured.UnstructuredLoader`` with ``partition_via_api``
    enabled, converts every returned document into a ``Data`` object and hands
    the result to ``BaseFileComponent.rollup_data``.
    """

    display_name = "Unstructured API"
    description = (
        "Uses Unstructured.io API to extract clean text from raw source documents. "
        "Supports a wide range of file types."
    )
    documentation = (
        "https://python.langchain.com/api_reference/unstructured/document_loaders/"
        "langchain_unstructured.document_loaders.UnstructuredLoader.html"
    )
    trace_type = "tool"
    icon = "Unstructured"
    name = "Unstructured"

    # File extensions accepted by this component (each listed exactly once).
    # https://docs.unstructured.io/api-reference/api-services/overview#supported-file-types
    VALID_EXTENSIONS = [
        "bmp",
        "csv",
        "doc",
        "docx",
        "eml",
        "epub",
        "heic",
        "html",
        "jpeg",
        "md",
        "msg",
        "odt",
        "org",
        "p7s",
        "pdf",
        "png",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "tiff",
        "txt",
        "tsv",
        "xls",
        "xlsx",
        "xml",
    ]

    # Inputs inherited from the base file component, followed by the
    # Unstructured-specific settings read in `process_files`.
    inputs = [
        *BaseFileComponent._base_inputs,
        SecretStrInput(
            name="api_key",
            display_name="Unstructured.io Serverless API Key",
            required=True,
            info="Unstructured API Key. Create at: https://app.unstructured.io/",
        ),
        MessageTextInput(
            name="api_url",
            display_name="Unstructured.io API URL",
            required=False,
            info="Unstructured API URL.",
        ),
        DropdownInput(
            name="chunking_strategy",
            display_name="Chunking Strategy",
            info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking",
            options=["", "basic", "by_title", "by_page", "by_similarity"],
            real_time_refresh=False,
            value="",
        ),
        NestedDictInput(
            name="unstructured_args",
            display_name="Additional Arguments",
            required=False,
            info=(
                "Optional dictionary of additional arguments to the Loader. "
                "See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information."
            ),
        ),
    ]

    outputs = [
        *BaseFileComponent._base_outputs,
    ]

    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
        """Load the given files via the Unstructured API and attach the results.

        Args:
            file_list: Files to process. Entries without a ``path`` are not
                sent to the loader.

        Returns:
            ``file_list`` unchanged when no entry has a path; otherwise the
            value returned by ``self.rollup_data(file_list, processed_data)``.
        """
        file_paths = [str(file.path) for file in file_list if file.path]

        if not file_paths:
            self.log("No files to process.")
            return file_list

        # https://docs.unstructured.io/api-reference/api-services/api-parameters
        # Work on a shallow copy: writing into `self.unstructured_args` itself
        # would inject the secret API key (and the forced settings below) into
        # the component's user-provided input value.
        args = dict(self.unstructured_args or {})

        # Explicit component settings are applied after the user-supplied
        # arguments, so they take precedence over same-named keys.
        if self.chunking_strategy:
            args["chunking_strategy"] = self.chunking_strategy

        args["api_key"] = self.api_key
        args["partition_via_api"] = True
        if self.api_url:
            args["url"] = self.api_url

        loader = UnstructuredLoader(
            file_paths,
            **args,
        )

        documents = loader.load()

        processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents]

        # Move the `source` field to `self.SERVER_FILE_PATH_FIELDNAME` so it does
        # not conflict with another `source` field.
        for data in processed_data:
            if data and "source" in data.data:
                data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source")

        return self.rollup_data(file_list, processed_data)