Spaces:
Running
Running
from langchain_unstructured import UnstructuredLoader | |
from langflow.base.data import BaseFileComponent | |
from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput | |
from langflow.schema import Data | |
class UnstructuredComponent(BaseFileComponent):
    """File component that sends files to the Unstructured.io API via ``UnstructuredLoader``.

    The loader is always invoked with ``partition_via_api=True`` and the
    configured API key; the API URL, chunking strategy and any additional
    loader arguments are optional.
    """

    display_name = "Unstructured API"
    description = (
        "Uses Unstructured.io API to extract clean text from raw source documents. "
        "Supports a wide range of file types."
    )
    documentation = (
        "https://python.langchain.com/api_reference/unstructured/document_loaders/"
        "langchain_unstructured.document_loaders.UnstructuredLoader.html"
    )
    trace_type = "tool"
    icon = "Unstructured"
    name = "Unstructured"

    # https://docs.unstructured.io/api-reference/api-services/overview#supported-file-types
    # NOTE: each extension is listed exactly once ("png" used to appear twice).
    VALID_EXTENSIONS = [
        "bmp",
        "csv",
        "doc",
        "docx",
        "eml",
        "epub",
        "heic",
        "html",
        "jpeg",
        "png",
        "md",
        "msg",
        "odt",
        "org",
        "p7s",
        "pdf",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "tiff",
        "txt",
        "tsv",
        "xls",
        "xlsx",
        "xml",
    ]

    inputs = [
        *BaseFileComponent._base_inputs,
        SecretStrInput(
            name="api_key",
            display_name="Unstructured.io Serverless API Key",
            required=True,
            info="Unstructured API Key. Create at: https://app.unstructured.io/",
        ),
        MessageTextInput(
            name="api_url",
            display_name="Unstructured.io API URL",
            required=False,
            info="Unstructured API URL.",
        ),
        DropdownInput(
            name="chunking_strategy",
            display_name="Chunking Strategy",
            info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking",
            options=["", "basic", "by_title", "by_page", "by_similarity"],
            real_time_refresh=False,
            value="",
        ),
        NestedDictInput(
            name="unstructured_args",
            display_name="Additional Arguments",
            required=False,
            info=(
                "Optional dictionary of additional arguments to the Loader. "
                "See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information."
            ),
        ),
    ]

    outputs = [
        *BaseFileComponent._base_outputs,
    ]

    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
        """Load the given files through ``UnstructuredLoader`` and roll the results up.

        Args:
            file_list: Files to process. Entries whose ``path`` is falsy are
                not sent to the loader.

        Returns:
            ``file_list`` unchanged when no entry has a usable path; otherwise
            the result of ``self.rollup_data(file_list, processed_data)``.
        """
        file_paths = [str(file.path) for file in file_list if file.path]

        if not file_paths:
            self.log("No files to process.")
            return file_list

        # https://docs.unstructured.io/api-reference/api-services/api-parameters
        # Work on a shallow copy: writing into `self.unstructured_args` directly
        # would inject the secret API key (and the other settings below) into the
        # user-provided "Additional Arguments" input value.
        args = dict(self.unstructured_args or {})

        if self.chunking_strategy:
            args["chunking_strategy"] = self.chunking_strategy

        # These always override same-named keys from the additional arguments.
        args["api_key"] = self.api_key
        args["partition_via_api"] = True
        if self.api_url:
            args["url"] = self.api_url

        loader = UnstructuredLoader(
            file_paths,
            **args,
        )

        documents = loader.load()

        processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents]

        # Rename the `source` field to `self.SERVER_FILE_PATH_FIELDNAME`, to avoid conflicts with the `source` field
        for data in processed_data:
            if data and "source" in data.data:
                data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source")

        return self.rollup_data(file_list, processed_data)