Spaces:

opendigital
/

agent-flow

Running

agent-flow / src /backend /base /langflow /components /unstructured /unstructured.py

Tai Truong

fix readme

d202ada 4 months ago

3.81 kB

	from langchain_unstructured import UnstructuredLoader

	from langflow.base.data import BaseFileComponent
	from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput
	from langflow.schema import Data


	class UnstructuredComponent(BaseFileComponent):
	    """File component that partitions documents through the Unstructured.io API.

	    The files handed to ``process_files`` are passed to LangChain's
	    ``UnstructuredLoader`` with ``partition_via_api=True``; each loaded
	    document is converted to a ``Data`` object and merged back into the
	    incoming file list via ``rollup_data``.
	    """

	    display_name = "Unstructured API"
	    description = (
	        "Uses Unstructured.io API to extract clean text from raw source documents. "
	        "Supports a wide range of file types."
	    )
	    documentation = (
	        "https://python.langchain.com/api_reference/unstructured/document_loaders/"
	        "langchain_unstructured.document_loaders.UnstructuredLoader.html"
	    )
	    trace_type = "tool"
	    icon = "Unstructured"
	    name = "Unstructured"

	    # https://docs.unstructured.io/api-reference/api-services/overview#supported-file-types
	    # NOTE(review): the list previously contained "png" twice, the first one
	    # sitting right after "jpeg" where "jpg" belongs alphabetically. It is
	    # replaced with "jpg" here -- confirm against the supported-file-types page.
	    VALID_EXTENSIONS = [
	        "bmp",
	        "csv",
	        "doc",
	        "docx",
	        "eml",
	        "epub",
	        "heic",
	        "html",
	        "jpeg",
	        "jpg",
	        "md",
	        "msg",
	        "odt",
	        "org",
	        "p7s",
	        "pdf",
	        "png",
	        "ppt",
	        "pptx",
	        "rst",
	        "rtf",
	        "tiff",
	        "txt",
	        "tsv",
	        "xls",
	        "xlsx",
	        "xml",
	    ]

	    inputs = [
	        *BaseFileComponent._base_inputs,
	        SecretStrInput(
	            name="api_key",
	            display_name="Unstructured.io Serverless API Key",
	            required=True,
	            info="Unstructured API Key. Create at: https://app.unstructured.io/",
	        ),
	        MessageTextInput(
	            name="api_url",
	            display_name="Unstructured.io API URL",
	            required=False,
	            info="Unstructured API URL.",
	        ),
	        DropdownInput(
	            name="chunking_strategy",
	            display_name="Chunking Strategy",
	            info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking",
	            options=["", "basic", "by_title", "by_page", "by_similarity"],
	            real_time_refresh=False,
	            value="",
	        ),
	        NestedDictInput(
	            name="unstructured_args",
	            display_name="Additional Arguments",
	            required=False,
	            info=(
	                "Optional dictionary of additional arguments to the Loader. "
	                "See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information."
	            ),
	        ),
	    ]

	    outputs = [
	        *BaseFileComponent._base_outputs,
	    ]

	    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
	        """Partition the given files with the Unstructured API.

	        Args:
	            file_list: Files to process. Entries whose ``path`` is falsy are
	                not sent to the loader.

	        Returns:
	            ``file_list`` unchanged when no entry has a path; otherwise the
	            result of ``self.rollup_data(file_list, processed_data)``.
	        """
	        file_paths = [str(file.path) for file in file_list if file.path]

	        if not file_paths:
	            self.log("No files to process.")
	            return file_list

	        # https://docs.unstructured.io/api-reference/api-services/api-parameters
	        # Work on a copy: the keys added below (notably the secret API key) must
	        # not be written back into the component's own `unstructured_args` input.
	        args = dict(self.unstructured_args or {})

	        if self.chunking_strategy:
	            args["chunking_strategy"] = self.chunking_strategy

	        # Component-level settings take precedence over same-named user arguments.
	        args["api_key"] = self.api_key
	        args["partition_via_api"] = True
	        if self.api_url:
	            args["url"] = self.api_url

	        loader = UnstructuredLoader(
	            file_paths,
	            **args,
	        )

	        documents = loader.load()

	        processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents]

	        # Rename the `source` field to `self.SERVER_FILE_PATH_FIELDNAME`, to avoid conflicts with the `source` field
	        for data in processed_data:
	            if data and "source" in data.data:
	                data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source")

	        return self.rollup_data(file_list, processed_data)