Spaces:
Configuration error
Configuration error
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= | |
import uuid | |
import warnings | |
from typing import ( | |
IO, | |
TYPE_CHECKING, | |
Any, | |
Dict, | |
List, | |
Literal, | |
Optional, | |
Tuple, | |
Union, | |
) | |
if TYPE_CHECKING: | |
from unstructured.documents.elements import Element | |
import pdb | |
class UnstructuredIO: | |
r"""A class to handle various functionalities provided by the | |
Unstructured library, including version checking, parsing, cleaning, | |
extracting, staging, chunking data, and integrating with cloud | |
services like S3 and Azure for data connection. | |
References: | |
https://docs.unstructured.io/ | |
""" | |
def create_element_from_text( | |
text: str, | |
element_id: Optional[str] = None, | |
embeddings: Optional[List[float]] = None, | |
filename: Optional[str] = None, | |
file_directory: Optional[str] = None, | |
last_modified: Optional[str] = None, | |
filetype: Optional[str] = None, | |
parent_id: Optional[str] = None, | |
) -> "Element": | |
r"""Creates a Text element from a given text input, with optional | |
metadata and embeddings. | |
Args: | |
text (str): The text content for the element. | |
element_id (Optional[str], optional): Unique identifier for the | |
element. (default: :obj:`None`) | |
embeddings (List[float], optional): A list of float | |
numbers representing the text embeddings. | |
(default: :obj:`None`) | |
filename (Optional[str], optional): The name of the file the | |
element is associated with. (default: :obj:`None`) | |
file_directory (Optional[str], optional): The directory path where | |
the file is located. (default: :obj:`None`) | |
last_modified (Optional[str], optional): The last modified date of | |
the file. (default: :obj:`None`) | |
filetype (Optional[str], optional): The type of the file. | |
(default: :obj:`None`) | |
parent_id (Optional[str], optional): The identifier of the parent | |
element. (default: :obj:`None`) | |
Returns: | |
Element: An instance of Text with the provided content and | |
metadata. | |
""" | |
from unstructured.documents.elements import ElementMetadata, Text | |
metadata = ElementMetadata( | |
filename=filename, | |
file_directory=file_directory, | |
last_modified=last_modified, | |
filetype=filetype, | |
parent_id=parent_id, | |
) | |
return Text( | |
text=text, | |
element_id=element_id or str(uuid.uuid4()), | |
metadata=metadata, | |
embeddings=embeddings, | |
) | |
def parse_file_or_url( | |
input_path: str, | |
**kwargs: Any, | |
) -> Union[List["Element"], None]: | |
r"""Loads a file or a URL and parses its contents into elements. | |
Args: | |
input_path (str): Path to the file or URL to be parsed. | |
**kwargs: Extra kwargs passed to the partition function. | |
Returns: | |
Union[List[Element],None]: List of elements after parsing the file | |
or URL if success. | |
Raises: | |
FileNotFoundError: If the file does not exist at the path | |
specified. | |
Notes: | |
Supported file types: | |
"csv", "doc", "docx", "epub", "image", "md", "msg", "odt", | |
"org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx". | |
References: | |
https://unstructured-io.github.io/unstructured/ | |
""" | |
import os | |
from urllib.parse import urlparse | |
from unstructured.partition.auto import partition | |
# Check if the input is a URL | |
parsed_url = urlparse(input_path) | |
# pdb.set_trace() | |
is_url = all([parsed_url.scheme, parsed_url.netloc]) | |
# Handling URL | |
if is_url: | |
try: | |
elements = partition(url=input_path, **kwargs) | |
return elements | |
except Exception: | |
warnings.warn(f"Failed to parse the URL: {input_path}") | |
return None | |
# Handling file | |
else: | |
# Check if the file exists | |
if not os.path.exists(input_path): | |
raise FileNotFoundError( | |
f"The file {input_path} was not found." | |
) | |
# Read the file | |
try: | |
with open(input_path, "rb") as f: | |
elements = partition(file=f, **kwargs) | |
return elements | |
except Exception: | |
warnings.warn(f"Failed to partition the file: {input_path}") | |
return None | |
def parse_bytes( | |
file: IO[bytes], **kwargs: Any | |
) -> Union[List["Element"], None]: | |
r"""Parses a bytes stream and converts its contents into elements. | |
Args: | |
file (IO[bytes]): The file in bytes format to be parsed. | |
**kwargs: Extra kwargs passed to the partition function. | |
Returns: | |
Union[List[Element], None]: List of elements after parsing the file | |
if successful, otherwise `None`. | |
Notes: | |
Supported file types: | |
"csv", "doc", "docx", "epub", "image", "md", "msg", "odt", | |
"org", "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx". | |
References: | |
https://docs.unstructured.io/open-source/core-functionality/partitioning | |
""" | |
from unstructured.partition.auto import partition | |
try: | |
# Use partition to process the bytes stream | |
elements = partition(file=file, **kwargs) | |
return elements | |
except Exception as e: | |
warnings.warn(f"Failed to partition the file stream: {e}") | |
return None | |
def clean_text_data( | |
text: str, | |
clean_options: Optional[List[Tuple[str, Dict[str, Any]]]] = None, | |
) -> str: | |
r"""Cleans text data using a variety of cleaning functions provided by | |
the `unstructured` library. | |
This function applies multiple text cleaning utilities by calling the | |
`unstructured` library's cleaning bricks for operations like | |
replacing Unicode quotes, removing extra whitespace, dashes, non-ascii | |
characters, and more. | |
If no cleaning options are provided, a default set of cleaning | |
operations is applied. These defaults including operations | |
"replace_unicode_quotes", "clean_non_ascii_chars", | |
"group_broken_paragraphs", and "clean_extra_whitespace". | |
Args: | |
text (str): The text to be cleaned. | |
clean_options (dict): A dictionary specifying which cleaning | |
options to apply. The keys should match the names of the | |
cleaning functions, and the values should be dictionaries | |
containing the parameters for each function. Supported types: | |
'clean_extra_whitespace', 'clean_bullets', | |
'clean_ordered_bullets', 'clean_postfix', 'clean_prefix', | |
'clean_dashes', 'clean_trailing_punctuation', | |
'clean_non_ascii_chars', 'group_broken_paragraphs', | |
'remove_punctuation', 'replace_unicode_quotes', | |
'bytes_string_to_string', 'translate_text'. | |
Returns: | |
str: The cleaned text. | |
Raises: | |
AttributeError: If a cleaning option does not correspond to a | |
valid cleaning function in `unstructured`. | |
Notes: | |
The 'options' dictionary keys must correspond to valid cleaning | |
brick names from the `unstructured` library. | |
Each brick's parameters must be provided in a nested dictionary | |
as the value for the key. | |
References: | |
https://unstructured-io.github.io/unstructured/ | |
""" | |
from unstructured.cleaners.core import ( | |
bytes_string_to_string, | |
clean_bullets, | |
clean_dashes, | |
clean_extra_whitespace, | |
clean_non_ascii_chars, | |
clean_ordered_bullets, | |
clean_postfix, | |
clean_prefix, | |
clean_trailing_punctuation, | |
group_broken_paragraphs, | |
remove_punctuation, | |
replace_unicode_quotes, | |
) | |
from unstructured.cleaners.translate import translate_text | |
cleaning_functions: Any = { | |
"clean_extra_whitespace": clean_extra_whitespace, | |
"clean_bullets": clean_bullets, | |
"clean_ordered_bullets": clean_ordered_bullets, | |
"clean_postfix": clean_postfix, | |
"clean_prefix": clean_prefix, | |
"clean_dashes": clean_dashes, | |
"clean_trailing_punctuation": clean_trailing_punctuation, | |
"clean_non_ascii_chars": clean_non_ascii_chars, | |
"group_broken_paragraphs": group_broken_paragraphs, | |
"remove_punctuation": remove_punctuation, | |
"replace_unicode_quotes": replace_unicode_quotes, | |
"bytes_string_to_string": bytes_string_to_string, | |
"translate_text": translate_text, | |
} | |
# Define default clean options if none are provided | |
if clean_options is None: | |
clean_options = [ | |
("replace_unicode_quotes", {}), | |
("clean_non_ascii_chars", {}), | |
("group_broken_paragraphs", {}), | |
("clean_extra_whitespace", {}), | |
] | |
cleaned_text = text | |
for func_name, params in clean_options: | |
if func_name in cleaning_functions: | |
cleaned_text = cleaning_functions[func_name]( | |
cleaned_text, **params | |
) | |
else: | |
raise ValueError( | |
f"'{func_name}' is not a valid function in " | |
"`Unstructured IO`." | |
) | |
return cleaned_text | |
def extract_data_from_text( | |
text: str, | |
extract_type: Literal[ | |
'extract_datetimetz', | |
'extract_email_address', | |
'extract_ip_address', | |
'extract_ip_address_name', | |
'extract_mapi_id', | |
'extract_ordered_bullets', | |
'extract_text_after', | |
'extract_text_before', | |
'extract_us_phone_number', | |
], | |
**kwargs, | |
) -> Any: | |
r"""Extracts various types of data from text using functions from | |
unstructured.cleaners.extract. | |
Args: | |
text (str): Text to extract data from. | |
extract_type (Literal['extract_datetimetz', | |
'extract_email_address', 'extract_ip_address', | |
'extract_ip_address_name', 'extract_mapi_id', | |
'extract_ordered_bullets', 'extract_text_after', | |
'extract_text_before', 'extract_us_phone_number']): Type of | |
data to extract. | |
**kwargs: Additional keyword arguments for specific | |
extraction functions. | |
Returns: | |
Any: The extracted data, type depends on extract_type. | |
References: | |
https://unstructured-io.github.io/unstructured/ | |
""" | |
from unstructured.cleaners.extract import ( | |
extract_datetimetz, | |
extract_email_address, | |
extract_ip_address, | |
extract_ip_address_name, | |
extract_mapi_id, | |
extract_ordered_bullets, | |
extract_text_after, | |
extract_text_before, | |
extract_us_phone_number, | |
) | |
extraction_functions: Any = { | |
"extract_datetimetz": extract_datetimetz, | |
"extract_email_address": extract_email_address, | |
"extract_ip_address": extract_ip_address, | |
"extract_ip_address_name": extract_ip_address_name, | |
"extract_mapi_id": extract_mapi_id, | |
"extract_ordered_bullets": extract_ordered_bullets, | |
"extract_text_after": extract_text_after, | |
"extract_text_before": extract_text_before, | |
"extract_us_phone_number": extract_us_phone_number, | |
} | |
if extract_type not in extraction_functions: | |
raise ValueError(f"Unsupported extract_type: {extract_type}") | |
return extraction_functions[extract_type](text, **kwargs) | |
def stage_elements( | |
elements: List[Any], | |
stage_type: Literal[ | |
'convert_to_csv', | |
'convert_to_dataframe', | |
'convert_to_dict', | |
'dict_to_elements', | |
'stage_csv_for_prodigy', | |
'stage_for_prodigy', | |
'stage_for_baseplate', | |
'stage_for_datasaur', | |
'stage_for_label_box', | |
'stage_for_label_studio', | |
'stage_for_weaviate', | |
], | |
**kwargs, | |
) -> Union[str, List[Dict], Any]: | |
r"""Stages elements for various platforms based on the | |
specified staging type. | |
This function applies multiple staging utilities to format data | |
for different NLP annotation and machine learning tools. It uses | |
the 'unstructured.staging' module's functions for operations like | |
converting to CSV, DataFrame, dictionary, or formatting for | |
specific platforms like Prodigy, etc. | |
Args: | |
elements (List[Any]): List of Element objects to be staged. | |
stage_type (Literal['convert_to_csv', 'convert_to_dataframe', | |
'convert_to_dict', 'dict_to_elements', | |
'stage_csv_for_prodigy', 'stage_for_prodigy', | |
'stage_for_baseplate', 'stage_for_datasaur', | |
'stage_for_label_box', 'stage_for_label_studio', | |
'stage_for_weaviate']): Type of staging to perform. | |
**kwargs: Additional keyword arguments specific to | |
the staging type. | |
Returns: | |
Union[str, List[Dict], Any]: Staged data in the | |
format appropriate for the specified staging type. | |
Raises: | |
ValueError: If the staging type is not supported or a required | |
argument is missing. | |
References: | |
https://unstructured-io.github.io/unstructured/ | |
""" | |
from unstructured.staging import ( | |
base, | |
baseplate, | |
datasaur, | |
label_box, | |
label_studio, | |
prodigy, | |
weaviate, | |
) | |
staging_functions: Any = { | |
"convert_to_csv": base.convert_to_csv, | |
"convert_to_dataframe": base.convert_to_dataframe, | |
"convert_to_dict": base.convert_to_dict, | |
"dict_to_elements": base.dict_to_elements, | |
"stage_csv_for_prodigy": lambda els, | |
**kw: prodigy.stage_csv_for_prodigy(els, kw.get('metadata', [])), | |
"stage_for_prodigy": lambda els, **kw: prodigy.stage_for_prodigy( | |
els, kw.get('metadata', []) | |
), | |
"stage_for_baseplate": baseplate.stage_for_baseplate, | |
"stage_for_datasaur": lambda els, | |
**kw: datasaur.stage_for_datasaur(els, kw.get('entities', [])), | |
"stage_for_label_box": lambda els, | |
**kw: label_box.stage_for_label_box(els, **kw), | |
"stage_for_label_studio": lambda els, | |
**kw: label_studio.stage_for_label_studio(els, **kw), | |
"stage_for_weaviate": weaviate.stage_for_weaviate, | |
} | |
if stage_type not in staging_functions: | |
raise ValueError(f"Unsupported stage type: {stage_type}") | |
return staging_functions[stage_type](elements, **kwargs) | |
def chunk_elements( | |
elements: List["Element"], chunk_type: str, **kwargs | |
) -> List["Element"]: | |
r"""Chunks elements by titles. | |
Args: | |
elements (List[Element]): List of Element objects to be chunked. | |
chunk_type (str): Type chunk going to apply. Supported types: | |
'chunk_by_title'. | |
**kwargs: Additional keyword arguments for chunking. | |
Returns: | |
List[Dict]: List of chunked sections. | |
References: | |
https://unstructured-io.github.io/unstructured/ | |
""" | |
from unstructured.chunking.title import chunk_by_title | |
chunking_functions = { | |
"chunk_by_title": chunk_by_title, | |
} | |
if chunk_type not in chunking_functions: | |
raise ValueError(f"Unsupported chunk type: {chunk_type}") | |
# Format chunks into a list of dictionaries (or your preferred format) | |
return chunking_functions[chunk_type](elements, **kwargs) | |