# Page header captured along with this file ("Spaces: Running"); not part of the module.
# Standard library.
import re

# Third-party: document loaders used to fetch pages, plus the Langflow
# component base class, I/O field types and data schema.
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader
from langflow.custom import Component
from langflow.helpers.data import data_to_text
from langflow.io import DropdownInput, MessageTextInput, Output
from langflow.schema import Data
from langflow.schema.message import Message
class URLComponent(Component):
    """Component that fetches the content of one or more URLs.

    Two outputs are exposed: ``fetch_content`` returns one ``Data`` object per
    loaded document, and ``fetch_content_text`` renders those objects into a
    single ``Message``. The ``format`` input selects which loader is used
    (``WebBaseLoader`` for "Text", ``AsyncHtmlLoader`` for "Raw HTML").
    """

    display_name = "URL"
    description = "Fetch content from one or more URLs."
    icon = "layout-template"
    name = "URL"

    inputs = [
        MessageTextInput(
            name="urls",
            display_name="URLs",
            info="Enter one or more URLs, by clicking the '+' button.",
            is_list=True,
            tool_mode=True,
        ),
        DropdownInput(
            name="format",
            display_name="Output Format",
            info="Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
            options=["Text", "Raw HTML"],
            value="Text",
        ),
    ]

    outputs = [
        Output(display_name="Data", name="data", method="fetch_content"),
        Output(display_name="Text", name="text", method="fetch_content_text"),
    ]

    def ensure_url(self, string: str) -> str:
        """Return ``string`` as a URL, adding 'http://' when it has no HTTP(S) scheme.

        The scheme check is case-insensitive, so 'HTTP://host' is left as is.
        The result is then checked against a basic URL pattern.

        Parameters:
            string (str): The string to be checked and possibly modified.

        Returns:
            str: The (possibly prefixed) string that passed validation.

        Raises:
            ValueError: If the string is not a valid URL.
        """
        # Compare on a lower-cased copy: the validation regex is
        # case-insensitive, and a case-sensitive check here would turn
        # "HTTP://host" into "http://HTTP://host", which is then rejected.
        if not string.lower().startswith(("http://", "https://")):
            string = "http://" + string

        # Basic URL validation regex
        url_regex = re.compile(
            r"^(https?:\/\/)?"  # optional protocol
            r"(www\.)?"  # optional www
            r"([a-zA-Z0-9.-]+)"  # domain
            r"(\.[a-zA-Z]{2,})?"  # top-level domain
            r"(:\d+)?"  # optional port
            # Optional path, query string or fragment. Accepting '?' and '#'
            # here lets URLs such as "http://host?q=1" through; they have no
            # path component but are still valid.
            r"([\/?#][^\s]*)?$",
            re.IGNORECASE,
        )
        if not url_regex.match(string):
            msg = f"Invalid URL: {string}"
            raise ValueError(msg)
        return string

    def fetch_content(self) -> list[Data]:
        """Load every configured URL and return one ``Data`` per document.

        Blank entries are skipped and each remaining entry is normalised with
        ``ensure_url``. When nothing usable remains, an empty list is returned
        without invoking a loader.

        Returns:
            list[Data]: One item per loaded document, with the page content as
            ``text`` and the document metadata as extra fields.

        Raises:
            ValueError: If any non-blank entry is not a valid URL.
        """
        raw_urls = self.urls
        # A bare string must be wrapped, otherwise the loop below would walk
        # it character by character and treat each character as a URL.
        # NOTE(review): presumably tool-mode callers can pass a single
        # string here - confirm against the caller.
        if isinstance(raw_urls, str):
            raw_urls = [raw_urls]

        urls = [self.ensure_url(url.strip()) for url in raw_urls or [] if url.strip()]
        if not urls:
            # Nothing to fetch: skip the loaders entirely.
            self.status = []
            return []

        if self.format == "Raw HTML":
            loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
        else:
            loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

        docs = loader.load()
        data = [Data(text=doc.page_content, **doc.metadata) for doc in docs]
        self.status = data
        return data

    def fetch_content_text(self) -> Message:
        """Fetch the configured URLs and return their text as one ``Message``.

        Returns:
            Message: The result of formatting the fetched ``Data`` items with
            the "{text}" template.
        """
        data = self.fetch_content()
        result_string = data_to_text("{text}", data)
        self.status = result_string
        return Message(text=result_string)