# Commit d202ada by Tai Truong: "fix readme"
from spider.spider import Spider
from langflow.base.langchain_utilities.spider_constants import MODES
from langflow.custom import Component
from langflow.io import (
BoolInput,
DictInput,
DropdownInput,
IntInput,
Output,
SecretStrInput,
StrInput,
)
from langflow.schema import Data
class SpiderTool(Component):
    """Langflow component that wraps the Spider (spider.cloud) API.

    Exposes two modes of operation — "scrape" (single page) and "crawl"
    (follow links up to configurable limit/depth) — and returns the
    fetched pages as a list of ``Data`` records in markdown form.
    """

    display_name: str = "Spider Web Crawler & Scraper"
    description: str = "Spider API for web crawling and scraping."
    output_types: list[str] = ["Document"]
    documentation: str = "https://spider.cloud/docs/api"

    inputs = [
        SecretStrInput(
            name="spider_api_key",
            display_name="Spider API Key",
            required=True,
            password=True,
            info="The Spider API Key, get it from https://spider.cloud",
        ),
        StrInput(
            name="url",
            display_name="URL",
            required=True,
            info="The URL to scrape or crawl",
        ),
        DropdownInput(
            name="mode",
            display_name="Mode",
            required=True,
            options=MODES,
            value=MODES[0],
            info="The mode of operation: scrape or crawl",
        ),
        IntInput(
            name="limit",
            display_name="Limit",
            info="The maximum amount of pages allowed to crawl per website. Set to 0 to crawl all pages.",
            advanced=True,
        ),
        IntInput(
            name="depth",
            display_name="Depth",
            info="The crawl limit for maximum depth. If 0, no limit will be applied.",
            advanced=True,
        ),
        StrInput(
            name="blacklist",
            display_name="Blacklist",
            info="Blacklist paths that you do not want to crawl. Use Regex patterns.",
            advanced=True,
        ),
        StrInput(
            name="whitelist",
            display_name="Whitelist",
            info="Whitelist paths that you want to crawl, ignoring all other routes. Use Regex patterns.",
            advanced=True,
        ),
        BoolInput(
            name="readability",
            display_name="Use Readability",
            info="Use readability to pre-process the content for reading.",
            advanced=True,
        ),
        IntInput(
            name="request_timeout",
            display_name="Request Timeout",
            info="Timeout for the request in seconds.",
            advanced=True,
        ),
        BoolInput(
            name="metadata",
            display_name="Metadata",
            info="Include metadata in the response.",
            advanced=True,
        ),
        DictInput(
            name="params",
            display_name="Additional Parameters",
            info="Additional parameters to pass to the API. If provided, other inputs will be ignored.",
        ),
    ]

    outputs = [
        Output(display_name="Markdown", name="content", method="crawl"),
    ]

    def crawl(self) -> list[Data]:
        """Run the configured scrape/crawl against ``self.url``.

        Returns:
            A list of ``Data`` records, each carrying ``content`` and
            ``url`` keys (plus ``metadata`` when the metadata toggle is on).

        Raises:
            ValueError: If ``self.mode`` is neither "scrape" nor "crawl".
        """
        if self.params:
            # Copy the caller-supplied dict so the "limit" override in
            # scrape mode never mutates the user's own parameters object.
            parameters = dict(self.params["data"])
        else:
            # Falsy numeric/string inputs (0, "") are treated as "unset"
            # and sent as None so the API applies its own defaults.
            parameters = {
                "limit": self.limit or None,
                "depth": self.depth or None,
                "blacklist": self.blacklist or None,
                "whitelist": self.whitelist or None,
                "readability": self.readability,
                "request_timeout": self.request_timeout or None,
                "metadata": self.metadata,
                "return_format": "markdown",
            }

        app = Spider(api_key=self.spider_api_key)
        if self.mode == "scrape":
            # A scrape is a single-page fetch, so force the page limit to 1.
            parameters["limit"] = 1
            result = app.scrape_url(self.url, parameters)
        elif self.mode == "crawl":
            result = app.crawl_url(self.url, parameters)
        else:
            msg = f"Invalid mode: {self.mode}. Must be 'scrape' or 'crawl'."
            raise ValueError(msg)

        # Hoist the loop-invariant metadata check: pick the record keys once.
        keys = ("content", "url", "metadata") if self.metadata else ("content", "url")
        return [Data(data={key: record[key] for key in keys}) for record in result]
class SpiderToolError(Exception):
    """Raised when the Spider tool encounters an error."""