# documents loader function
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup
from validators import url as url_validator
from langchain_core.documents import Document
import time
import logging
import sys

logger = logging.getLogger(__name__)


def load_docs_from_urls(
    urls: list = ["https://docs.python.org/3/"],
    max_depth: int = 5,
) -> list[Document]:
    """
    Load documents from a list of URLs.

    ## Args:
        urls (list, optional): A list of URLs to load documents from. Defaults to ["https://docs.python.org/3/"].
        max_depth (int, optional): Maximum depth to recursively load documents from each URL. Defaults to 5.

    ## Returns:
        list: A list of documents loaded from the given URLs.

    ## Raises:
        ValueError: If any URL in the provided list is invalid.
    """
    stf = time.time()  # Start time for the whole run
    docs = []
    for url in urls:
        if not url_validator(url):
            raise ValueError(f"Invalid URL: {url}")
        try:
            st = time.time()  # Start time for this URL
            loader = RecursiveUrlLoader(
                url=url,
                max_depth=max_depth,
                extractor=lambda x: Soup(x, "html.parser").text,
            )
            docs.extend(loader.load())
            et = time.time() - st  # Time taken to download documents from this URL
            log_message = f"Time taken for downloading documents from {url}: {et} seconds."
            logger.info(log_message)
            print(log_message)
        except Exception as e:
            log_message = f"Failed to load or parse the URL {url}. Error: {e}"
            logger.error(log_message)
            print(log_message, file=sys.stderr)
    etf = time.time() - stf  # Total time taken to scrape all URLs
    print(f"Total time taken for downloading {len(docs)} documents: {etf} seconds.")
    return docs
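
# A minimal usage sketch, assuming this module is executed directly. The URL list
# and max_depth below are illustrative values, not requirements of the loader.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    documents = load_docs_from_urls(
        urls=["https://docs.python.org/3/"],  # any list of valid, reachable URLs works
        max_depth=2,  # a shallow crawl keeps the example quick
    )
    print(f"Loaded {len(documents)} documents.")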