# Document loader function
import logging
import sys
import time

from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents import Document
from validators import url as url_validator

logger = logging.getLogger(__name__)

def load_docs_from_urls(
        urls: list[str] | None = None,
        max_depth: int = 5,
        ) -> list[Document]:
    """
    Load documents from a list of URLs.

    Args:
        urls (list[str], optional): URLs to load documents from. Defaults to
            ["https://docs.python.org/3/"] when None.
        max_depth (int, optional): Maximum depth to recursively load documents
            from each URL. Defaults to 5.

    Returns:
        list[Document]: The documents loaded from the given URLs.

    Raises:
        ValueError: If any URL in the provided list is invalid.
    """
    if urls is None:
        urls = ["https://docs.python.org/3/"]  # default kept out of the signature to avoid a mutable default argument
    overall_start = time.time()  # Start time for the whole run
    docs: list[Document] = []
    for url in urls:
        if not url_validator(url):
            raise ValueError(f"Invalid URL: {url}")
        try:
            start = time.time()  # Start time for this URL
            loader = RecursiveUrlLoader(
                url=url,
                max_depth=max_depth,
                extractor=lambda x: Soup(x, "html.parser").text,
            )
            docs.extend(loader.load())

            elapsed = time.time() - start  # Time taken to download this URL
            log_message = f"Time taken for downloading documents from {url}: {elapsed:.2f} seconds."
            logger.info(log_message)
            print(log_message)
        except Exception as e:
            log_message = f"Failed to load or parse the URL {url}. Error: {e}"
            logger.error(log_message)
            print(log_message, file=sys.stderr)
    total_elapsed = time.time() - overall_start  # Time taken to scrape all URLs
    print(f'Total time taken for downloading {len(docs)} documents: {total_elapsed:.2f} seconds.')
    return docs
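

# Example usage: a minimal sketch for running this module directly. The URL and
# depth below are illustrative assumptions, not values required by the loader.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    documents = load_docs_from_urls(
        urls=["https://docs.python.org/3/"],
        max_depth=2,  # shallow crawl so the example finishes quickly
    )
    print(f"Loaded {len(documents)} documents.")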