File size: 1,367 Bytes
13fbd2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from langchain_community.document_loaders import WebBaseLoader
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article
from llama_index.core.llms import ChatMessage
import httpx
from bs4 import BeautifulSoup

def load_web(url: str):
    """Load the page at ``url`` through LangChain's ``WebBaseLoader``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` returns.
    """
    return WebBaseLoader(url).load()

def llama_load_web(url: str):
    """Load the page at ``url`` through LlamaIndex's ``SimpleWebPageReader``.

    The reader is created with ``html_to_text=True`` and is handed a
    one-element list containing ``url``.

    Args:
        url: Address of the page to load.

    Returns:
        The result of ``SimpleWebPageReader.load_data`` for that single URL.
    """
    reader = SimpleWebPageReader(html_to_text=True)
    return reader.load_data([url])

def newspaper_load_web(url: str):
    """Download and parse an article with newspaper's ``Article``.

    This is a best-effort loader: any exception raised while downloading or
    parsing is logged and converted into an empty-string result instead of
    propagating to the caller.

    Args:
        url: Address of the article page.

    Returns:
        A dict with ``"title"`` and ``"text"`` keys on success, or ``""`` if
        downloading or parsing raised.
    """
    import logging

    article = Article(url)
    try:
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
        }
    except Exception:
        # Keep the historical "" sentinel so existing callers still work,
        # but record the failure (with traceback) instead of hiding it.
        logging.getLogger(__name__).warning(
            "newspaper failed to load %s", url, exc_info=True
        )
        return ""
    
def html2text(url: str):
    """Convert a web page to text using ``HTML2Text`` with links ignored.

    ``HTML2Text.handle`` converts HTML markup; it does not fetch anything.
    When ``url`` is an http(s) address the page is therefore downloaded
    first and its body is converted. Any other string is treated as HTML
    markup and converted directly, which preserves the previous behavior
    for callers that already pass markup.

    Args:
        url: An ``http://`` / ``https://`` address, or a string of HTML.

    Returns:
        The text produced by ``HTML2Text.handle``.
    """
    import httpx
    from html2text import HTML2Text

    if url.lstrip().lower().startswith(("http://", "https://")):
        # Fetch the document; converting the URL string itself would just
        # echo the URL back.
        markup = httpx.get(url).text
    else:
        markup = url

    converter = HTML2Text()
    converter.ignore_links = True
    return converter.handle(markup)

def httpxs(url: str):
    """Fetch ``url`` with ``httpx.get`` and return the response text.

    Args:
        url: Address to request.

    Returns:
        The ``text`` attribute of the ``httpx`` response.
    """
    import httpx

    response = httpx.get(url)
    body = response.text
    return body

if __name__ == "__main__":
    # Manual smoke test: fetch a page and print the text of its <p> elements.
    import sys

    # The page to fetch comes from the first command-line argument; a neutral
    # placeholder is used when none is given, rather than a hard-coded site.
    url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/"
    response = httpx.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract each paragraph's text once and drop the empty ones.
    paragraph_texts = [
        text for text in (p.get_text() for p in soup.find_all("p")) if text != ""
    ]
    print(paragraph_texts)
    text_content = "\n".join(paragraph_texts)
    print(f"text_content: {text_content}")