File size: 1,367 Bytes
13fbd2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
from langchain_community.document_loaders import WebBaseLoader
from llama_index.readers.web import SimpleWebPageReader
from newspaper import Article
from llama_index.core.llms import ChatMessage
import httpx
from bs4 import BeautifulSoup
def load_web(url: str):
    """Load the page at ``url`` through LangChain's ``WebBaseLoader``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``WebBaseLoader(url).load()`` returns, unmodified.
    """
    return WebBaseLoader(url).load()
def llama_load_web(url: str):
    """Load the page at ``url`` through LlamaIndex's ``SimpleWebPageReader``.

    The reader is created with ``html_to_text=True`` and is handed a
    single-element list containing ``url``.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``SimpleWebPageReader.load_data`` returns, unmodified.
    """
    reader = SimpleWebPageReader(html_to_text=True)
    return reader.load_data([url])
def newspaper_load_web(url: str):
    """Download and parse the article at ``url`` with newspaper's ``Article``.

    This is a best-effort loader: any exception raised while downloading or
    parsing is logged and converted into an empty-string result instead of
    propagating to the caller.

    Args:
        url: Address of the article to fetch.

    Returns:
        A dict with ``"title"`` and ``"text"`` keys on success, or ``""`` when
        the download or parse step raised. The empty-string failure value is
        kept for backward compatibility, so callers must check the result
        before indexing into it.
    """
    import logging

    article = Article(url)
    try:
        article.download()
        article.parse()
        return {
            "title": article.title,
            "text": article.text,
        }
    except Exception:
        # Keep the original best-effort contract (return "" on any failure),
        # but record what went wrong instead of swallowing it silently.
        logging.getLogger(__name__).exception(
            "newspaper failed to load %s", url
        )
        return ""
def html2text(url: str):
    """Convert a web page to plain text with ``HTML2Text`` (links ignored).

    The original implementation passed ``url`` straight to
    ``HTML2Text.handle``, which converts the URL string itself rather than
    the page it points to. Now, when ``url`` starts with ``http://`` or
    ``https://`` the page is fetched with ``httpx`` and the response body is
    converted. Any other string is converted as-is, so callers that already
    passed raw HTML markup keep their previous behaviour.

    Args:
        url: An http(s) URL to fetch, or a string of HTML markup.

    Returns:
        The text produced by ``HTML2Text.handle`` with ``ignore_links`` set.
    """
    import httpx
    from html2text import HTML2Text

    if url.lstrip().lower().startswith(("http://", "https://")):
        # A real address: download the page so that its markup gets converted.
        markup = httpx.get(url.strip()).text
    else:
        # Not a URL: treat the argument as markup (previous behaviour).
        markup = url

    converter = HTML2Text()
    converter.ignore_links = True
    return converter.handle(markup)
def httpxs(url: str):
    """Fetch ``url`` with ``httpx.get`` and return the response's ``text``.

    Args:
        url: Address to request.

    Returns:
        The ``text`` attribute of the response returned by ``httpx.get``.
    """
    import httpx

    return httpx.get(url).text
if __name__ == "__main__":
    # Manual smoke test: fetch one page and print the text of its <p> elements,
    # first as a list and then joined with newlines.
    import sys

    # The target comes from the command line so the demo is not tied to one
    # hard-coded site; a neutral placeholder page is used when none is given.
    url = sys.argv[1] if len(sys.argv) > 1 else "https://example.com"
    response = httpx.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract each paragraph's text once and drop the empty ones; the list is
    # reused for both prints instead of calling get_text() repeatedly.
    extracted = (p.get_text() for p in soup.find_all("p"))
    paragraphs = [text for text in extracted if text != ""]

    print(paragraphs)
    text_content = "\n".join(paragraphs)
    print(f"text_content: {text_content}")
|