import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Set up the Hugging Face model and tokenizer for token-level content extraction.
# Note: distilbert-base-uncased ships without a fine-tuned classification head,
# so its per-token predictions are untrained; swap in a fine-tuned checkpoint
# for meaningful results.
model_name = "distilbert-base-uncased"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def scrape_website(url):
    # Send an HTTP request to the website and fail fast on errors
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the visible text content from the HTML
    text = soup.get_text(separator=' ', strip=True)
    # Tokenize and truncate the text to the model's 512-token limit
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Classify each token; keep the tokens the model labels as content (label 1)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1)[0]
    kept_ids = inputs['input_ids'][0][predictions == 1]
    # Convert the kept token IDs back into a string
    content_str = tokenizer.decode(kept_ids, skip_special_tokens=True)
    return content_str
# Define a function to scrape multiple URLs
def scrape_multiple_websites(urls):
    contents = []
    for url in urls:
        content = scrape_website(url)
        contents.append(content)
    # Join the contents of the individual URLs
    joined_content = '\n\n'.join(contents)
    return joined_content
# Example usage: scrape a single URL
url = "https://www.example.com"
content = scrape_website(url)
print(content)

# Example usage: scrape multiple URLs
urls = ["https://www.example.com", "https://www.example.org"]
content = scrape_multiple_websites(urls)
print(content)
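One caveat with the batch helper: `scrape_multiple_websites` as written aborts on the first URL that raises a request error, losing the rest of the batch. Below is a minimal sketch of a more tolerant variant; the name `scrape_multiple_websites_safe` is purely illustrative, and it reuses the `scrape_website` function defined above.

# A minimal sketch: skip failing URLs instead of aborting the whole batch.
# The _safe suffix is illustrative, not part of any library API.
def scrape_multiple_websites_safe(urls):
    contents = []
    for url in urls:
        try:
            contents.append(scrape_website(url))
        except requests.RequestException as exc:
            # Covers timeouts, connection errors, and non-2xx responses
            # surfaced by raise_for_status()
            print(f"Skipping {url}: {exc}")
    return '\n\n'.join(contents)

Catching `requests.RequestException` (the base class for the library's errors, including the `HTTPError` raised by `raise_for_status()`) keeps one bad URL from discarding every successful scrape in the batch.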