Spaces:
Running
Running
File size: 4,060 Bytes
8efa796 706f4ae 8efa796 5f7a419 8efa796 b928ab9 706f4ae b92fcb4 706f4ae b928ab9 706f4ae b928ab9 81b8336 706f4ae b928ab9 706f4ae b928ab9 706f4ae b928ab9 706f4ae b928ab9 81b8336 b928ab9 8efa796 5f74ea1 faea8d5 5f74ea1 faea8d5 5f74ea1 faea8d5 81b8336 5f74ea1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# scraper.py
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import time
class Scraper:
    """Extract a page's title, links and text, statically or via a browser.

    Every member is a static method; the class only serves as a namespace.
    ``scrape`` is the main entry point: it first tries a plain HTTP request
    parsed with BeautifulSoup and falls back to a headless Chromium session
    (``power_scrapper_2``) when that fails or yields no links.
    """

    @staticmethod
    async def power_scrapper_2(url: str) -> tuple:
        """Load *url* in headless Chromium and pull out title, links and text.

        Requests for images, media, stylesheets, fonts and XHR are aborted
        so the page loads faster.

        Args:
            url: Address of the page to open.

        Returns:
            A ``(title, links, content)`` tuple where ``title`` is the page
            title, ``links`` is the list of ``href`` properties of every
            ``<a>`` element, and ``content`` is one string made of the
            ``innerText`` of every element under ``<body>`` joined by
            newlines (text of nested elements is therefore repeated).
        """
        blocked_types = ("image", "media", "stylesheet", "font", "xhr")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                # Abort heavy or irrelevant requests, let everything else through.
                await page.route(
                    "**/*",
                    lambda route: route.abort()
                    if route.request.resource_type in blocked_types
                    else route.continue_(),
                )
                await page.goto(url)
                title = await page.title()
                page_url = await page.evaluate("""() => {
                    return Array.from(document.querySelectorAll('a')).map(a => a.href);
                }""")
                page_content = await page.evaluate("""() => {
                    let elements = Array.from(document.querySelectorAll('body *'));
                    return elements.map(element => element.innerText).join('\\n');
                }""")
            finally:
                # Close the browser even when navigation or extraction fails.
                await browser.close()
            return title, page_url, page_content

    @staticmethod
    async def power_scrapper(url: str) -> tuple:
        """Load *url* in headless Chromium and collect raw hrefs and text.

        Only ``document`` and ``script`` requests are allowed; every other
        resource type is aborted. After ``domcontentloaded`` the method waits
        one extra second to give scripts a chance to populate the page.

        Args:
            url: Address of the page to open.

        Returns:
            A ``(links, content)`` tuple: ``links`` holds the ``href``
            attribute of every ``<a>`` element (``None`` when the attribute
            is absent) and ``content`` holds the stripped, non-empty
            ``textContent`` of every element under ``<body>``.
        """
        allowed_types = ("document", "script")
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                # Block unnecessary resources to speed up loading.
                await page.route(
                    "**/*",
                    lambda route: route.continue_()
                    if route.request.resource_type in allowed_types
                    else route.abort(),
                )
                await page.goto(url, wait_until='domcontentloaded')
                # Short grace period so dynamically injected content can appear.
                await page.wait_for_timeout(1000)

                page_url = []
                for link in await page.query_selector_all('a'):
                    page_url.append(await link.get_attribute('href'))

                page_content = []
                for element in await page.query_selector_all('body *'):
                    text_content = await element.text_content()
                    if text_content and text_content.strip():
                        page_content.append(text_content.strip())
            finally:
                # Close the browser even when navigation or extraction fails.
                await browser.close()
            return page_url, page_content

    @staticmethod
    def get_links(soup: "BeautifulSoup") -> list:
        """Return the ``href`` of every ``<a>`` tag in *soup*.

        Anchors without an ``href`` attribute contribute ``None``, so the
        result has exactly one entry per anchor, in document order.
        """
        return [link.get('href') for link in soup.find_all('a')]

    @staticmethod
    def get_text_content(soup: "BeautifulSoup") -> list:
        """Return the text of all paragraph, heading and span tags in *soup*.

        Results are grouped by tag name in the order ``p``, ``h1``..``h6``,
        ``span`` rather than in document order.
        """
        text_elements = []
        for tag in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span'):
            text_elements.extend(
                element.get_text() for element in soup.find_all(tag)
            )
        return text_elements

    @staticmethod
    def get_title(soup: "BeautifulSoup") -> str:
        """Return the text of the ``<title>`` tag, or ``""`` if there is none."""
        title_tag = soup.find('title')
        if title_tag is None:
            # A page without <title> is valid input, not an error.
            return ""
        return title_tag.get_text()

    @staticmethod
    async def scrape(url: str) -> dict:
        """Scrape *url*, preferring a cheap static fetch over a browser.

        The page is first downloaded with ``requests`` and parsed with
        BeautifulSoup. When that produces no links, or raises, the page is
        rendered with ``power_scrapper_2`` instead.

        Args:
            url: Address of the page to scrape.

        Returns:
            A dict with keys ``"title"``, ``"URL"`` (the links) and
            ``"Content"`` (a list of strings from the static path, or a
            single string from the browser path).

        Raises:
            Exception: whatever ``power_scrapper_2`` raises when the browser
                fallback in the ``except`` branch fails as well.
        """
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            # requests is blocking; run it in the default executor so the
            # event loop stays responsive while the download is in flight.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: requests.get(url, headers=headers, timeout=5),
            )
            soup = BeautifulSoup(response.content, 'html.parser')
            title = Scraper.get_title(soup)
            links = Scraper.get_links(soup)
            text_content = Scraper.get_text_content(soup)
            if not links:
                print("Running alternative scrapper")
                # power_scrapper_2 returns three values: title, links, text.
                title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}
        except Exception:
            # Exception (not a bare except) lets task cancellation and
            # KeyboardInterrupt propagate instead of triggering a re-scrape.
            print("Running alternative scrapper second time")
            title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}
|