# scraper.py
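"""Web scraper: tries a fast requests/BeautifulSoup pass first and falls
back to a Playwright-driven headless browser for JavaScript-heavy pages."""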

import asyncio

import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright


class Scraper:
    @staticmethod
    async def power_scrapper_2(url):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Block images, media, stylesheets, fonts, and XHR requests
            # to speed up page loads
            await page.route(
                "**/*",
                lambda route: route.abort()
                if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"]
                else route.continue_(),
            )

            await page.goto(url)

            # Get the page title
            title = await page.title()

            # Collect the href of every anchor on the page
            page_url = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")

            # Collect the visible text of every element in the body
            page_content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements.map(element => element.innerText).join('\\n');
            }""")

            await browser.close()
            return title, page_url, page_content
    
    @staticmethod
    async def power_scrapper(url):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Block unnecessary resources to speed up loading
            await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())

            # Open the target website
            await page.goto(url, wait_until='domcontentloaded')

            # Wait for a short time to ensure dynamic content is loaded
            await page.wait_for_timeout(1000)

            # Extract the href of every link, skipping anchors without one
            links = await page.query_selector_all('a')
            page_url = []
            page_content = []
            for link in links:
                href = await link.get_attribute('href')
                if href:
                    page_url.append(href)

            # Extract all non-empty text content. Parent elements repeat the
            # text of their children, so fragments may appear more than once.
            elements = await page.query_selector_all('body *')
            for element in elements:
                text_content = await element.text_content()
                if text_content and text_content.strip():
                    page_content.append(text_content.strip())

            await browser.close()
            return page_url, page_content

    @staticmethod
    def get_links(soup):
        # Collect hrefs from all anchors, skipping anchors without one
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                links.append(href)
        return links

    @staticmethod
    def get_text_content(soup):
        text_elements = []
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
            elements = soup.find_all(tag)
            for element in elements:
                text_elements.append(element.get_text())
        return text_elements

    @staticmethod
    def get_title(soup):
        # Guard against pages without a <title> tag
        title_tag = soup.find('title')
        return title_tag.get_text() if title_tag else ""

    @staticmethod
    async def scrape(url):
        try:
            # Note: requests is blocking and will stall the event loop
            # while the page downloads
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            title = Scraper.get_title(soup)
            links = Scraper.get_links(soup)
            text_content = Scraper.get_text_content(soup)

            # Pages that render their links with JavaScript come back empty
            # from requests, so fall back to the headless browser
            if not links:
                print("Running alternative scraper")
                title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}
        except Exception:
            print("Running alternative scraper a second time")
            title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}