Koomemartin committed on
Commit
8dba809
·
verified ·
1 Parent(s): d308e6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -10
app.py CHANGED
@@ -4,19 +4,41 @@ import requests
4
  from groq import Groq
5
 
6
  # Define the Website class added
 
 
 
 
 
 
 
 
 
class Website:
    """Fetch a web page over HTTP and expose its title and visible text.

    Construction never raises: on any failure ``title`` is set to
    "Error loading page" and ``text`` holds the error message.

    Attributes:
        url: The address that was requested.
        title: Text of the page's <title>, "No title found" when the page
            has no usable title, or "Error loading page" on failure.
        text: Newline-separated text of the page with script, style, img
            and input elements removed, or the error message on failure.
    """

    def __init__(self, url):
        self.url = url
        try:
            # Without a timeout requests waits indefinitely on a stalled
            # server; a timeout error is reported through the handler below.
            response = requests.get(url, timeout=30)
            soup = BeautifulSoup(response.content, 'html.parser')

            # <title> may be missing, or its .string may be None (empty or
            # nested markup); both cases get the placeholder.
            title_text = soup.title.string if soup.title else None
            self.title = title_text if title_text else "No title found"

            # html.parser does not synthesize a <body>, so fall back to the
            # whole document instead of failing on body-less pages.
            root = soup.body if soup.body is not None else soup
            for irrelevant in root(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = root.get_text(separator="\n", strip=True)
        except Exception as e:
            # Deliberate best-effort boundary: callers always receive a
            # populated object rather than an exception.
            self.title = "Error loading page"
            self.text = str(e)
 
 
 
 
 
 
 
 
 
20
 
# Initialize Groq client
# SECURITY: the API key must not be committed to source control. It is read
# from the GROQ_API_KEY environment variable instead; any key that was
# previously committed should be treated as leaked and revoked.
import os

api_key = os.environ.get("GROQ_API_KEY")
 
4
  from groq import Groq
5
 
6
  # Define the Website class added
7
+ # Parse web pages that rely heavily on JavaScript
8
+ # Download the ChromeDriver build that matches your installed Chrome version from https://developer.chrome.com/docs/chromedriver/downloads
9
+ from selenium import webdriver
10
+ from selenium.webdriver.chrome.service import Service
11
+ from selenium.webdriver.common.by import By
12
+ from selenium.webdriver.chrome.options import Options
13
+
import os

# Path of the ChromeDriver executable passed to Selenium's Service.
# Defaults to 'chromedriver.exe' in the working directory; set the
# CHROMEDRIVER_PATH environment variable to point at a different binary
# (for example on a non-Windows host).
PATH_TO_CHROME_DRIVER = os.environ.get("CHROMEDRIVER_PATH", 'chromedriver.exe')
15
+
class Website:
    """Render a page in Chrome via Selenium and expose its title and text.

    A real browser is used so that content produced by JavaScript is present
    in the HTML that gets parsed.

    Attributes:
        url: The address that was loaded.
        title: Text of the page's <title>, or "No title found" when the page
            has no usable title.
        text: Newline-separated text of the rendered page with script, style,
            img and input elements removed.

    Raises:
        Whatever Selenium raises when the driver cannot be started or the
        page cannot be loaded; nothing is caught here.
    """

    url: str
    title: str
    text: str

    def __init__(self, url):
        self.url = url

        page_source = self._render_page(url)
        soup = BeautifulSoup(page_source, 'html.parser')

        # <title> may be missing, or its .string may be None (empty or nested
        # markup); both cases get the placeholder so title is always a str.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        for irrelevant in soup(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = soup.get_text(separator="\n", strip=True)

    @staticmethod
    def _render_page(url):
        """Load *url* in Chrome and return the rendered page source."""
        options = Options()
        # NOTE(review): these flags are typically needed when Chrome runs
        # inside a container - confirm against the deployment environment.
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        service = Service(PATH_TO_CHROME_DRIVER)
        # The options must be handed to the driver; otherwise the arguments
        # configured above have no effect.
        driver = webdriver.Chrome(service=service, options=options)
        try:
            driver.get(url)
            # NOTE: if a site shows a manual verification step, pause here
            # (e.g. with input()) before reading the page source.
            return driver.page_source
        finally:
            # Always shut the browser down, even when loading fails, so no
            # Chrome/chromedriver process is left behind.
            driver.quit()
42
 
# Initialize Groq client
# SECURITY: the API key must not be committed to source control. It is read
# from the GROQ_API_KEY environment variable instead; any key that was
# previously committed should be treated as leaked and revoked.
import os

api_key = os.environ.get("GROQ_API_KEY")