Koomemartin committed on
Commit
8eb6cce
·
verified ·
1 Parent(s): da57c92

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -35
app.py CHANGED
@@ -3,47 +3,19 @@ from bs4 import BeautifulSoup
3
  import requests
4
  from groq import Groq
5
 
6
- # Define the Website class added
7
- # Parse webpages that rely heavily on JavaScript
8
- # Download the ChromeDriver build that matches your installed version of Chrome from https://developer.chrome.com/docs/chromedriver/downloads
9
- from selenium import webdriver
10
- from selenium.webdriver.chrome.service import Service
11
- from selenium.webdriver.common.by import By
12
- from selenium.webdriver.chrome.options import Options
13
- import os
14
-
15
- PATH_TO_CHROME_DRIVER = os.path.join(os.getcwd(),'chromedriver')
16
-
17
- os.chmod(PATH_TO_CHROME_DRIVER, 0o755)
18
-
19
- log_file= os.path.join(os.getcwd(),'logs')
20
-
21
  class Website:
22
- url: str
23
- title: str
24
- text: str
25
 
26
  def __init__(self, url):
 
 
 
27
  self.url = url
28
-
29
- options = Options()
30
-
31
- options.add_argument("--no-sandbox")
32
- options.add_argument("--disable-dev-shm-usage")
33
-
34
- service = Service(executable_path=PATH_TO_CHROME_DRIVER , log_output=log_file)
35
- driver = webdriver.Chrome(service=service)
36
- driver.get(url)
37
-
38
- # input("Please complete the verification in the browser and press Enter to continue...")
39
- page_source = driver.page_source
40
- driver.quit()
41
-
42
- soup = BeautifulSoup(page_source, 'html.parser')
43
  self.title = soup.title.string if soup.title else "No title found"
44
- for irrelevant in soup(["script", "style", "img", "input"]):
45
  irrelevant.decompose()
46
- self.text = soup.get_text(separator="\n", strip=True)
47
 
48
  # Initialize Groq client
49
  api_key = "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"
 
3
  import requests
4
  from groq import Groq
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  class Website:
 
 
 
7
 
8
  def __init__(self, url):
9
+ """
10
+ Create this Website object from the given url using the BeautifulSoup library
11
+ """
12
  self.url = url
13
+ response = requests.get(url)
14
+ soup = BeautifulSoup(response.content, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  self.title = soup.title.string if soup.title else "No title found"
16
+ for irrelevant in soup.body(["script", "style", "img", "input"]):
17
  irrelevant.decompose()
18
+ self.text = soup.body.get_text(separator="\n", strip=True)
19
 
20
  # Initialize Groq client
21
  api_key = "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"