Koomemartin committed on
Commit
2b205af
·
verified ·
1 Parent(s): 7c57a39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -8
app.py CHANGED
@@ -4,19 +4,83 @@ import requests
4
  from groq import Groq
5
  import os
6
  from dotenv import load_dotenv
 
 
7
  class Website:
 
 
 
8
 
9
  def __init__(self, url):
10
- """
11
- Create this Website object from the given url using the BeautifulSoup library
12
- """
13
  self.url = url
14
  response = requests.get(url)
15
- soup = BeautifulSoup(response.content, 'html.parser')
 
16
  self.title = soup.title.string if soup.title else "No title found"
17
- for irrelevant in soup.body(["script", "style", "img", "input"]):
18
- irrelevant.decompose()
19
- self.text = soup.body.get_text(separator="\n", strip=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Initialize Groq client
22
  # load_dotenv()
@@ -32,9 +96,11 @@ url = st.text_input("Website URL:", " " )
32
  user_query = st.text_area("What would you like to know about this website")
33
 
34
  if user_query:
 
 
35
  # Scrape website content
36
  with st.spinner("Scraping website..."):
37
- website = Website(url)
38
 
39
  if "Error" in website.title:
40
  st.error("Failed to load the website. Please check the URL.")
 
4
  from groq import Groq
5
  import os
6
  from dotenv import load_dotenv
7
+
8
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes:
        url: The address that was requested.
        body: Raw response bytes returned for ``url``.
        title: Text of the page's ``<title>``, or ``"No title found"``.
        text: Text of ``<body>`` with script/style/img/input elements removed,
            or ``""`` when the page has no ``<body>``.
        links: Every non-empty ``href`` found on ``<a>`` tags (may be relative).
    """

    def __init__(self, url, timeout=10):
        """
        Download ``url`` and parse it with BeautifulSoup.

        Args:
            url: Page to fetch.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.
                Without one a stalled server can block the caller forever;
                pass ``None`` to restore the unbounded wait.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # ``soup.title.string`` is None for an empty <title> or one with nested
        # tags, so route it through the fallback instead of storing None.
        self.title = self._title_or_default(soup.title.string if soup.title else None)
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Links found on the page; anchors without an href are dropped.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    @staticmethod
    def _title_or_default(raw_title):
        """Return ``raw_title`` as a plain ``str``, or the placeholder when it is missing/empty."""
        return str(raw_title) if raw_title else "No title found"

    def get_contents(self):
        """Return the title and body text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
31
+
32
+
33
# First, let's get the relevant links from the home page, for broad information
# about the website provided.
#
# System prompt for the link-selection request. The example at the end must be
# valid JSON, because it is the format the model is told to copy.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON as in this example: \n"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
47
+
48
def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick the relevant links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (list of str).

    Returns:
        The instruction text followed by one link per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links about the company, respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy\n"
    )
    heading = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return intro + instructions + heading + link_lines
55
+
56
+
57
def get_links(url):
    """
    Ask the model which links on ``url`` are worth reading.

    Scrapes ``url``, sends its links to the chat-completions endpoint with
    ``link_system_prompt``, and parses the reply as JSON.

    Args:
        url: Page whose links should be classified.

    Returns:
        The parsed reply as a dict that always has a ``"links"`` list. When
        the reply is not a dict, or its ``"links"`` is not a list,
        ``{"links": []}`` is returned so callers can iterate safely.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Local import keeps this function usable even if the module header does
    # not import json (the visible part of the file does not show it).
    import json

    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        model="llama3-groq-70b-8192-tool-use-preview",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = json.loads(response.choices[0].message.content)
    # The model may answer with valid JSON of a different shape than the example.
    if not isinstance(result, dict) or not isinstance(result.get("links"), list):
        return {"links": []}
    return result
72
+
73
+
74
def get_all_details(url):
    """
    Collect the text of the home page plus every link the model judged relevant.

    Args:
        url: Home page of the site.

    Returns:
        A plain ``str``: the home-page contents followed by one section per
        fetched link. It is not a ``Website``, so it has no scraped
        ``.title``/``.text`` attributes (``str.title`` is the built-in method).

    Notes:
        Links are resolved against ``url`` because the page may list relative
        hrefs. Entries without a ``"url"`` and links that cannot be fetched
        are skipped rather than aborting the whole run. Errors fetching the
        home page itself still propagate.
    """
    from urllib.parse import urljoin

    sections = ["Home page:\n", Website(url).get_contents()]
    links = get_links(url)
    print("Available links:", links)

    entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(entries, list):
        entries = []

    for link in entries:
        href = link.get("url") if isinstance(link, dict) else None
        if not href:
            continue
        target = urljoin(url, href)
        try:
            page = Website(target)
        except requests.RequestException as exc:
            # Covers unreachable hosts and non-HTTP schemes such as mailto:.
            print(f"Skipping {target}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'linked page')}\n")
        sections.append(page.get_contents())
    return "".join(sections)
83
+
84
 
85
  # Initialize Groq client
86
  # load_dotenv()
 
96
  user_query = st.text_area("What would you like to know about this website")
97
 
98
  if user_query:
99
+
100
+
101
  # Scrape website content
102
  with st.spinner("Scraping website..."):
103
+ website = get_all_details(url)
104
 
105
  if "Error" in website.title:
106
  st.error("Failed to load the website. Please check the URL.")