Spaces:

Koomemartin
/

WebBot

Sleeping

App Files Files Community

Koomemartin committed on Nov 22, 2024

Commit

2b205af

verified ·

1 Parent(s): 7c57a39

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -8

app.py CHANGED Viewed

@@ -4,19 +4,83 @@ import requests
 from groq import Groq
 import os
 from dotenv import load_dotenv
 class Website:
     def __init__(self, url):
-        """
-        Create this Website object from the given url using the BeautifulSoup library
-        """
         self.url = url
         response = requests.get(url)
-        soup = BeautifulSoup(response.content, 'html.parser')
         self.title = soup.title.string if soup.title else "No title found"
-        for irrelevant in soup.body(["script", "style", "img", "input"]):
-            irrelevant.decompose()
-        self.text = soup.body.get_text(separator="\n", strip=True)
 # Initialize Groq client
 # load_dotenv()
@@ -32,9 +96,11 @@ url = st.text_input("Website URL:", " " )
 user_query = st.text_area("What would you like to know about this website")
 if user_query:
     # Scrape website content
     with st.spinner("Scraping website..."):
-        website = Website(url)
     if "Error" in website.title:
         st.error("Failed to load the website. Please check the URL.")

 from groq import Groq
 import os
 from dotenv import load_dotenv
 class Website:
+    """
+    A utility class to represent a Website that we have scraped, now with links
+    """
     def __init__(self, url):
         self.url = url
         response = requests.get(url)
+        self.body = response.content
+        soup = BeautifulSoup(self.body, 'html.parser')
         self.title = soup.title.string if soup.title else "No title found"
+        if soup.body:
+            for irrelevant in soup.body(["script", "style", "img", "input"]):
+                irrelevant.decompose()
+            self.text = soup.body.get_text(separator="\n", strip=True)
+        else:
+            self.text = ""
+        links = [link.get('href') for link in soup.find_all('a')]  # links found in home page
+        self.links = [link for link in links if link]
+    def get_contents(self):
+        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
# Step one: ask the model which of the links found on the home page are worth
# following to get a broad picture of the company behind the website.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example: \n"
)
# The example below must itself be valid JSON (a comma, not a colon, separates
# the "type" and "url" members); the model is asked to imitate it.
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
def get_links_user_prompt(website):
    """
    Build the user message that asks the model to pick the relevant links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links  about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy\n"
        "Links (some might be relative links):\n"
    )
    link_lines = "\n".join(website.links)
    return instructions + link_lines
def _normalize_links(payload, base_url):
    """Reduce a decoded model reply to {"links": [...]} with absolute URLs."""
    from urllib.parse import urljoin

    entries = payload.get("links") if isinstance(payload, dict) else None
    if not isinstance(entries, list):
        return {"links": []}

    cleaned = []
    for entry in entries:
        # Keep only entries that carry a usable URL.
        if not isinstance(entry, dict):
            continue
        target = entry.get("url")
        if not isinstance(target, str) or not target:
            continue
        cleaned.append({
            **entry,
            "type": entry.get("type", "link"),
            # The page's links may be relative; resolve them so that they can
            # be fetched directly.
            "url": urljoin(base_url, target),
        })
    return {"links": cleaned}


def get_links(url):
    """
    Ask the model which links on the page at `url` are relevant to the company.

    Scrapes the page, sends its links to the chat-completions endpoint with
    JSON output requested, and decodes the reply.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        A dict of the form {"links": [{"type": ..., "url": ...}, ...]}. The
        list is empty when the reply cannot be decoded or lacks a "links"
        list; entries without a URL are dropped and relative URLs are resolved
        against `url`.
    """
    # Local import so this function does not depend on a module-level
    # `import json` that is not visible alongside it.
    import json

    website = Website(url)
    # `client` is expected to be the module-level Groq client.
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        model="llama3-groq-70b-8192-tool-use-preview",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    raw_reply = response.choices[0].message.content
    try:
        payload = json.loads(raw_reply)
    except (TypeError, ValueError):
        # Missing content (None) or text that is not valid JSON.
        return {"links": []}
    return _normalize_links(payload, website.url)
def get_all_details(url):
    """
    Collect the text of the home page and of each relevant linked page.

    Args:
        url: address of the site's home page.

    Returns:
        One string: a "Home page:" section followed by one section per link
        returned by `get_links`, each headed by the link's type. Links that
        are malformed or cannot be fetched are skipped rather than aborting
        the whole scrape.

    Raises:
        requests.exceptions.RequestException: if the home page itself cannot
            be fetched.
    """
    from urllib.parse import urljoin

    sections = ["Home page:\n" + Website(url).get_contents()]

    links = get_links(url)
    print("Available links:", links)

    # The reply comes from a model, so do not assume it has the expected shape.
    entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(entries, list):
        entries = []

    for link in entries:
        if not isinstance(link, dict):
            continue
        target = link.get("url")
        if not isinstance(target, str) or not target:
            continue
        try:
            # Resolve against the home page in case the model echoed a
            # relative link.
            page = Website(urljoin(url, target))
        except requests.RequestException as exc:
            # One unreachable page should not discard everything else.
            print(f"Skipping {target}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'link')}\n" + page.get_contents())

    return "".join(sections)
 # Initialize Groq client
 # load_dotenv()
 user_query = st.text_area("What would you like to know about this website")
 if user_query:
     # Scrape website content
     with st.spinner("Scraping website..."):
+        website = get_all_details(url)
     if "Error" in website.title:
         st.error("Failed to load the website. Please check the URL.")