import streamlit as st
from bs4 import BeautifulSoup
import requests
from groq import Groq
import os
from dotenv import load_dotenv
import json
# scraping pipeline
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """
    def __init__(self, url):
        self.url = url
        response = requests.get(url)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]  # links found on the home page
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
# first, get relevant links from the home page for broad information about the provided website
# system prompt of the first call
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to the website, \
such as links to an About page, or a Company page, or Careers/Jobs pages. Limit the extracted links to the seven most important.\n"
link_system_prompt += "You should respond in JSON as in this example:\n"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
# predefined user prompt to extract only the important links about the website
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links to the website, \
respond with the full https URL in JSON format. Do not include Terms of Service or Privacy pages.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
load_dotenv()  # load GROQ_API_KEY from a local .env file
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# make the first call to get the important links
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        model="llama-3.3-70b-specdec",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    return json.loads(result)
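# A defensive variant (a sketch, not in the original flow): JSON mode should
# guarantee parseable output, but a guard against a malformed reply is cheap:
#
#   try:
#       parsed = json.loads(result)
#   except json.JSONDecodeError:
#       parsed = {"links": []}  # fall back to scraping only the home page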
# gather all the content needed to answer user questions about the website
@st.cache_resource
def get_all_details(url):
    result = "Home page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Available links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
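# Note on caching: st.cache_resource keys on the url argument, so re-running
# the script with the same URL skips re-scraping. Since the return value is a
# plain string, st.cache_data would arguably be the more idiomatic choice here.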
# Streamlit UI
st.title("Welcome to WebBot🌍")
st.write("Enter a website URL and ask questions about its content!")
# Input fields
url = st.text_input("Website URL:", "")
user_query = st.text_area("What would you like to know about this website?")
if url.strip() and user_query:
    # Scrape website content
    with st.spinner("Scraping website..."):
        try:
            website = get_all_details(url)
            st.success("Website loaded successfully!")
        except Exception as e:
            st.error(f"Failed to load website: {e}")
            st.stop()  # don't query the model without scraped content
    # second call: send the scraped content and the user query to the Groq API
    st.write("Querying the website...")
    with st.spinner("Processing your query..."):
        try:
            chat_streaming = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are a helpful assistant specializing in extracting and analyzing website content. Provide the information the user asks for based on the website content provided. Ensure responses are clear, concise, and formatted in Markdown for better readability. Use your knowledge to add relevant information to the user's query."},
                    {"role": "user", "content": f"Here's the content to use:\n{website}\nNow respond appropriately to the query: {user_query}\n"},
                ],
                model="llama-3.3-70b-specdec",
                temperature=0.8,
                max_tokens=2042,
                top_p=0.6,
                stream=False,
            )
        except Exception as e:
            st.error(f"Failed to process query to model: {e}")
            st.stop()
response = ""
try:
# for chunk in chat_streaming:
# content = chunk.choices[0].delta.content
# if content: # Ensure content is not None
response=chat_streaming.choices[0].message.content
# response += content
st.write("🤖:")
st.write(response)
except Exception as e:
st.error(f"Failed to process query: {e}")
st.markdown("-----")
st.write("© 2024 Application")
st.warning("Disclaimer: This application currently does not support Javascript websites!!")