import streamlit as st
from bs4 import BeautifulSoup
import requests
from groq import Groq
import os
from dotenv import load_dotenv
import json

# Load environment variables (GROQ_API_KEY) from a local .env file
load_dotenv()

# scraping pipeline
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        # Some sites block the default requests user agent, so send a browser-like one
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]  # hrefs found on the page
        self.links = [link for link in links if link]  # drop anchors without an href

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


# First, get the relevant links from the home page for a broad picture of the website

# System prompt for the first call
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to the website, \
such as links to an About page, or a Company page, or Careers/Jobs pages. Limit the extracted links to the seven most important.\n"
link_system_prompt += "You should respond in JSON as in this example: \n"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

# Predefined user prompt asking the model to keep only the important links about the website
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += ("please decide which of these are relevant web links to the website, "
                    "and respond with the full https URL in JSON format. "
                    "Do not include Terms of Service or Privacy links.\n")
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
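
# A sketch of the rendered prompt tail for a site whose anchors include
# "/about" and "https://example.com/careers" (URLs are illustrative):
#
#   Links (some might be relative links):
#   /about
#   https://example.com/careers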

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Make the first call: ask the model to pick out the important links
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        model="llama-3.3-70b-specdec",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    return json.loads(result)
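
# A sketch of the parsed dict get_links returns, assuming the model follows the
# JSON schema in the system prompt (URLs are illustrative):
#
#   {"links": [{"type": "about page", "url": "https://example.com/about"},
#              {"type": "careers page", "url": "https://example.com/careers"}]}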

# Gather all the content needed to answer user questions about the website
@st.cache_data  # cache the scraped text per URL so reruns don't re-fetch the site
def get_all_details(url):
    result = "Home page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Available links:", links)
    for link in links["links"]:
        try:
            result += f"\n\n{link['type']}\n"
            result += Website(link["url"]).get_contents()
        except Exception as e:
            print(f"Skipping {link.get('url')}: {e}")  # one bad link shouldn't abort the whole scrape
    return result
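
# get_all_details returns one long string, roughly shaped like this
# (contents are illustrative):
#
#   Home page:
#   Webpage Title:
#   Example Domain
#   Webpage Contents:
#   ...
#
#   about page
#   Webpage Title:
#   ...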



# Streamlit UI
st.title("Welcome to WebBot🌍")
st.write("Enter a website URL and ask questions about its content!")

# Input fields
url = st.text_input("Website URL:", "")
user_query = st.text_area("What would you like to know about this website?")

if url.strip() and user_query:
    # Scrape website content
    with st.spinner("Scraping website..."):
        try:
            website = get_all_details(url)
            st.success("Website loaded successfully!")
        except Exception as e:
            st.error(f"Failed to load website: {e}")
            st.stop()  # nothing to query if scraping failed
        
        # Second call: send the scraped content and the user's query to the Groq API
        st.write("Querying the website...")
        with st.spinner("Processing your query..."):
            try:
                chat_response = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant specializing in extracting and analyzing website content. Provide the information the user asks for based on the website content supplied. Ensure responses are clear, concise, and formatted in Markdown for better readability. Use your knowledge to add relevant information to the user's query."},
                        {"role": "user", "content": f"Here's the content to use:\n{website}\nNow respond appropriately to the query: {user_query}\n"}
                    ],
                    model="llama-3.3-70b-specdec",
                    temperature=0.8,
                    max_tokens=2042,
                    top_p=0.6,
                    stream=False,
                )
            except Exception as e:
                st.error(f"Failed to process query to model: {e}")
                st.stop()  # no completion to display if the call failed

            # stream=False returns a single completion object, so the full
            # reply is read directly from the first choice
            try:
                response = chat_response.choices[0].message.content
                st.write("🤖:")
                st.write(response)
            except Exception as e:
                st.error(f"Failed to process query: {e}")



st.markdown("-----")
st.write("© 2024 Application")
st.warning("Disclaimer: This application currently does not support JavaScript-rendered websites!")
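
# To run the app locally (assuming this file is saved as app.py and
# GROQ_API_KEY is set in a .env file alongside it):
#   streamlit run app.py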