Update app.py
app.py CHANGED
@@ -6,7 +6,7 @@ import os
 from dotenv import load_dotenv
 import json
 
-
+# scraping pipeline
 class Website:
     """
     A utility class to represent a Website that we have scraped, now with links
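Review note: the hunk above only swaps a blank line for the `# scraping pipeline` comment; the `Website` body itself is unchanged and collapsed out of the diff. For readers without the full file, a class like this is typically a thin requests + BeautifulSoup wrapper. The sketch below is an assumption about its shape, not the actual code.

```python
# Hypothetical sketch of what the Website class likely wraps; the real
# implementation is outside this diff, so every detail here is assumed.
import requests
from bs4 import BeautifulSoup

class Website:
    def __init__(self, url):
        self.url = url
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.content, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        # keep every href so the first LLM call can pick the relevant ones
        self.links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
        self.text = soup.get_text(separator="\n", strip=True)
```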
@@ -33,9 +33,10 @@ class Website:
 
 # first lets get relevant links from the home page for a broad information about the website provided
 
+# system prompt of the first call
 link_system_prompt = "You are provided with a list of links found on a webpage. \
-You are able to decide which of the links would be most relevant to the
-such as links to an About page, or a Company page, or Careers/Jobs pages
+You are able to decide which of the links would be most relevant to the website, \
+such as links to an About page, or a Company page, or Careers/Jobs pages. Kindly choose the top seven links that look to provide more information about the website\n"
 link_system_prompt += "You should respond in JSON as in this example: \n"
 link_system_prompt += """
 {
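Review note: the concrete JSON example sits between the braces and is collapsed out of this hunk. Going by the prompt text, the parsed response presumably looks something like the structure below; the field names are illustrative guesses, not taken from the file.

```python
# Illustrative guess at the structure the system prompt asks the model for;
# the real example string is elided from the hunk above.
expected_links = {
    "links": [
        {"type": "about page", "url": "https://example.com/about"},
        {"type": "careers page", "url": "https://example.com/careers"},
    ]
}
```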
@@ -46,15 +47,16 @@ link_system_prompt += """
 }
 """
 
+#pre defined user prompt to extract only important links in about the website
 def get_links_user_prompt(website):
     user_prompt = f"Here is the list of links on the website of {website.url} - "
-    user_prompt += "please decide which of these are relevant web links
-    Do not include Terms of Service, Privacy\n"
+    user_prompt += "please decide which of these are relevant web links to the website, respond with the full https URL in JSON format. \
+    Do not include Terms of Service, Privacy\n"
     user_prompt += "Links (some might be relative links):\n"
     user_prompt += "\n".join(website.links)
     return user_prompt
 
-
+# make the first call to get the important links
 def get_links(url):
     website = Website(url)
     response = client.chat.completions.create(
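Review note: the arguments to this first `create(...)` call are collapsed out of the hunk, and the next hunk shows the raw completion going straight into `json.loads`, so any conversational preamble from the model raises `json.JSONDecodeError`. If the Groq endpoint in use supports OpenAI-style JSON mode, a more defensive variant could look like this sketch, reusing the file's `client`, `link_system_prompt`, and `get_links_user_prompt`:

```python
# Sketch of a defensive version of the first call; assumes the endpoint
# honours OpenAI-style JSON mode (response_format) -- unverified here.
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        model="llama3-groq-70b-8192-tool-use-preview",
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        response_format={"type": "json_object"},  # push the model toward strict JSON
    )
    try:
        return json.loads(response.choices[0].message.content)
    except json.JSONDecodeError:
        return {"links": []}  # degrade gracefully instead of crashing the app
```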
@@ -71,6 +73,7 @@ def get_links(url):
     result = response.choices[0].message.content
     return json.loads(result)
 
+#all the content required to generate information from user about the website
 @st.cache_resource
 def get_all_details(url):
     result = "Home page:\n"
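Review note on the decorator next to the new comment: `@st.cache_resource` is Streamlit's cache for unserializable shared handles such as DB connections or models. For a function that returns a plain string keyed by `url`, `@st.cache_data` is the usual fit. A sketch, with `get_contents` standing in for however the file actually assembles page text:

```python
import streamlit as st

# st.cache_data hashes the url argument and memoizes the returned string;
# st.cache_resource would share one unserialized object across sessions.
@st.cache_data(show_spinner=False)
def get_all_details(url: str) -> str:
    result = "Home page:\n"
    result += Website(url).get_contents()  # hypothetical helper; body elided in the diff
    return result
```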
@@ -97,8 +100,6 @@ url = st.text_input("Website URL:", " " )
 user_query = st.text_area("What would you like to know about this website")
 
 if user_query:
-
-
     # Scrape website content
     with st.spinner("Scraping website..."):
 
@@ -108,15 +109,14 @@ if user_query:
     except Exception as e:
         st.error(f"Failed to load website: {e}")
 
-
-    # Call Groq API for processing
+    # Second to Call Groq API for processing
     st.write("Querying the website...")
     with st.spinner("Processing your query..."):
         try:
             chat_streaming = client.chat.completions.create(
                 messages=[
-                    {"role": "system", "content": "You are a helpful assistant specializing in extracting and analyzing website content.
-                    {"role": "user", "content": f"
+                    {"role": "system", "content": "You are a helpful assistant specializing in extracting and analyzing website content. Provide information required by the user based on the website information provided. Ensure responses are clear, concise, and formatted in Markdown for better readability. use your knowledge to add relevant inforation to the users query"},
+                    {"role": "user", "content": f"Here's the content to use:\n{website} \n Know respond appropriately: {user_query}"}
                 ],
                 model="llama3-groq-70b-8192-tool-use-preview",
                 temperature=0.9,
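Two review flags on this last hunk: the new user message interpolates `{website}`, which renders the `Website` object's default repr rather than the text assembled by `get_all_details`, and "Know respond" reads like a typo for "Now respond" (likewise "inforation" in the system message). The variable is also named `chat_streaming` although no `stream=True` is passed. A possible cleanup, assuming the scraped text was what was intended:

```python
# Possible cleanup; assumes the intent was to feed get_all_details(url)
# to the model rather than the Website object's repr.
details = get_all_details(url)
chat_streaming = client.chat.completions.create(
    messages=[
        {"role": "system", "content": (
            "You are a helpful assistant specializing in extracting and analyzing "
            "website content. Answer from the website information provided, keep "
            "responses clear and concise, and format them in Markdown."
        )},
        {"role": "user", "content": f"Here's the content to use:\n{details}\n\nNow respond appropriately: {user_query}"},
    ],
    model="llama3-groq-70b-8192-tool-use-preview",
    temperature=0.9,
    stream=True,  # matches the chat_streaming name; drop if chunks are not consumed
)
```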