Create app.py
app.py
ADDED
import streamlit as st
from bs4 import BeautifulSoup
import requests
from groq import Groq
import os
from dotenv import load_dotenv
import json

# Scraping pipeline
class Website:
    """
    A utility class to represent a website that we have scraped.
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]  # links found on the home page
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
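
# Example usage (hypothetical URL; assumes the page is reachable):
#   site = Website("https://example.com")
#   site.title       # the page <title> text
#   site.links[:5]   # the first few hrefs found on the page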

# First, get the relevant links from the home page for broad information about the website

# System prompt for the first call
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"},
        {"type": "contact us", "url": "mailto:[email protected]"}
    ]
}
"""

# Predefined user prompt to extract only the important links from the website
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service or Privacy pages.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
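
# Relative links (e.g. "/about") are passed through as-is; the model is asked to
# respond with full https URLs so that get_all_details below can fetch each page directly.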

# Make the first call to get the important links
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        model="llama3-groq-70b-8192-tool-use-preview",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"})
    result = response.choices[0].message.content
    return json.loads(result)
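
# Note: with response_format={"type": "json_object"}, the prompt itself must
# mention JSON (the system prompt above does); the OpenAI-compatible JSON mode
# that Groq exposes will typically reject the request otherwise.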

# Gather all the content needed for the brochure: the home page plus every relevant linked page
@st.cache_resource
def get_all_details(url):
    result = "Home page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Available links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
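
# Scraping the home page and every linked page costs several HTTP requests plus
# one LLM call, so the result is cached to avoid repeating the work on every
# Streamlit rerun (st.cache_data would also fit, since the result is a plain string).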

system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

# Fall back to the default system prompt when the user has not supplied a custom one
def second_call_system_prompt(system):
    if len(system.strip()) == 0:
        return system_prompt
    else:
        return system

def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown and provide usable links in the contact areas.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:20_000]  # Truncate if more than 20,000 characters
    return user_prompt
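
# The 20,000-character cap is a rough safeguard that keeps the combined page text
# within the model's 8K-token context window (roughly 4 characters per token).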

# Initialize the Groq client
load_dotenv()
api_key = os.getenv('GROQ_API_KEY')
client = Groq(api_key=api_key)
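
# GROQ_API_KEY must be present in the environment: a .env file works for local
# development, a Space secret when deployed. load_dotenv() is a harmless no-op
# when no .env file exists.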

# Streamlit UI
st.title("AI Brochures 🎨📌")
st.write("Create a captivating brochure for your company or institution using only information from your website!")

# Input fields
system = st.text_input("Modify the model response with a custom system prompt if you are not satisfied with the generated response:", " ")
url = st.text_input("Provide the company's website URL:", " ")
user_query = st.text_area("Provide a title for the brochure or the name of the organization")

if user_query:
    # Scrape the website content
    with st.spinner("Scraping website..."):
        try:
            second_user_prompt = get_brochure_user_prompt(user_query, url)
            st.success("Website loaded successfully!")
        except Exception as e:
            st.error(f"Failed to load website: {e}")
            st.stop()  # second_user_prompt is undefined past this point, so bail out

    # Second call: send the scraped content to the Groq API for processing
    st.write("Querying the website...")
    with st.spinner("Processing your query..."):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": second_call_system_prompt(system)},
                    {"role": "user", "content": second_user_prompt}
                ],
                model="llama3-groq-70b-8192-tool-use-preview",
                temperature=0.8,
                max_tokens=2048,
                top_p=0.6,
                stream=False,
            )
        except Exception as e:
            st.error(f"Failed to process query to model: {e}")
            st.stop()

    response = ""
    try:
        response = chat_completion.choices[0].message.content
        st.write("🤖:")
        st.write(response)
    except Exception as e:
        st.error(f"Failed to process query: {e}")
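
# The request above uses stream=False; to stream tokens instead, pass stream=True
# and accumulate chunk.choices[0].delta.content across the returned chunks.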

st.markdown("--------------")
st.write("© 2024 Application")
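
# To run locally (assuming the usual dependencies are installed):
#   pip install streamlit beautifulsoup4 requests groq python-dotenv
#   streamlit run app.py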