import streamlit as st
from bs4 import BeautifulSoup
import requests
from groq import Groq
from dotenv import load_dotenv
import os
import json
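# App flow: scrape the landing page, ask the model to pick brochure-relevant
# links (first LLM call), scrape those pages too, then ask the model to write
# a markdown brochure from the combined text (second LLM call).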
# Scraping pipeline
class Website:
    """
    A utility class to represent a website that we have scraped.
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, timeout=15)  # timeout so a slow site can't hang the app
        self.body = response.content
        soup = BeautifulSoup(self.body, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            # Drop elements that carry no useful text for the brochure.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get("href") for link in soup.find_all("a")]  # links found on the home page
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
# First, get the relevant links from the home page for broad information about the website.
# System prompt for the first call:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "Avoid selecting email (mailto:) links.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
# Predefined user prompt to extract only the important links about the website.
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company; respond with the full https URL in JSON format. \
Do not include Terms of Service or Privacy links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
# Load the API key and initialize the Groq client once, before any calls are made.
load_dotenv()
client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
)
# Make the first call to get the important links.
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        model="llama-3.3-70b-specdec",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    return json.loads(result)
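# Illustrative shape of the parsed result:
#   {"links": [{"type": "about page", "url": "https://example.com/about"}, ...]}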
# All the content required to build the brochure from the user's website.
@st.cache_data  # reuse cached scrape results instead of scraping the website again
def get_all_details(url):
    result = "Home page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Available links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
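# Streamlit reruns the whole script on every widget interaction, so the cache
# above is what keeps repeated queries from re-scraping the same URL.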
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."
def second_call_sytem_prompt(system=None):
if system:
return system
else:
return system_prompt
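# A non-empty `system` argument overrides the default brochure prompt; this is
# how the optional custom system prompt from the UI below takes effect.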
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown, and provide usable links in the contact areas.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:30_000]  # truncate if more than 30,000 characters
    return user_prompt
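# The 30,000-character cap is a rough guard against overflowing the model's
# context window (very roughly 7-8k tokens of English text).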
# Streamlit UI
st.title("AI Brochures 🎨📌")
st.write("Create a captivating brochure for your company or institution using only information from your website!")

# Input fields
system = st.text_input("Optionally modify the model's response with a custom system prompt:", "")
url = st.text_input("Provide the company's website URL:", "")
user_query = st.text_area("Provide a title for the brochure or the name of the organization")

if user_query:
    # Scrape the website content
    with st.spinner("Scraping website..."):
        try:
            second_user_prompt = get_brochure_user_prompt(user_query, url)
            st.success("Website loaded successfully!")
        except Exception as e:
            st.error(f"Failed to load website: {e}")
            st.stop()

    # Second call: query the Groq API to generate the brochure
    st.write("Querying the website...")
    with st.spinner("Processing your query..."):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": second_call_system_prompt(system.strip() or None)},
                    {"role": "user", "content": second_user_prompt},
                ],
                model="llama3-groq-70b-8192-tool-use-preview",
                temperature=0.8,
                max_tokens=2042,
                top_p=0.6,
                stream=False,
            )
        except Exception as e:
            st.error(f"Failed to process query to model: {e}")
            st.stop()

    response = chat_completion.choices[0].message.content
    st.write("🤖:")
    st.write(response)

st.markdown("--------------")
st.write("© 2024 Application")