import streamlit as st
from bs4 import BeautifulSoup
import requests
from groq import Groq
from dotenv import load_dotenv
import os
import json

# Scraping pipeline
class Website:
    """
    A utility class to represent a Website that we have scraped.
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, timeout=10)  # timeout so a hung request can't stall the app
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            # Drop elements that carry no useful text for the brochure
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]  # links found on the home page
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# First, pick out the relevant links from the home page to get broad information about the website.
# System prompt for the first call
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "Kindly avoid selecting email links such as: \n mailto:[email protected] \n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# Predefined user prompt to extract only the links that matter for the brochure
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant links for a brochure about the company; respond with the full https URL in JSON format. \
Do not include Terms of Service or Privacy links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

# Load environment variables before reading the API key, then initialize the Groq client.
# Expects GROQ_API_KEY in a .env file or the environment.
load_dotenv()
client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
)

# First call: ask the model which of the scraped links are important
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        model="llama-3.3-70b-specdec",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    return json.loads(result)
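
# The parsed result should match the example in link_system_prompt, e.g.:
#   {"links": [{"type": "about page", "url": "https://example.com/about"}, ...]}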

# Gather all the content required for the brochure: the home page plus every page the
# model marked as relevant. st.cache_data caches the result per URL so the second model
# call can reuse the scraped data rather than scraping the website again.
@st.cache_data(show_spinner=False)
def get_all_details(url):
    result = "Home page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Available links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
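
# Each uncached call performs one scrape of the home page, one model call to select
# links, and then one additional scrape per selected link.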

system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."


def second_call_system_prompt(system=None):
    # Use the caller-supplied system prompt if one was given, otherwise fall back to the default
    return system if system else system_prompt
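
# Example: second_call_system_prompt("Respond in French.") overrides the default brochure prompt.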

def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; \
use this information to build a short brochure of the company in markdown and provide usable links in the contacts areas.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:30_000]  # truncate if more than 30,000 characters
    return user_prompt
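
# Note: 30,000 characters is very roughly 7-8k tokens (an estimate), which should fit
# within the 8192-token context window of the model used for the second call below.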

# Streamlit UI
st.title("AI Brochures")
st.write("Create a captivating brochure for your company or institution using only information from your website!")

# Input fields
system = st.text_input("Optionally provide a custom system prompt if you are not satisfied with the generated response:", "")
url = st.text_input("Provide the company's website URL:", "")
user_query = st.text_area("Provide a title for the brochure or the name of the organization")

if user_query:
    # Scrape the website content
    with st.spinner("Scraping website..."):
        try:
            second_user_prompt = get_brochure_user_prompt(user_query, url)
            st.success("Website loaded successfully!")
        except Exception as e:
            st.error(f"Failed to load website: {e}")
            st.stop()  # don't continue to the model call without scraped content

    # Second call: send the scraped content to the Groq API to generate the brochure
    st.write("Generating the brochure...")
    with st.spinner("Processing your query..."):
        try:
            chat_streaming = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": second_call_system_prompt(system.strip() or None)},
                    {"role": "user", "content": second_user_prompt}
                ],
                model="llama3-groq-70b-8192-tool-use-preview",
                temperature=0.8,
                max_tokens=2042,
                top_p=0.6,
                stream=False,
            )
        except Exception as e:
            st.error(f"Failed to process query to model: {e}")
            st.stop()

    # Display the generated brochure
    try:
        response = chat_streaming.choices[0].message.content
        st.write("🤖:")
        st.write(response)
    except Exception as e:
        st.error(f"Failed to process query: {e}")

st.markdown("--------------")
st.write("© 2024 Application")