"""Streamlit app that builds a company brochure from the company's website.

Pipeline:
1. Scrape the landing page (title, visible text, links).
2. Ask the model which of the links are relevant for a brochure.
3. Scrape those pages and ask the model to write the brochure in markdown.
"""

import json
import os
from typing import Optional
from urllib.parse import urljoin

import requests
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from groq import Groq

# Load .env BEFORE reading GROQ_API_KEY, otherwise a key that lives only in
# the .env file is not visible when the client is constructed.
load_dotenv()
api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

MODEL_NAME = "llama-3.3-70b-specdec"
REQUEST_TIMEOUT_SECONDS = 15
MAX_PROMPT_CHARS = 30_000


# --- Scraping pipeline -----------------------------------------------------

class Website:
    """A scraped web page: its URL, raw body, title, visible text and links."""

    def __init__(self, url: str, timeout: float = REQUEST_TIMEOUT_SECONDS):
        """Fetch *url* and extract its title, text and outgoing links.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so a
                stalled host cannot hang the app indefinitely.

        Raises:
            requests.RequestException: If the page cannot be fetched.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, "html.parser")

        # <title> may be absent, or present with no single string child
        # (``.string`` is then None); fall back to a placeholder in both cases.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body.find_all(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Links found on the page; anchors without a usable href are skipped.
        hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
        self.links = [href for href in hrefs if href]

    def get_contents(self) -> str:
        """Return the page title and text formatted for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


# --- First model call: pick the relevant links ------------------------------

# System prompt of the first call.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
# NOTE(review): the text after "with this:" looks like it is missing an
# example (e.g. an email-link pattern) - confirm the intended wording.
link_system_prompt += "Kindly avoid selecting email links with this: \n \n "
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""


def get_links_user_prompt(website: Website) -> str:
    """Build the user prompt listing every link found on *website*."""
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += (
        "please decide which of these are relevant web links to the website, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt


def _select_links(website: Website) -> dict:
    """Ask the model which links of an already-scraped page are relevant."""
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        model=MODEL_NAME,
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    return json.loads(result)


def get_links(url: str) -> dict:
    """Scrape *url* and return the model's selection of relevant links.

    Returns:
        The parsed JSON object produced by the model; the prompt asks for the
        shape ``{"links": [{"type": ..., "url": ...}, ...]}``.

    Raises:
        requests.RequestException: If the page cannot be fetched.
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    return _select_links(Website(url))


# --- Gather all the content the brochure is generated from ------------------

# Cached so a second query for the same URL reuses the scraped text instead of
# scraping the website again. The return value is plain data (a string).
@st.cache_data
def get_all_details(url: str) -> str:
    """Return the text of the landing page plus every relevant linked page."""
    home = Website(url)  # scraped once and reused for the link selection
    sections = ["Home page:\n", home.get_contents()]

    links = _select_links(home)
    print("Available links:", links)

    # The model output is untrusted: tolerate a missing or malformed "links" key.
    entries = links.get("links", []) if isinstance(links, dict) else []
    for link in entries:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # The model is asked for absolute URLs but may echo a relative one.
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            # One unreachable sub-page should not abort the whole brochure.
            print(f"Skipping {page_url}: {exc}")
            continue
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(page.get_contents())
    return "".join(sections)


# --- Second model call: write the brochure ----------------------------------

system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown."
    "Include details of company culture, customers and careers/jobs if you have the information."
)


def second_call_sytem_prompt(system: Optional[str] = None) -> str:
    """Return the custom system prompt, or the default when none is given.

    A missing, empty or whitespace-only *system* falls back to the default,
    because the text input widget starts out holding a single space.
    """
    if system and system.strip():
        return system
    return system_prompt


def get_brochure_user_prompt(company_name: str, url: str) -> str:
    """Build the brochure prompt from the scraped pages of *url*.

    The prompt is truncated to ``MAX_PROMPT_CHARS`` characters to bound the
    size of the request sent to the model.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += (
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown "
        "and provide usable links in the contacts areas \n"
    )
    user_prompt += get_all_details(url)
    return user_prompt[:MAX_PROMPT_CHARS]


# --- Streamlit UI ------------------------------------------------------------

def _render_brochure(company_name: str, url: str, custom_system: str) -> None:
    """Scrape *url*, query the model and render the brochure.

    Each stage reports its own failure and returns, so a later stage never
    runs with data an earlier stage failed to produce.
    """
    with st.spinner("Scraping website..."):
        try:
            second_user_prompt = get_brochure_user_prompt(company_name, url)
        except Exception as e:  # UI boundary: report the problem to the user
            st.error(f"Failed to load website: {e}")
            return
        st.success("Website loaded successfully!")

    st.write("Querying the website...")
    with st.spinner("Processing your query..."):
        try:
            completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": second_call_sytem_prompt(custom_system)},
                    {"role": "user", "content": second_user_prompt},
                ],
                model=MODEL_NAME,
                temperature=0.8,
                max_tokens=2042,
                top_p=0.6,
                stream=False,
            )
        except Exception as e:  # UI boundary: report the problem to the user
            st.error(f"Failed to process query to model: {e}")
            return

    try:
        response = completion.choices[0].message.content
    except (AttributeError, IndexError) as e:
        st.error(f"Failed to process query: {e}")
        return
    st.write("🤖:")
    st.write(response)


st.title("AI Brochures 🎨📌")
st.write("Create a captivating brochure for your company or institution by only using information from your website!!")

# Input fields
system = st.text_input(
    "Modify the model response using a custom system prompt if not satisfied with generated response:",
    " ",
)
url = st.text_input("Provide the Company's website URL:", " ")
user_query = st.text_area("Provide a title for the brochure or the name of the organization")

# The text inputs default to a single space, so strip before testing them.
company_name = user_query.strip()
target_url = url.strip()
if company_name:
    if target_url:
        _render_brochure(company_name, target_url, system.strip())
    else:
        st.warning("Please provide the company's website URL.")

st.markdown("--------------")
st.write("© 2024 Application")