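"""
AI Brochures: a Streamlit app that scrapes a company's website, asks a Groq
model to select the most brochure-relevant links, and then generates a short
markdown brochure from the combined page contents.
"""
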
import streamlit as st
from bs4 import BeautifulSoup
import requests
from groq import Groq
from dotenv import load_dotenv
import os
import json

# Scraping pipeline
class Website:
    """
    A utility class to represent a website that we have scraped.
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url)
        response.raise_for_status()  # surface 4xx/5xx errors instead of parsing an error page
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            # Drop elements that carry no useful text for a brochure
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]  # links found on the home page
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


# First, get the relevant links from the home page for broad information about the website provided

# System prompt for the first call
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "Kindly avoid selecting mailto: email links.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# Predefined user prompt to extract only the important links about the website
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for the brochure, and respond with the full https URL in JSON format. \
Do not include Terms of Service or Privacy links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

# Load GROQ_API_KEY from the environment (.env) before creating the client
load_dotenv()
client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
)

# Make the first call to get the important links
def get_links(url):
    website = Website(url)
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        model="llama-3.3-70b-specdec",
        temperature=1,
        max_tokens=2048,
        stop=None,
        stream=False,
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    return json.loads(result)
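
# get_links returns the parsed JSON, whose shape follows the example in the
# system prompt above, e.g.:
#   {"links": [{"type": "about page", "url": "https://full.url/goes/here/about"}, ...]}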

# All the content required to generate the brochure about the website
@st.cache_data  # reuse the cached scrape to respond to repeat queries rather than scraping the website again
def get_all_details(url):
    result = "Home page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Available links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

    
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

def second_call_system_prompt(system=None):
    # Use the custom system prompt if one was supplied, otherwise the default
    if system:
        return system
    else:
        return system_prompt


def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown and provide usable links in the contact areas.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:30_000]  # truncate if more than 30,000 characters
    return user_prompt


# Streamlit UI
st.title("AI Brochures 🎨📌")
st.write("Create a captivating brochure for your company or institution by only using information from your website!!")

# Input fields
system = st.text_input("Modify the model response with a custom system prompt if you are not satisfied with the generated response:", "")
url = st.text_input("Provide the company's website URL:", "")
user_query = st.text_area("Provide a title for the brochure or the name of the organization")

if user_query:
    # Scrape website content
    with st.spinner("Scraping website..."):
        try:
            second_user_prompt = get_brochure_user_prompt(user_query, url)
            st.success("Website loaded successfully!")
        except Exception as e:
            st.error(f"Failed to load website: {e}")
            st.stop()  # don't continue with an undefined prompt

    # Second call, to the Groq API, to generate the brochure
    st.write("Querying the website...")
    with st.spinner("Processing your query..."):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": second_call_system_prompt(system if system.strip() else None)},
                    {"role": "user", "content": second_user_prompt}
                ],
                model="llama3-groq-70b-8192-tool-use-preview",
                temperature=0.8,
                max_tokens=2048,
                top_p=0.6,
                stream=False,
            )
            response = chat_completion.choices[0].message.content
            st.write("🤖:")
            st.write(response)
        except Exception as e:
            st.error(f"Failed to process query: {e}")



st.markdown("--------------")
st.write("© 2024 Application")