UKURIKIYEYEZU committed on
Commit
c353a73
·
verified ·
1 Parent(s): 2e609d0

Update app.py

Files changed (1)
  1. app.py +255 -48
app.py CHANGED
@@ -1,64 +1,271 @@
  import gradio as gr
- from huggingface_hub import InferenceClient

- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]

-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})

-     messages.append({"role": "user", "content": message})

-     response = ""

-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content

-         response += token
-         yield response

  """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
  demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
  )

- 
  if __name__ == "__main__":
-     demo.launch()

+ import os
+ from langchain_groq import ChatGroq
+ from langchain.prompts import ChatPromptTemplate, PromptTemplate
+ from langchain.output_parsers import ResponseSchema, StructuredOutputParser
+ from urllib.parse import urljoin, urlparse
+ import requests
+ from io import BytesIO
+ from langchain_chroma import Chroma
+ import requests
+ from bs4 import BeautifulSoup
+ from langchain_core.prompts import ChatPromptTemplate
  import gradio as gr
+ from PyPDF2 import PdfReader
+ from langchain_huggingface import HuggingFaceEmbeddings
+ 
+ groq_api_key = os.environ.get('ACCESS')
+ embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+ 
+ 
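+ # Crawl each base URL and the same-domain links found on it (one level deep),
+ # storing cleaned page text (or extracted PDF text) keyed by URL.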
+ def scrape_websites(base_urls):
+     try:
+         visited_links = set()  # To avoid revisiting the same link
+         content_by_url = {}  # Store content from each URL
+ 
+         for base_url in base_urls:
+             if not base_url.strip():
+                 continue  # Skip empty or invalid URLs
+ 
+             print(f"Scraping base URL: {base_url}")
+             html_content = fetch_page_content(base_url)
+             if html_content:
+                 cleaned_content = clean_body_content(html_content)
+                 content_by_url[base_url] = cleaned_content
+                 visited_links.add(base_url)
+ 
+                 # Extract and process all internal links
+                 soup = BeautifulSoup(html_content, "html.parser")
+                 links = extract_internal_links(base_url, soup)
+ 
+                 for link in links:
+                     if link not in visited_links:
+                         print(f"Scraping link: {link}")
+                         page_content = fetch_page_content(link)
+                         if page_content:
+                             cleaned_content = clean_body_content(page_content)
+                             content_by_url[link] = cleaned_content
+                             visited_links.add(link)
+ 
+                         # If the link is a PDF file, extract its content
+                         if link.lower().endswith('.pdf'):
+                             print(f"Extracting PDF content from: {link}")
+                             pdf_content = extract_pdf_text(link)
+                             if pdf_content:
+                                 content_by_url[link] = pdf_content
+ 
+         return content_by_url
+ 
+     except Exception as e:
+         print(f"Error during scraping: {e}")
+         return {}
+ 
+ 
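+ # Download a page and return its raw HTML, or None on any request error.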
+ def fetch_page_content(url):
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         return response.text
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching {url}: {e}")
+         return None
+ 
+ 
+ def extract_internal_links(base_url, soup):
+     links = set()
+     for anchor in soup.find_all("a", href=True):
+         href = anchor["href"]
+         full_url = urljoin(base_url, href)
+         if is_internal_link(base_url, full_url):
+             links.add(full_url)
+     return links
+ 
+ 
+ def is_internal_link(base_url, link_url):
+     base_netloc = urlparse(base_url).netloc
+     link_netloc = urlparse(link_url).netloc
+     return base_netloc == link_netloc
+ 
+ 
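+ # Download a linked PDF and return its concatenated page text, or None if
+ # the download or parsing fails.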
+ def extract_pdf_text(pdf_url):
+     try:
+         response = requests.get(pdf_url)
+         response.raise_for_status()
+         with BytesIO(response.content) as file:
+             reader = PdfReader(file)
+             pdf_text = ""
+             for page in reader.pages:
+                 pdf_text += page.extract_text() or ""  # guard against pages with no extractable text
+ 
+         return pdf_text if pdf_text else None
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching PDF {pdf_url}: {e}")
+         return None
+     except Exception as e:
+         print(f"Error reading PDF {pdf_url}: {e}")
+         return None
+ 
+ 
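+ # Strip <script>/<style> tags and collapse the page to plain, line-trimmed text.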
+ def clean_body_content(html_content):
+     soup = BeautifulSoup(html_content, "html.parser")
+ 
+     for script_or_style in soup(["script", "style"]):
+         script_or_style.extract()
+ 
+     cleaned_content = soup.get_text(separator="\n")
+     cleaned_content = "\n".join(
+         line.strip() for line in cleaned_content.splitlines() if line.strip()
+     )
+     return cleaned_content
+ 
+ 
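+ # Build the knowledge base at startup: scrape the RRA pages, chunk the text,
+ # embed it into Chroma, and assemble the RAG chain used by the chat UI below.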
+ if __name__ == "__main__":
+     website = ["https://www.rra.gov.rw/en/publications",
+                "https://www.rra.gov.rw/en/customs-services"
+                ]
+     all_content = scrape_websites(website)
+ 
+     temp_list = []
+     for url, content in all_content.items():
+         temp_list.append((url, content))
+ 
+     processed_texts = []
+ 
+     for element in temp_list:
+         if isinstance(element, tuple):
+             url, content = element
+             processed_texts.append(f"url: {url}, content: {content}")
+         elif isinstance(element, str):
+             processed_texts.append(element)
+         else:
+             processed_texts.append(str(element))
+ 
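+     # Split each document string into fixed-size character chunks before embedding.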
+     def chunk_string(s, chunk_size=1000):
+         return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+ 
+     chunked_texts = []
+ 
+     for text in processed_texts:
+         chunked_texts.extend(chunk_string(text))
+ 
+ 
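+     # Index the chunks in a local Chroma collection using the HuggingFace embeddings.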
+     vectorstore = Chroma(
+         collection_name="RRA",
+         embedding_function=embed_model,
+         persist_directory="./",
+     )
+ 
+     vectorstore.get().keys()  # inspect the existing collection (result unused)
+ 
+     vectorstore.add_texts(chunked_texts)
+ 
+ 
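+     # Prompt template: {context} is filled with the retrieved chunks and
+     # {question} with the user's message.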
+     template = ("""
+     You are a friendly and intelligent chatbot designed to assist users in a conversational and human-like manner. Your goal is to provide accurate, helpful, and engaging responses from the provided context: {context} while maintaining a natural tone. Follow these guidelines:
+ 
+     1. **Greetings:** If the user greets you (e.g., "Morning," "Hello," "Hi"), respond warmly and acknowledge the greeting. For example:
+        - "😊 Good morning! How can I assist you today?"
+        - "Hello! What can I do for you? 🚀"
+     2. **Extract Information:** If the user asks for specific information, extract only the relevant details from the provided context: {context}.
+     3. **Human-like Interaction:** Respond in a warm, conversational tone. Use emojis occasionally to make the interaction more engaging (e.g., 😊, 🚀).
+     4. **Stay Updated:** Acknowledge the current date and time to show you are aware of real-time updates.
+     5. **No Extra Content:** If no information matches the user's request, respond politely: "I don't have that information at the moment, but I'm happy to help with something else! 😊"
+     6. **Personalized Interaction:** Use the user's historical interactions (if available) to tailor your responses and make the conversation more personalized.
+     7. **Direct Data Only:** If the user requests specific data, provide only the requested information without additional explanations unless asked.
+ 
+     Context: {context}
+     User's Question: {question}
+     Your Response:
+     """)
+ 
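+     # Assemble the RAG chain: retrieve relevant chunks, fill the prompt, query
+     # the Groq-hosted Llama 3.3 model, and parse the reply to plain text.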
+     rag_prompt = PromptTemplate.from_template(template)
+ 
+     retriever = vectorstore.as_retriever()
+ 
+     from langchain_core.output_parsers import StrOutputParser
+     from langchain_core.runnables import RunnablePassthrough
+ 
+     llm = ChatGroq(model="llama-3.3-70b-versatile", api_key=groq_api_key)
+ 
+     rag_chain = (
+         {"context": retriever, "question": RunnablePassthrough()}
+         | rag_prompt
+         | llm
+         | StrOutputParser()
+     )
+ 
+ 
+     # Define the RAG memory stream function
+     def rag_memory_stream(message, history):
+         partial_text = ""
+         for new_text in rag_chain.stream(message):  # stream tokens from the RAG chain
+             partial_text += new_text
+             yield partial_text
+ 
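+     # Illustrative note (not part of this commit): the chain also supports a
+     # non-streaming call, e.g. rag_chain.invoke("<your question>").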
+     # Interface title
+     title = "RRA Chatbot"
+ 
+     # Example questions for the examples section
+     examples = [
+         "What is TIN deregistration? What about Tax account deactivation?",
+         'What is "permanent establishment"?',
+         "When do I receive my registration certificate?"
+     ]
+ 
+     # Custom CSS for styling the interface
+     custom_css = """
+ body {
+     font-family: "Arial", serif;
+ }
+ .gradio-container {
+     font-family: "Times New Roman", serif;
+ }
+ .gr-button {
+     background-color: #007bff; /* Blue button */
+     color: white;
+     border: none;
+     border-radius: 5px;
+     font-size: 16px;
+     padding: 10px 20px;
+     cursor: pointer;
+ }
+ .gr-textbox:focus, .gr-button:focus {
+     outline: none; /* Remove the focus outline for a cleaner look */
+ }
+ 
+ /* Custom CSS for the examples section */
+ .gr-examples {
+     font-size: 30px; /* Increase font size of examples */
+     background-color: #f9f9f9; /* Light background color */
+     border-radius: 30px; /* Rounded corners */
+ }
+ 
+ .gr-examples .example {
+     background-color: white; /* White background for each example */
+     cursor: pointer; /* Change cursor to pointer on hover */
+     transition: background-color 0.3s ease; /* Smooth hover effect */
+ }
+ 
+ .gr-examples .example:hover {
+     background-color: #f1f1f1; /* Light gray background on hover */
+ }
  """
+ 
+ # Create the Chat Interface
  demo = gr.ChatInterface(
+     fn=rag_memory_stream,
+     title=title,
+     examples=examples,  # Clickable example questions
+     fill_height=True,
+     theme="soft",
+     css=custom_css,  # Apply the custom CSS
  )

+ # Launch the app
  if __name__ == "__main__":
+     demo.launch(share=True, inbrowser=True, height=800, debug=True, width="100%")