UKURIKIYEYEZU committed
Commit 5e4a27f · verified · 1 Parent(s): bc75c0d

Update app.py

Files changed (1)
  1. app.py +379 -60
app.py CHANGED
@@ -1,64 +1,383 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )


  if __name__ == "__main__":
-     demo.launch()
+ import os
+ import PyPDF2
+ from google.colab import userdata
+ from PyPDF2 import PdfReader
+
+ ## Embedding model!
+ from langchain_huggingface import HuggingFaceEmbeddings
+ embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+
+ import pandas as pd
+
+ # Set folder path
+ folder_path = "/content/drive/MyDrive/Ijwi_folder"
+ context_data = []
+
+ # List all files in the folder
+ files = os.listdir(folder_path)
+
+ # Get list of CSV and Excel files
+ data_files = [f for f in files if f.endswith(('.csv', '.xlsx', '.xls'))]
+
+ # Process each file
+ for f, file in enumerate(data_files, 1):
+     print(f"\nProcessing file {f}: {file}")
+     file_path = os.path.join(folder_path, file)
+
+     try:
+         # Read the file based on its extension
+         if file.endswith('.csv'):
+             df = pd.read_csv(file_path)
+         else:
+             df = pd.read_excel(file_path)
+
+         # Extract non-empty values from column 2 and append them
+         context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
+
+     except Exception as e:
+         print(f"Error processing file {file}: {str(e)}")
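+
+ # Illustrative only (hypothetical data): the loop above keeps just the third column
+ # (position 2) of each sheet, so a frame like this would contribute its two question
+ # strings to context_data.
+ _demo_df = pd.DataFrame({
+     "id": [1, 2],
+     "topic": ["reporting", "support"],
+     "question": ["How do I report abuse?", "Where can I get help?"],
+ })
+ assert _demo_df.iloc[:, 2].dropna().astype(str).tolist() == [
+     "How do I report abuse?",
+     "Where can I get help?",
+ ]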
+
+
+
+
+
+ def extract_text_from_pdf(pdf_path):
+     """Extracts text from a PDF file."""
+     try:
+         with open(pdf_path, "rb") as file:
+             reader = PyPDF2.PdfReader(file)
+             text = "".join(page.extract_text() or "" for page in reader.pages)  # Handle None cases
+             return text
+     except Exception as e:
+         print(f"Error extracting text from {pdf_path}: {e}")
+         return ""
+
+ # Folder containing the PDFs
+ folder_path = "/content/drive/MyDrive/Ijwi_folder"  # Update with your actual folder path
+
+ # Initialize the list to hold the extracted text chunks
+ text_chunks = []
+
+ # Get all PDF filenames in the folder
+ filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
+
+ # Process each PDF file
+ for index, file in enumerate(filenames, 1):
+     print(f"\nProcessing file {index}: {file}")
+     pdf_path = os.path.join(folder_path, file)
+
+     try:
+         # Extract text from the PDF
+         extracted_text = extract_text_from_pdf(pdf_path)
+
+         if extracted_text.strip():  # Ensure extracted text is not just whitespace
+             # Split extracted text into chunks of 2000 characters
+             chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
+
+             # Append extracted chunks to the list
+             text_chunks.extend(chunks)
+         else:
+             print(f"No text found in the PDF: {file}")
+
+     except Exception as e:
+         print(f"Error reading the PDF {file}: {e}")
+
+
+
+ from urllib.parse import urljoin, urlparse
+ import requests
+ from io import BytesIO
+
+ from bs4 import BeautifulSoup
+ from langchain_core.prompts import ChatPromptTemplate
  import gradio as gr
+
+
+ def scrape_websites(base_urls):
+     try:
+         visited_links = set()  # To avoid revisiting the same link
+         content_by_url = {}  # Store content from each URL
+
+         for base_url in base_urls:
+             if not base_url.strip():
+                 continue  # Skip empty or invalid URLs
+
+             print(f"Scraping base URL: {base_url}")
+             html_content = fetch_page_content(base_url)
+             if html_content:
+                 cleaned_content = clean_body_content(html_content)
+                 content_by_url[base_url] = cleaned_content
+                 visited_links.add(base_url)
+
+                 # Extract and process all internal links
+                 soup = BeautifulSoup(html_content, "html.parser")
+                 links = extract_internal_links(base_url, soup)
+
+                 for link in links:
+                     if link not in visited_links:
+                         print(f"Scraping link: {link}")
+                         page_content = fetch_page_content(link)
+                         if page_content:
+                             cleaned_content = clean_body_content(page_content)
+                             content_by_url[link] = cleaned_content
+                             visited_links.add(link)
+
+                         # If the link is a PDF file, extract its content
+                         if link.lower().endswith('.pdf'):
+                             print(f"Extracting PDF content from: {link}")
+                             pdf_content = extract_pdf_text(link)
+                             if pdf_content:
+                                 content_by_url[link] = pdf_content
+
+         return content_by_url
+
+     except Exception as e:
+         print(f"Error during scraping: {e}")
+         return {}
+
+
+ def fetch_page_content(url):
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         return response.text
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching {url}: {e}")
+         return None
+
+
+ def extract_internal_links(base_url, soup):
+     links = set()
+     for anchor in soup.find_all("a", href=True):
+         href = anchor["href"]
+         full_url = urljoin(base_url, href)
+         if is_internal_link(base_url, full_url):
+             links.add(full_url)
+     return links
+
+
+ def is_internal_link(base_url, link_url):
+     base_netloc = urlparse(base_url).netloc
+     link_netloc = urlparse(link_url).netloc
+     return base_netloc == link_netloc
+
+
+ def extract_pdf_text(pdf_url):
+     try:
+         response = requests.get(pdf_url)
+         response.raise_for_status()
+
+         # Open the PDF from the response content
+         with BytesIO(response.content) as file:
+             reader = PdfReader(file)
+             pdf_text = ""
+             for page in reader.pages:
+                 pdf_text += page.extract_text() or ""  # extract_text() can return None
+
+         return pdf_text if pdf_text else None
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching PDF {pdf_url}: {e}")
+         return None
+     except Exception as e:
+         print(f"Error reading PDF {pdf_url}: {e}")
+         return None
+
+
+ def clean_body_content(html_content):
+     soup = BeautifulSoup(html_content, "html.parser")
+
+     # Remove scripts and styles
+     for script_or_style in soup(["script", "style"]):
+         script_or_style.extract()
+
+     # Get text and clean up
+     cleaned_content = soup.get_text(separator="\n")
+     cleaned_content = "\n".join(
+         line.strip() for line in cleaned_content.splitlines() if line.strip()
+     )
+     return cleaned_content
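+
+ # Illustrative sketch using the two sites scraped below: is_internal_link compares
+ # only the network location, so a relative href resolved with urljoin stays "internal"
+ # while a link to a different domain does not.
+ assert is_internal_link("https://haguruka.org.rw/our-work/",
+                         urljoin("https://haguruka.org.rw/our-work/", "/contact"))
+ assert not is_internal_link("https://haguruka.org.rw/our-work/",
+                             "https://www.rib.gov.rw/index.php?id=371")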
+


  if __name__ == "__main__":
+     website = [
+         "https://www.rib.gov.rw/index.php?id=371",
+         "https://haguruka.org.rw/our-work/"
+     ]
+     all_content = scrape_websites(website)
+
+     # Temporary list to store (url, content) tuples
+     temp_list = []
+
+     # Process and store each URL with its content
+     for url, content in all_content.items():
+         temp_list.append((url, content))
+
+
+
+     processed_texts = []
+
+     # Process each element in the temporary list
+     for element in temp_list:
+         if isinstance(element, tuple):
+             url, content = element  # Unpack the tuple
+             processed_texts.append(f"url: {url}, content: {content}")
+         elif isinstance(element, str):
+             processed_texts.append(element)
+         else:
+             processed_texts.append(str(element))
+
+     def chunk_string(s, chunk_size=2000):
+         return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+
+     # List to store the chunks
+     chunked_texts = []
+
+     for text in processed_texts:
+         chunked_texts.extend(chunk_string(text))
+
+     data = []
+     data.extend(context_data)
+     data.extend([item for item in text_chunks if item not in data])
+     data.extend([item for item in chunked_texts if item not in data])
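+
+     # Illustrative only: chunk_string slices a plain string into fixed-size pieces,
+     # so the final chunk may be shorter than chunk_size.
+     assert chunk_string("abcdefgh", chunk_size=3) == ["abc", "def", "gh"]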
+
+
+
+     from langchain_community.vectorstores import Chroma
+
+
+     vectorstore = Chroma(
+         collection_name="GBV_dataset",
+         embedding_function=embed_model,
+     )
+
+     vectorstore.get().keys()
+
+     # Add data to the vector store
+     vectorstore.add_texts(data)
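+
+     # Illustrative sketch: how the populated store could be queried for the chunks
+     # most similar to a user question (the query text here is made up).
+     _docs = vectorstore.similarity_search("How can I report gender-based violence?", k=3)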
+
+
+
+
+
+
+     from openai import OpenAI
+     from langchain_core.prompts import PromptTemplate
+     from langchain_core.output_parsers import StrOutputParser
+     from langchain_core.runnables import RunnablePassthrough
+     import gradio as gr
+     from typing import Iterator
+     import time
+
+
+     # Template with user personalization and improved welcome message
+     template = ("""
+     You are a friendly and intelligent chatbot designed to assist users in a conversational and human-like manner. Your goal is to provide accurate, helpful, and engaging responses based on the provided context: {context}. Follow these guidelines:
+
+     1. **Contextual Interaction**
+        - Begin with a warm and empathetic welcome message
+        - Extract precise details from the provided context: {context}
+        - Respond directly to the user's question: {question}
+        - Remember the user's name is {first_name} and address them by name occasionally, not always
+
+     2. **Communication Guidelines**
+        - Maintain a warm, conversational tone
+        - Use occasional emojis for engagement
+        - Provide clear, concise information
+
+     3. **Response Strategies**
+        - Greet users naturally and ask about their wellbeing (e.g., "Hello {first_name}! 😊 How are you feeling today?", "Welcome, {first_name}! 😊 You're in a safe and caring space. What's on your mind today?")
+        - Always start with a check-in about the user's wellbeing or current situation
+        - Deliver only relevant information
+        - Avoid generating content beyond the context
+        - Handle missing information transparently
+
+     4. **No Extra Content**
+        - If no information matches the user's request:
+          * Respond politely: "I don't have that information at the moment, {first_name}. 😊"
+          * Offer alternative assistance options
+        - Strictly avoid generating unsupported content
+        - Prevent information padding or speculation
+
+     5. **Extracting Relevant Links**
+        - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
+        - Example response:
+          - "Here is the link you requested, {first_name}: [URL]"
+
+     6. **Real-Time Awareness**
+        - Acknowledge current context when appropriate
+        - Stay focused on the user's immediate needs
+        - If this is the first message, always ask how the user is feeling and what they would like help with today
+
+     **Context:** {context}
+     **User's Question:** {question}
+     **Welcome Message:** {welcome_message}
+     **Is First Message:** {is_first_message}
+     **Your Response:**
+     """)
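+
+     # Illustrative sketch: one way the template above could be wired up with the
+     # PromptTemplate import; the sample values are made up for demonstration.
+     _rag_prompt = PromptTemplate.from_template(template)
+     _example_prompt = _rag_prompt.format(
+         context="(retrieved chunks would go here)",
+         question="Where can I get legal support?",
+         first_name="Aline",
+         welcome_message="Welcome!",
+         is_first_message=True,
+     )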
+
+
+
+
+
+
+
+
+
+     class OpenRouterLLM:
+         def __init__(self, api_key: str):
+             self.client = OpenAI(
+                 base_url="https://openrouter.ai/api/v1",
+                 api_key=api_key
+             )
+             self.headers = {
+                 "HTTP-Referer": "http://localhost:3000",
+                 "X-Title": "Local Development"
+             }
+
+
+         def stream(self, prompt: str) -> Iterator[str]:
+             try:
+                 completion = self.client.chat.completions.create(
+                     extra_headers=self.headers,
+                     model="deepseek/deepseek-r1-distill-llama-70b:free",
+                     # model="google/gemini-2.0-flash-thinking-exp:free",
+                     messages=[{"role": "user", "content": prompt}],
+                     stream=True
+                 )
+
+                 for chunk in completion:
+                     if chunk.choices[0].delta.content is not None:
+                         yield chunk.choices[0].delta.content
+             except Exception as e:
+                 yield f"Error: {str(e)}"
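+
+     # Illustrative sketch of how the class above could be used; the environment
+     # variable name is an assumption, not something defined in this file, and the
+     # calls are left commented out because they would hit the OpenRouter API.
+     # llm = OpenRouterLLM(api_key=os.environ.get("OPENROUTER_API_KEY", ""))
+     # for piece in llm.stream("Muraho! What services do you offer?"):
+     #     print(piece, end="", flush=True)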
+
+
+
+
+     class UserSession:
+         def __init__(self):
+             self.current_user = None
+             self.is_first_message = True
+
+         def set_user(self, user_info):
+             self.current_user = user_info
+             self.is_first_message = True
+
+         def get_user(self):
+             return self.current_user
+
+         def mark_message_sent(self):
+             self.is_first_message = False
+
+         def is_first(self):
+             return self.is_first_message
+
+     # Initialize session and LLM
+     user_session = UserSession()
+
+
+
+
+
+
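+
+     # Illustrative only: the intended lifecycle of the session object above; the
+     # user_info dict shape mirrors the {first_name} placeholder in the template.
+     user_session.set_user({"first_name": "Aline"})
+     assert user_session.is_first()
+     user_session.mark_message_sent()
+     assert not user_session.is_first()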