import os import PyPDF2 from PyPDF2 import PdfReader import pandas as pd ## Embedding model! from langchain_huggingface import HuggingFaceEmbeddings embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1") folder_path = "./" context_data = [] # List all files in the folder files = os.listdir(folder_path) # Get list of CSV and Excel files data_files = [f for f in files if f.endswith(('.csv', '.xlsx', '.xls'))] # Process each file for f, file in enumerate(data_files, 1): print(f"\nProcessing file {f}: {file}") file_path = os.path.join(folder_path, file) try: # Read the file based on its extension if file.endswith('.csv'): df = pd.read_csv(file_path) else: df = pd.read_excel(file_path) # Extract non-empty values from column 2 and append them context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist()) except Exception as e: print(f"Error processing file {file}: {str(e)}") import os import PyPDF2 from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.schema import Document def extract_text_from_pdf(pdf_path): """Extract text from a PDF file.""" try: with open(pdf_path, "rb") as file: reader = PyPDF2.PdfReader(file) return "".join(page.extract_text() or "" for page in reader.pages) except Exception as e: print(f"Error with {pdf_path}: {e}") return "" pdf_files = [f for f in files if f.lower().endswith(".pdf")] # Process PDFs documents = [] for file in pdf_files: print(f"Processing: {file}") pdf_path = os.path.join(folder_path, file) text = extract_text_from_pdf(pdf_path) if text: documents.append(Document(page_content=text, metadata={"source": file})) # Split into chunks text_splitter = RecursiveCharacterTextSplitter( separators=['\n\n', '\n', '.', ','], chunk_size=500, chunk_overlap=50 ) chunks = text_splitter.split_documents(documents) text_only_chunks = [chunk.page_content for chunk in chunks] from urllib.parse import urljoin, urlparse import requests from io import BytesIO from bs4 import BeautifulSoup from langchain_core.prompts import ChatPromptTemplate import gradio as gr def scrape_websites(base_urls): try: visited_links = set() # To avoid revisiting the same link content_by_url = {} # Store content from each URL for base_url in base_urls: if not base_url.strip(): continue # Skip empty or invalid URLs print(f"Scraping base URL: {base_url}") html_content = fetch_page_content(base_url) if html_content: cleaned_content = clean_body_content(html_content) content_by_url[base_url] = cleaned_content visited_links.add(base_url) # Extract and process all internal links soup = BeautifulSoup(html_content, "html.parser") links = extract_internal_links(base_url, soup) for link in links: if link not in visited_links: print(f"Scraping link: {link}") page_content = fetch_page_content(link) if page_content: cleaned_content = clean_body_content(page_content) content_by_url[link] = cleaned_content visited_links.add(link) # If the link is a PDF file, extract its content if link.lower().endswith('.pdf'): print(f"Extracting PDF content from: {link}") pdf_content = extract_pdf_text(link) if pdf_content: content_by_url[link] = pdf_content return content_by_url except Exception as e: print(f"Error during scraping: {e}") return {} def fetch_page_content(url): try: response = requests.get(url, timeout=10) response.raise_for_status() return response.text except requests.exceptions.RequestException as e: print(f"Error fetching {url}: {e}") return None def extract_internal_links(base_url, soup): links = set() for anchor in soup.find_all("a", href=True): href = anchor["href"] full_url = urljoin(base_url, href) if is_internal_link(base_url, full_url): links.add(full_url) return links def is_internal_link(base_url, link_url): base_netloc = urlparse(base_url).netloc link_netloc = urlparse(link_url).netloc return base_netloc == link_netloc def extract_pdf_text(pdf_url): try: response = requests.get(pdf_url) response.raise_for_status() # Open the PDF from the response content with BytesIO(response.content) as file: reader = PdfReader(file) pdf_text = "" for page in reader.pages: pdf_text += page.extract_text() return pdf_text if pdf_text else None except requests.exceptions.RequestException as e: print(f"Error fetching PDF {pdf_url}: {e}") return None except Exception as e: print(f"Error reading PDF {pdf_url}: {e}") return None def clean_body_content(html_content): soup = BeautifulSoup(html_content, "html.parser") # Remove scripts and styles for script_or_style in soup(["script", "style"]): script_or_style.extract() # Get text and clean up cleaned_content = soup.get_text(separator="\n") cleaned_content = "\n".join( line.strip() for line in cleaned_content.splitlines() if line.strip() ) return cleaned_content # if __name__ == "__main__": # website = [ # #"https://www.rib.gov.rw/index.php?id=371", # "https://haguruka.org.rw/our-work/" # ] # all_content = scrape_websites(website) # # Temporary list to store (url, content) tuples # temp_list = [] # # Process and store each URL with its content # for url, content in all_content.items(): # temp_list.append((url, content)) # processed_texts = [] # # Process each element in the temporary list # for element in temp_list: # if isinstance(element, tuple): # url, content = element # Unpack the tuple # processed_texts.append(f"url: {url}, content: {content}") # elif isinstance(element, str): # processed_texts.append(element) # else: # processed_texts.append(str(element)) # def chunk_string(s, chunk_size=2000): # return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)] # # List to store the chunks # chunked_texts = [] # for text in processed_texts: # chunked_texts.extend(chunk_string(text)) data = [] data.extend(context_data) #data.extend([item for item in text_only_chunks if item not in data]) # data.extend([item for item in chunked_texts if item not in data]) #from langchain_community.vectorstores import Chroma from langchain_chroma import Chroma vectorstore = Chroma( collection_name="GBV_data_set", embedding_function=embed_model, ) vectorstore.get().keys() # add data to vector nstore vectorstore.add_texts(data) api= os.environ.get('V1') from openai import OpenAI from langchain_core.prompts import PromptTemplate from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnablePassthrough import gradio as gr from typing import Iterator import time #template for GBV support chatbot template = (""" You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries. You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner. When responding follow these guidelines: 1. **Emotional Intelligence** - Validate feelings without judgment (e.g., "It is completely understandable to feel this way") - Offer reassurance when appropriate, always centered on empowerment - Adjust your tone based on the emotional state conveyed 2. **Personalized Communication** - Avoid contractions (e.g., use I am instead of I'm) - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions - Balance warmth with professionalism 3. **Conversation Management** - Refer to {conversation_history} to maintain continuity and avoid repetition - Keep responses concise unless greater detail is explicitly requested - Use clear paragraph breaks for readability - Prioritize immediate concerns before addressing secondary issues 4. **Information Delivery** - Extract only relevant information from {context} that directly addresses the question - Present information in accessible, non-technical language - Organize resource recommendations in order of relevance and accessibility - Provide links [URL] only when specifically requested, prefaced with clear descriptions - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?" 5. **Safety and Ethics** - Prioritize user safety in all responses - Never generate speculative content about their specific situation - Avoid phrases that could minimize experiences or create pressure - Include gentle reminders about professional help when discussing serious issues Your response should balance emotional support with practical guidance. **Context:** {context} **User's Question:** {question} **Your Response:** """) rag_prompt = PromptTemplate.from_template(template) retriever = vectorstore.as_retriever() import requests API_TOKEN = os.environ.get('Token') model_name = "facebook/nllb-200-distilled-600M" url = f"https://api-inference.huggingface.co/models/{model_name}" headers = { "Authorization": f"Bearer {API_TOKEN}" } def translate_text(text, src_lang, tgt_lang): """Translate text using Hugging Face API""" response = requests.post( url, headers=headers, json={ "inputs": text, "parameters": { "src_lang": src_lang, "tgt_lang": tgt_lang } } ) if response.status_code == 200: result = response.json() if isinstance(result, list) and len(result) > 0: return result[0]['translation_text'] return result['translation_text'] else: print(f"Translation error: {response.status_code}, {response.text}") return text # Return original text if translation fails class OpenRouterLLM: def __init__(self, key: str): try: self.client = OpenAI( base_url="https://openrouter.ai/api/v1", api_key=key ) self.headers = { "HTTP-Referer": "http://localhost:3000", "X-Title": "Local Development" } except Exception as e: print(f"Initialization error: {e}") raise def stream(self, prompt: str) -> Iterator[str]: try: completion = self.client.chat.completions.create( model="deepseek/deepseek-r1-distill-llama-70b:free", #model="meta-llama/llama-3.3-70b-instruct:free", #model="qwen/qwq-32b:free", messages=[{"role": "user", "content": prompt}], stream=True ) for chunk in completion: delta = chunk.choices[0].delta if hasattr(delta, "content") and delta.content: yield delta.content except Exception as e: yield f"Streaming error: {str(e)}" class UserSession: def __init__(self, llm: OpenRouterLLM): # Accept an instance of OpenRouterLLM self.current_user = None self.welcome_message = None self.conversation_history = [] # Add conversation history storage self.llm = llm # Store the LLM instance def set_user(self, user_info): self.current_user = user_info self.set_welcome_message(user_info.get("Nickname", "Guest")) # Initialize conversation history with welcome message welcome = self.get_welcome_message() self.conversation_history = [ {"role": "assistant", "content": welcome}, ] def get_user(self): return self.current_user def set_welcome_message(self, Nickname, src_lang="eng_Latn", tgt_lang="kin_Latn"): """Set a dynamic welcome message using the OpenRouterLLM.""" prompt = ( f"Create a very brief welcome message for {Nickname}. " f"The message should: " f"1. Welcome {Nickname} warmly and professionally. " f"2. Emphasize that this is a safe and trusted space. " f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. " f"4. Use a tone that is warm, reassuring, and professional. " f"5. Keep the message concise and impactful." ) # Use the OpenRouterLLM to generate the message welcome = "".join(self.llm.stream(prompt)) # Stream and concatenate the response welcome_text=translate_text(welcome, src_lang, tgt_lang) # Format the message with HTML styling self.welcome_message = ( f"
" f"{welcome_text}" f"
" ) def get_welcome_message(self): return self.welcome_message def add_to_history(self, role, message): """Add a message to the conversation history""" self.conversation_history.append({"role": role, "content": message}) def get_conversation_history(self): """Get the full conversation history""" return self.conversation_history def get_formatted_history(self): """Get conversation history formatted as a string for the LLM""" formatted_history = "" for entry in self.conversation_history: role = "User" if entry["role"] == "user" else "Assistant" formatted_history += f"{role}: {entry['content']}\n\n" return formatted_history api_key =api llm_instance = OpenRouterLLM(key=api_key) #llm_instance = model user_session = UserSession(llm_instance) def collect_user_info(Nickname): if not Nickname: return "Nickname is required to proceed.", gr.update(visible=False), gr.update(visible=True), [] # Store user info for chat session user_info = { "Nickname": Nickname, "timestamp": time.strftime("%Y-%m-%d %H:%M:%S") } # Set user in session user_session.set_user(user_info) # Generate welcome message welcome_message = user_session.get_welcome_message() # Add initial message to start the conversation chat_history = add_initial_message([(None, welcome_message)]) # Return welcome message and update UI return welcome_message, gr.update(visible=True), gr.update(visible=False), chat_history # Add initial message to start the conversation def add_initial_message(chatbot): #initial_message = (" " # ) return chatbot #+ [(None, initial_message)] # Create RAG chain with user context and conversation history def create_rag_chain(retriever, template, api_key): llm = OpenRouterLLM(api_key) rag_prompt = PromptTemplate.from_template(template) def stream_func(input_dict): # Get context using the retriever's invoke method context = retriever.invoke(input_dict["question"]) context_str = "\n".join([doc.page_content for doc in context]) # Get user info from the session user_info = user_session.get_user() or {} first_name = user_info.get("Nickname", "User") # Get conversation history conversation_history = user_session.get_formatted_history() # Format prompt with user context and conversation history prompt = rag_prompt.format( context=context_str, question=input_dict["question"], first_name=first_name, conversation_history=conversation_history ) # Stream response return llm.stream(prompt) return stream_func # def rag_memory_stream(message, history): # # Add user message to history # user_session.add_to_history("user", message) # # Initialize with empty response # partial_text = "" # full_response = "" # # Use the rag_chain with the question # for new_text in rag_chain({"question": message}): # partial_text += new_text # full_response = partial_text # yield partial_text # # After generating the complete response, add it to history # user_session.add_to_history("assistant", full_response) def rag_memory_stream(message, history, user_lang="kin_Latn", system_lang="eng_Latn"): english_message = translate_text(message, user_lang, system_lang) user_session.add_to_history("user", english_message) full_response = "" for new_text in rag_chain({"question": english_message}): full_response += new_text translated_response = translate_text(full_response, system_lang, user_lang) user_session.add_to_history("assistant", full_response) yield translated_response import gradio as gr api_key = api def chatbot_interface(): api_key = api global template template = """ You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries. **Previous conversation:** {conversation_history} **Context information:** {context} **User's Question:** {question} When responding follow these guidelines: 1. **Strict Context Adherence** - Only use information that appears in the provided {context} - If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response 2. **Personalized Communication** - Avoid contractions (e.g., use I am instead of I'm) - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions - Balance warmth with professionalism 3. **Emotional Intelligence** - Validate feelings without judgment - Offer reassurance when appropriate, always centered on empowerment - Adjust your tone based on the emotional state conveyed 4. **Conversation Management** - Refer to {conversation_history} to maintain continuity and avoid repetition - Keep responses concise unless greater detail is explicitly requested - Use clear paragraph breaks for readability 5. **Information Delivery** - Extract only relevant information from {context} that directly addresses the question - Present information in accessible, non-technical language - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?" 6. **Safety and Ethics** - Do not generate any speculative content or advice not supported by the context - If the context contains safety information, prioritize sharing that information Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials. **Context:** {context} **User's Question:** {question} **Your Response:** """ global rag_chain rag_chain = create_rag_chain(retriever, template, api_key) with gr.Blocks() as demo: # User registration section with gr.Column(visible=True, elem_id="registration_container") as registration_container: gr.Markdown("### Your privacy matters to us! Just share a nickname you feel comfy with to start chatting..") with gr.Row(): first_name = gr.Textbox( label="Nickname", placeholder="Enter your Nickname You feel comfy", scale=1, elem_id="input_nickname" ) with gr.Row(): submit_btn = gr.Button("Start Chatting", variant="primary", scale=2) response_message = gr.Markdown() # Chatbot section (initially hidden) with gr.Column(visible=False, elem_id="chatbot_container") as chatbot_container: chat_interface = gr.ChatInterface( fn=rag_memory_stream, title="Chat with GBVR", fill_height=True ) # Footer with version info gr.Markdown("Ijwi ry'Ubufasha Chatbot v1.0.0 © 2025") # Handle user registration submit_btn.click( collect_user_info, inputs=[first_name], outputs=[response_message, chatbot_container, registration_container, chat_interface.chatbot] ) demo.css = """ :root { --background: #f0f0f0; --text: #000000; } body, .gradio-container { margin: 0; padding: 0; width: 100vw; height: 100vh; display: flex; flex-direction: column; justify-content: center; align-items: center; background: var(--background); color: var(--text); } .gradio-container { max-width: 100%; max-height: 100%; } .gr-box { background: var(--background); color: var(--text); border-radius: 12px; padding: 2rem; border: 1px solid rgba(0, 0, 0, 0.1); box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05); } .gr-button-primary { background: var(--background); color: var(--text); padding: 12px 24px; border-radius: 8px; transition: all 0.3s ease; border: 1px solid rgba(0, 0, 0, 0.1); } .gr-button-primary:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(0, 0, 0, 0.2); } footer { text-align: center; color: var(--text); opacity: 0.7; padding: 1rem; font-size: 0.9em; } .gr-markdown h3 { color: var(--text); margin-bottom: 1rem; } .registration-markdown, .chat-title h1 { color: var(--text); } """ return demo # Launch the interface if __name__ == "__main__": chatbot_interface().launch(share=True)