import os
import PyPDF2
from PyPDF2 import PdfReader
import pandas as pd

## Embedding model
from langchain_huggingface import HuggingFaceEmbeddings

embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")

folder_path = "./"
context_data = []

# List all files in the folder
files = os.listdir(folder_path)

# Get list of CSV and Excel files
data_files = [f for f in files if f.endswith(('.csv', '.xlsx', '.xls'))]

# Process each file
for idx, file in enumerate(data_files, 1):
    print(f"\nProcessing file {idx}: {file}")
    file_path = os.path.join(folder_path, file)
    try:
        # Read the file based on its extension
        if file.endswith('.csv'):
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)

        # Extract non-empty values from the column at index 2 and append them
        context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
    except Exception as e:
        print(f"Error processing file {file}: {str(e)}")


import os
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error with {pdf_path}: {e}")
        return ""


pdf_files = [f for f in files if f.lower().endswith(".pdf")]

# Process PDFs
documents = []
for file in pdf_files:
    print(f"Processing: {file}")
    pdf_path = os.path.join(folder_path, file)
    text = extract_text_from_pdf(pdf_path)
    if text:
        documents.append(Document(page_content=text, metadata={"source": file}))

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=500,
    chunk_overlap=50
)
chunks = text_splitter.split_documents(documents)
text_only_chunks = [chunk.page_content for chunk in chunks]


from urllib.parse import urljoin, urlparse
import requests
from io import BytesIO
from bs4 import BeautifulSoup
from langchain_core.prompts import ChatPromptTemplate
import gradio as gr


def scrape_websites(base_urls):
    try:
        visited_links = set()  # To avoid revisiting the same link
        content_by_url = {}    # Store content from each URL

        for base_url in base_urls:
            if not base_url.strip():
                continue  # Skip empty or invalid URLs

            print(f"Scraping base URL: {base_url}")
            html_content = fetch_page_content(base_url)
            if html_content:
                cleaned_content = clean_body_content(html_content)
                content_by_url[base_url] = cleaned_content
                visited_links.add(base_url)

                # Extract and process all internal links
                soup = BeautifulSoup(html_content, "html.parser")
                links = extract_internal_links(base_url, soup)

                for link in links:
                    if link not in visited_links:
                        print(f"Scraping link: {link}")
                        page_content = fetch_page_content(link)
                        if page_content:
                            cleaned_content = clean_body_content(page_content)
                            content_by_url[link] = cleaned_content
                            visited_links.add(link)

                        # If the link is a PDF file, extract its content
                        if link.lower().endswith('.pdf'):
                            print(f"Extracting PDF content from: {link}")
                            pdf_content = extract_pdf_text(link)
                            if pdf_content:
                                content_by_url[link] = pdf_content

        return content_by_url
    except Exception as e:
        print(f"Error during scraping: {e}")
        return {}


def fetch_page_content(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def extract_internal_links(base_url, soup):
    links = set()
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        full_url = urljoin(base_url, href)
        if is_internal_link(base_url, full_url):
            links.add(full_url)
    return links


def is_internal_link(base_url, link_url):
    base_netloc = urlparse(base_url).netloc
    link_netloc = urlparse(link_url).netloc
    return base_netloc == link_netloc


def extract_pdf_text(pdf_url):
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()

        # Open the PDF from the response content
        with BytesIO(response.content) as file:
            reader = PdfReader(file)
            pdf_text = ""
            for page in reader.pages:
                pdf_text += page.extract_text() or ""

        return pdf_text if pdf_text else None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching PDF {pdf_url}: {e}")
        return None
    except Exception as e:
        print(f"Error reading PDF {pdf_url}: {e}")
        return None


def clean_body_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove scripts and styles
    for script_or_style in soup(["script", "style"]):
        script_or_style.extract()

    # Get text and clean up
    cleaned_content = soup.get_text(separator="\n")
    cleaned_content = "\n".join(
        line.strip() for line in cleaned_content.splitlines() if line.strip()
    )
    return cleaned_content


# if __name__ == "__main__":
#     website = [
#         # "https://www.rib.gov.rw/index.php?id=371",
#         "https://haguruka.org.rw/our-work/"
#     ]
#     all_content = scrape_websites(website)
#
#     # Temporary list to store (url, content) tuples
#     temp_list = []
#
#     # Process and store each URL with its content
#     for url, content in all_content.items():
#         temp_list.append((url, content))
#
#     processed_texts = []
#
#     # Process each element in the temporary list
#     for element in temp_list:
#         if isinstance(element, tuple):
#             url, content = element  # Unpack the tuple
#             processed_texts.append(f"url: {url}, content: {content}")
#         elif isinstance(element, str):
#             processed_texts.append(element)
#         else:
#             processed_texts.append(str(element))
#
#     def chunk_string(s, chunk_size=2000):
#         return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
#
#     # List to store the chunks
#     chunked_texts = []
#     for text in processed_texts:
#         chunked_texts.extend(chunk_string(text))


data = []
data.extend(context_data)
# data.extend([item for item in text_only_chunks if item not in data])
# data.extend([item for item in chunked_texts if item not in data])

# from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma

vectorstore = Chroma(
    collection_name="GBV_data_set",
    embedding_function=embed_model,
)
vectorstore.get().keys()

# Add the collected data to the vector store
vectorstore.add_texts(data)
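
# Optional sanity check (illustrative sketch, not part of the original pipeline):
# query the freshly populated collection to confirm that relevant chunks come back.
# The query string below is only an example.
# sample_hits = vectorstore.similarity_search("What support is available for GBV survivors?", k=3)
# for hit in sample_hits:
#     print(hit.page_content[:150])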
api = os.environ.get('V1')

from openai import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import gradio as gr
from typing import Iterator
import time

# Prompt template for the GBV support chatbot
template = ("""
You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.

You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner.

When responding, follow these guidelines:

1. **Emotional Intelligence**
   - Validate feelings without judgment (e.g., "It is completely understandable to feel this way")
   - Offer reassurance when appropriate, always centered on empowerment
   - Adjust your tone based on the emotional state conveyed

2. **Personalized Communication**
   - Avoid contractions (e.g., use "I am" instead of "I'm")
   - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
   - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
   - Balance warmth with professionalism

3. **Conversation Management**
   - Refer to {conversation_history} to maintain continuity and avoid repetition
   - Keep responses concise unless greater detail is explicitly requested
   - Use clear paragraph breaks for readability
   - Prioritize immediate concerns before addressing secondary issues

4. **Information Delivery**
   - Extract only relevant information from {context} that directly addresses the question
   - Present information in accessible, non-technical language
   - Organize resource recommendations in order of relevance and accessibility
   - Provide links [URL] only when specifically requested, prefaced with clear descriptions
   - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"

5. **Safety and Ethics**
   - Prioritize user safety in all responses
   - Never generate speculative content about their specific situation
   - Avoid phrases that could minimize experiences or create pressure
   - Include gentle reminders about professional help when discussing serious issues

Your response should balance emotional support with practical guidance.

**Context:** {context}

**User's Question:** {question}

**Your Response:**
""")

rag_prompt = PromptTemplate.from_template(template)

retriever = vectorstore.as_retriever()


import requests

API_TOKEN = os.environ.get('Token')
model_name = "facebook/nllb-200-distilled-600M"
url = f"https://api-inference.huggingface.co/models/{model_name}"
headers = {"Authorization": f"Bearer {API_TOKEN}"}


def translate_text(text, src_lang, tgt_lang):
    """Translate text using the Hugging Face Inference API."""
    response = requests.post(
        url,
        headers=headers,
        json={
            "inputs": text,
            "parameters": {
                "src_lang": src_lang,
                "tgt_lang": tgt_lang
            }
        }
    )
    if response.status_code == 200:
        result = response.json()
        if isinstance(result, list) and len(result) > 0:
            return result[0]['translation_text']
        return result['translation_text']
    else:
        print(f"Translation error: {response.status_code}, {response.text}")
        return text  # Return the original text if translation fails


class OpenRouterLLM:
    def __init__(self, key: str):
        try:
            self.client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=key
            )
            self.headers = {
                "HTTP-Referer": "http://localhost:3000",
                "X-Title": "Local Development"
            }
        except Exception as e:
            print(f"Initialization error: {e}")
            raise

    def stream(self, prompt: str) -> Iterator[str]:
        try:
            completion = self.client.chat.completions.create(
                model="deepseek/deepseek-r1-distill-llama-70b:free",
                # model="meta-llama/llama-3.3-70b-instruct:free",
                # model="qwen/qwq-32b:free",
                messages=[{"role": "user", "content": prompt}],
                stream=True
            )
            for chunk in completion:
                delta = chunk.choices[0].delta
                if hasattr(delta, "content") and delta.content:
                    yield delta.content
        except Exception as e:
            yield f"Streaming error: {str(e)}"
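
# Illustrative usage sketch (assumes the 'V1' environment variable above holds a valid OpenRouter API key):
# llm = OpenRouterLLM(api)
# for token in llm.stream("Say hello in one short sentence."):
#     print(token, end="")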
class UserSession:
    def __init__(self, llm: OpenRouterLLM):  # Accept an instance of OpenRouterLLM
        self.current_user = None
        self.welcome_message = None
        self.conversation_history = []  # Conversation history storage
        self.llm = llm  # Store the LLM instance

    def set_user(self, user_info):
        self.current_user = user_info
        self.set_welcome_message(user_info.get("Nickname", "Guest"))
        # Initialize the conversation history with the welcome message
        welcome = self.get_welcome_message()
        self.conversation_history = [
            {"role": "assistant", "content": welcome},
        ]

    def get_user(self):
        return self.current_user

    def set_welcome_message(self, Nickname, src_lang="eng_Latn", tgt_lang="kin_Latn"):
        """Set a dynamic welcome message using the OpenRouterLLM."""
        prompt = (
            f"Create a very brief welcome message for {Nickname}. "
            f"The message should: "
            f"1. Welcome {Nickname} warmly and professionally. "
            f"2. Emphasize that this is a safe and trusted space. "
            f"3. Highlight specialized support for gender-based violence (GBV) and legal assistance. "
            f"4. Use a tone that is warm, reassuring, and professional. "
            f"5. Keep the message concise and impactful."
        )

        # Use the OpenRouterLLM to generate the message
        welcome = "".join(self.llm.stream(prompt))  # Stream and concatenate the response
        welcome_text = translate_text(welcome, src_lang, tgt_lang)

        # Format the message with HTML styling
        self.welcome_message = (
            f"