Spaces:

UKURIKIYEYEZU
/

Help_chatbot

Runtime error

App Files Files Community

UKURIKIYEYEZU committed on Feb 27

Commit

5e4a27f

verified ·

1 Parent(s): bc75c0d

Update app.py

Browse files

Files changed (1) hide show

app.py +379 -60

app.py CHANGED Viewed

@@ -1,64 +1,383 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import os
+import PyPDF2
+from google.colab import userdata
+from PyPDF2 import PdfReader
+## Embedding model!
+from langchain_huggingface import HuggingFaceEmbeddings
+embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
+import pandas as pd
+# Folder holding the tabular source files (CSV / Excel).
+folder_path = "/content/drive/MyDrive/Ijwi_folder"
+# Strings collected from the tabular files
+context_data = []
+# NOTE(review): the path looks like a Colab Google Drive mount, which will not
+# exist in other environments. Fall back to an empty listing instead of
+# letting os.listdir() crash the whole module at import time.
+if os.path.isdir(folder_path):
+    files = os.listdir(folder_path)
+else:
+    print(f"Data folder not found, skipping tabular files: {folder_path}")
+    files = []
+# Keep only CSV and Excel files
+data_files = [f for f in files if f.endswith(('.csv', '.xlsx', '.xls'))]
+# Process each file (the counter starts at 1 and is used for display only)
+for f, file in enumerate(data_files, 1):
+    print(f"\nProcessing file {f}: {file}")
+    file_path = os.path.join(folder_path, file)
+    try:
+        # Read the file based on its extension
+        if file.endswith('.csv'):
+            df = pd.read_csv(file_path)
+        else:
+            df = pd.read_excel(file_path)
+        # Take the non-empty values of the column at position 2 (the third
+        # column) as strings. A file with fewer columns raises here and is
+        # reported by the handler below.
+        context_data.extend(df.iloc[:, 2].dropna().astype(str).tolist())
+    except Exception as e:
+        # Best effort: one unreadable file must not stop the remaining files
+        print(f"Error processing file {file}: {str(e)}")
+def extract_text_from_pdf(pdf_path):
+    """Return the concatenated text of every page of the PDF at *pdf_path*.
+
+    Any failure (file cannot be opened, PDF cannot be parsed) is printed and
+    an empty string is returned instead of raising.
+    """
+    try:
+        with open(pdf_path, "rb") as pdf_file:
+            pages = PyPDF2.PdfReader(pdf_file).pages
+            # A falsy extraction result (e.g. None) counts as empty text
+            page_texts = [page.extract_text() or "" for page in pages]
+            return "".join(page_texts)
+    except Exception as err:
+        print(f"Error extracting text from {pdf_path}: {err}")
+        return ""
+# Folder containing the PDFs
+folder_path ="/content/drive/MyDrive/Ijwi_folder"  # Update with your actual folder path
+# Initialize the list to hold the extracted text chunks
+text_chunks = []
+# Get all PDF filenames in the folder (case-insensitive extension match).
+# A missing folder yields no files instead of crashing at import time.
+if os.path.isdir(folder_path):
+    filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
+else:
+    print(f"PDF folder not found, skipping PDFs: {folder_path}")
+    filenames = []
+# Process each PDF file
+for index, file in enumerate(filenames, 1):
+    print(f"\nProcessing file {index}: {file}")
+    pdf_path = os.path.join(folder_path, file)
+    try:
+        # Extract text from the PDF
+        extracted_text = extract_text_from_pdf(pdf_path)
+        if extracted_text.strip():  # Ensure extracted text is not just whitespace
+            # Split extracted text into chunks of 2000 characters
+            chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
+            # Append extracted chunks to the list
+            text_chunks.extend(chunks)
+        else:
+            print(f"No text found in the PDF: {file}")
+    except Exception as e:
+        # Best effort: a failing PDF must not stop the remaining files
+        print(f"Error reading the PDF {file}: {e}")
+from urllib.parse import urljoin, urlparse
+import requests
+from io import BytesIO
+from bs4 import BeautifulSoup
+from langchain_core.prompts import ChatPromptTemplate
 import gradio as gr
+def scrape_websites(base_urls):
+    """Scrape each base URL plus the same-site pages it links to.
+
+    Args:
+        base_urls: iterable of URL strings; blank entries are skipped.
+
+    Returns:
+        dict mapping every successfully processed URL to its text: cleaned
+        page text for HTML pages, extracted text for links ending in ".pdf".
+        An empty dict is returned when an unexpected error aborts the run.
+    """
+    try:
+        visited_links = set()  # To avoid revisiting the same link
+        content_by_url = {}  # Store content from each URL
+        for base_url in base_urls:
+            if not base_url.strip():
+                continue  # Skip empty or whitespace-only URLs
+            print(f"Scraping base URL: {base_url}")
+            html_content = fetch_page_content(base_url)
+            if not html_content:
+                continue
+            content_by_url[base_url] = clean_body_content(html_content)
+            visited_links.add(base_url)
+            # Extract and process all internal links (one level deep)
+            soup = BeautifulSoup(html_content, "html.parser")
+            for link in extract_internal_links(base_url, soup):
+                if link in visited_links:
+                    continue
+                if link.lower().endswith('.pdf'):
+                    # PDFs are binary: go straight to the PDF extractor rather
+                    # than first downloading them as HTML text, which stored
+                    # decoded binary as page content and fetched the file twice.
+                    print(f"Extracting PDF content from: {link}")
+                    pdf_content = extract_pdf_text(link)
+                    if pdf_content:
+                        content_by_url[link] = pdf_content
+                        visited_links.add(link)
+                    continue
+                print(f"Scraping link: {link}")
+                page_content = fetch_page_content(link)
+                if page_content:
+                    content_by_url[link] = clean_body_content(page_content)
+                    visited_links.add(link)
+        return content_by_url
+    except Exception as e:
+        print(f"Error during scraping: {e}")
+        return {}
+def fetch_page_content(url):
+    """Download *url* with a 10-second timeout and return the body as text.
+
+    Returns None, after printing the error, when the request raises a
+    requests RequestException (including the one raised by
+    raise_for_status()).
+    """
+    try:
+        reply = requests.get(url, timeout=10)
+        reply.raise_for_status()
+    except requests.exceptions.RequestException as err:
+        print(f"Error fetching {url}: {err}")
+        return None
+    else:
+        return reply.text
+def extract_internal_links(base_url, soup):
+    """Return the set of absolute same-site URLs linked from *soup*.
+
+    Every <a> tag with an href is resolved against *base_url*; only the
+    results accepted by is_internal_link() are kept.
+    """
+    absolute_urls = (
+        urljoin(base_url, tag["href"]) for tag in soup.find_all("a", href=True)
+    )
+    return {target for target in absolute_urls if is_internal_link(base_url, target)}
+def is_internal_link(base_url, link_url):
+    """Return True when both URLs share the same network location (netloc)."""
+    base_host, link_host = (urlparse(u).netloc for u in (base_url, link_url))
+    return base_host == link_host
+def extract_pdf_text(pdf_url):
+    """Download the PDF at *pdf_url* and return its extracted text.
+
+    Returns:
+        The concatenated text of all pages, or None when the download fails,
+        the PDF cannot be read, or no text could be extracted.
+    """
+    try:
+        # Same 10-second timeout as fetch_page_content, so a stalled server
+        # cannot hang the whole scrape.
+        response = requests.get(pdf_url, timeout=10)
+        response.raise_for_status()
+        # Open the PDF from the in-memory response content
+        with BytesIO(response.content) as file:
+            reader = PdfReader(file)
+            # Treat a falsy page result (e.g. None) as empty text so a single
+            # page without text does not discard the whole document.
+            pdf_text = "".join(page.extract_text() or "" for page in reader.pages)
+        return pdf_text if pdf_text else None
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching PDF {pdf_url}: {e}")
+        return None
+    except Exception as e:
+        print(f"Error reading PDF {pdf_url}: {e}")
+        return None
+def clean_body_content(html_content):
+    """Return the text of *html_content* with scripts, styles and blank lines removed.
+
+    Each remaining line is stripped of surrounding whitespace and the lines
+    are joined with newline characters.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Drop <script> and <style> elements before extracting text
+    for tag in soup(["script", "style"]):
+        tag.extract()
+    raw_text = soup.get_text(separator="\n")
+    kept_lines = []
+    for line in raw_text.splitlines():
+        stripped = line.strip()
+        if stripped:
+            kept_lines.append(stripped)
+    return "\n".join(kept_lines)
 if __name__ == "__main__":
+    website = [
+               "https://www.rib.gov.rw/index.php?id=371",
+               "https://haguruka.org.rw/our-work/"
+               ]
+    all_content = scrape_websites(website)
+    # Temporary list to store (url, content) tuples
+    temp_list = []
+    # Process and store each URL with its content
+    for url, content in all_content.items():
+        temp_list.append((url, content))
+processed_texts = []
+# Process each element in the temporary list
+for element in temp_list:
+    if isinstance(element, tuple):
+        url, content = element  # Unpack the tuple
+        processed_texts.append(f"url: {url}, content: {content}")
+    elif isinstance(element, str):
+        processed_texts.append(element)
+    else:
+        processed_texts.append(str(element))
+def chunk_string(s, chunk_size=2000):
+    """Split *s* into consecutive pieces of at most *chunk_size* characters.
+
+    The final piece may be shorter; an empty string yields an empty list.
+    """
+    pieces = []
+    for start in range(0, len(s), chunk_size):
+        pieces.append(s[start:start + chunk_size])
+    return pieces
+# List to store the chunks
+chunked_texts = []
+for text in processed_texts:
+    chunked_texts.extend(chunk_string(text))
+# Merge all sources, skipping entries already collected from an earlier
+# source. A set snapshot gives O(1) membership tests in place of scanning the
+# growing list for every item; the resulting order and content are unchanged.
+data = []
+data.extend(context_data)
+_seen = set(data)
+data.extend([item for item in text_chunks if item not in _seen])
+_seen.update(data)
+data.extend([item for item in chunked_texts if item not in _seen])
+from langchain_community.vectorstores import Chroma
+vectorstore = Chroma(
+    collection_name="GBV_dataset",
+    embedding_function=embed_model,
+)
+# NOTE(review): the result of this call is discarded; looks like notebook residue
+vectorstore.get().keys()
+# Add data to the vector store. Skip the call when nothing was collected;
+# presumably an empty batch is rejected by the store - TODO confirm.
+if data:
+    vectorstore.add_texts(data)
+from openai import OpenAI
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+import gradio as gr
+from typing import Iterator
+import time
+# Prompt template with user personalization and a welcome message.
+# Placeholders the template expects:
+#   {context}          - reference text the answer must be based on
+#   {question}         - the user's current message
+#   {first_name}       - name used to address the user
+#   {welcome_message}  - inserted under "Welcome Message" near the end
+#   {is_first_message} - inserted under "Is First Message" near the end
+template = ("""
+    You are a friendly and intelligent chatbot designed to assist users in a conversational and human-like manner. Your goal is to provide accurate, helpful, and engaging responses based on the provided context: {context}. Follow these guidelines:
+    1. **Contextual Interaction**
+      - Begin with a warm and empathetic welcome message
+      - Extract precise details from provided context: {context}
+      - Respond directly to user's question: {question}
+      - Remember the user's name is {first_name} and address them by name occasionally not always
+    2. **Communication Guidelines**
+      - Maintain warm, conversational tone
+      - Use occasional emojis for engagement
+      - Provide clear, concise information
+    3. **Response Strategies**
+      - Greet users naturally and ask about their wellbeing (e.g., "Hello {first_name}! 😊 How are you feeling today?", "Welcome, {first_name}! 😊 You're in a safe and caring space. What's on your mind today?")
+      - Always start with a check-in about the user's wellbeing or current situation
+      - Deliver only relevant information
+      - Avoid generating content beyond context
+      - Handle missing information transparently
+    4. **No Extra Content**
+      - If no information matches user's request:
+        * Respond politely: "I don't have that information at the moment, {first_name}. 😊"
+        * Offer alternative assistance options
+      - Strictly avoid generating unsupported content
+      - Prevent information padding or speculation
+    5. **Extracting Relevant Links**
+       - If the user asks for a link related to their request `{question}`, extract the most relevant URL from `{context}` and provide it directly.
+       - Example response:
+         - "Here is the link you requested, {first_name}: [URL]"
+    6. **Real-Time Awareness**
+      - Acknowledge current context when appropriate
+      - Stay focused on user's immediate needs
+      - If this is the first message, always ask how the user is feeling and what they would like help with today
+    **Context:** {context}
+    **User's Question:** {question}
+    **Welcome Message:** {welcome_message}
+    **Is First Message:** {is_first_message}
+    **Your Response:**
+""")
+class OpenRouterLLM:
+    """Streaming chat client that talks to OpenRouter through the OpenAI SDK."""
+    def __init__(self, api_key: str):
+        """Create the OpenAI client pointed at the OpenRouter endpoint.
+
+        Args:
+            api_key: API key passed to the OpenAI client.
+        """
+        self.client = OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            # Use the constructor argument; the previous `api` name is not
+            # defined anywhere in the visible code and ignored the parameter.
+            api_key=api_key
+        )
+        # Extra headers sent with every completion request
+        self.headers = {
+            "HTTP-Referer": "http://localhost:3000",
+            "X-Title": "Local Development"
+        }
+    def stream(self, prompt: str) -> Iterator[str]:
+        """Yield the response to *prompt* piece by piece as it is streamed.
+
+        The prompt is sent as a single user message. Any exception is not
+        raised to the caller; it is yielded as one "Error: ..." string.
+        """
+        try:
+            completion = self.client.chat.completions.create(
+                extra_headers=self.headers,
+                model="deepseek/deepseek-r1-distill-llama-70b:free",
+                # Alternative model: google/gemini-2.0-flash-thinking-exp:free
+                messages=[{"role": "user", "content": prompt}],
+                stream=True
+            )
+            for chunk in completion:
+                # Skip chunks that carry no choices instead of failing on [0]
+                if not chunk.choices:
+                    continue
+                token = chunk.choices[0].delta.content
+                if token is not None:
+                    yield token
+        except Exception as e:
+            yield f"Error: {str(e)}"
+class UserSession:
+    """Holds the current user and whether their first message is still pending."""
+    def __init__(self):
+        # A fresh session has no user and is waiting for its first message
+        self.set_user(None)
+    def set_user(self, user_info):
+        """Store *user_info* as the current user and reset the first-message flag."""
+        self.is_first_message = True
+        self.current_user = user_info
+    def get_user(self):
+        """Return the stored user info (None until a user is set)."""
+        return self.current_user
+    def mark_message_sent(self):
+        """Record that the first message has been handled."""
+        self.is_first_message = False
+    def is_first(self):
+        """Return True while no message has been marked as sent for this user."""
+        return self.is_first_message
+# Initialize session and LLM
+user_session = UserSession()