Spaces:

UKURIKIYEYEZU
/

Help_chatbot

Runtime error

App Files Files Community

UKURIKIYEYEZU committed on Mar 21

Commit

1d3914d

verified ·

1 Parent(s): aa3888c

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -84

app.py CHANGED Viewed

@@ -39,46 +39,42 @@ for f, file in enumerate(data_files, 1):
-# def extract_text_from_pdf(pdf_path):
-#     """Extracts text from a PDF file."""
-#     try:
-#         with open(pdf_path, "rb") as file:
-#             reader = PyPDF2.PdfReader(file)
-#             text = "".join(page.extract_text() or "" for page in reader.pages)  # Handle None cases
-#             return text
-#     except Exception as e:
-#         print(f"Error extracting text from {pdf_path}: {e}")
-#         return ""
-# folder_path = "./"
-# # Initialize the list to hold the extracted text chunks
-# text_chunks = []
-# # Get all PDF filenames in the folder
-# filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
-# # Process each PDF file
-# for index, file in enumerate(filenames, 1):
-#     print(f"\nProcessing file {index}: {file}")
-#     pdf_path = os.path.join(folder_path, file)
-#     try:
-#         # Extract text from the PDF
-#         extracted_text = extract_text_from_pdf(pdf_path)
-#         if extracted_text.strip():  # Ensure extracted text is not just whitespace
-#             # Split extracted text into chunks of 1000 characters
-#             chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
-#             # Append extracted chunks to the list
-#             text_chunks.extend(chunks)
-#         else:
-#             print(f"No text found in the PDF: {file}")
-#     except Exception as e:
-#         print(f"Error reading the PDF {file}: {e}")
 from urllib.parse import urljoin, urlparse
@@ -235,7 +231,7 @@ def clean_body_content(html_content):
 data = []
 data.extend(context_data)
-# data.extend([item for item in text_chunks if item not in data])
 # data.extend([item for item in chunked_texts if item not in data])
@@ -246,7 +242,7 @@ from langchain_chroma import Chroma
 vectorstore = Chroma(
-    collection_name="GBV_set",
     embedding_function=embed_model,
 )
@@ -561,49 +557,46 @@ def chatbot_interface():
     global template
     template = """
-    You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
-    You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner.
-    **Previous conversation:**
-    {conversation_history}
-    **Context information:**
-    {context}
-    **User's Question:** {question}
-    When responding follow these guidelines:
-    1. **Emotional Intelligence**
-       - Validate feelings without judgment (e.g., "It is completely understandable to feel this way")
-       - Offer reassurance when appropriate, always centered on empowerment
-       - Adjust your tone based on the emotional state conveyed
-    2. **Personalized Communication**
-       - Avoid contractions (e.g., use I am instead of I'm)
-       - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
-       - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
-       - Balance warmth with professionalism
-    3. **Conversation Management**
-       - Refer to {conversation_history} to maintain continuity and avoid repetition
-       - Keep responses concise unless greater detail is explicitly requested
-       - Use clear paragraph breaks for readability
-       - Prioritize immediate concerns before addressing secondary issues
-    4. **Information Delivery**
-       - Extract only relevant information from {context} that directly addresses the question
-       - Present information in accessible, non-technical language
-       - Organize resource recommendations in order of relevance and accessibility
-       - Provide links only when specifically requested, prefaced with clear descriptions
-       - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
-    5. **Safety and Ethics**
-       - Prioritize user safety in all responses
-       - Never generate speculative content about their specific situation
-       - Avoid phrases that could minimize experiences or create pressure
-       - Include gentle reminders about professional help when discussing serious issues
-    Your response should balance emotional support with practical guidance.
     """

+import os
+import PyPDF2
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+def extract_text_from_pdf(pdf_path):  # pdf_path is handed straight to open()
+    """Extract text from a PDF file."""
+    try:
+        with open(pdf_path, "rb") as file:  # binary mode; the open handle is what PyPDF2.PdfReader receives
+            reader = PyPDF2.PdfReader(file)
+            return "".join(page.extract_text() or "" for page in reader.pages)  # `or ""` substitutes an empty string when a page yields a falsy result (e.g. None); page texts are joined with no separator
+    except Exception as e:  # any failure while opening or reading is printed instead of raised
+        print(f"Error with {pdf_path}: {e}")
+        return ""  # callers receive an empty string on failure
+# Folder to scan, and the names inside it that end in ".pdf" (case-insensitive)
+folder_path = "/content/drive/MyDrive/Ijwi_folder"  # NOTE(review): looks like a Google Colab Drive mount path — confirm it exists where app.py runs
+pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]  # os.listdir raises if folder_path does not exist
+# Read each PDF and wrap its text in a Document tagged with the source file name
+documents = []
+for file in pdf_files:
+    print(f"Processing: {file}")
+    pdf_path = os.path.join(folder_path, file)
+    text = extract_text_from_pdf(pdf_path)
+    if text:  # skip PDFs that produced no text (extract_text_from_pdf returns "" on error)
+        documents.append(Document(page_content=text, metadata={"source": file}))
+# Split the documents into overlapping chunks (chunk_size=500, chunk_overlap=50; units presumably characters — TODO confirm against the splitter docs)
+text_splitter = RecursiveCharacterTextSplitter(
+    separators=['\n\n', '\n', '.', ','],
+    chunk_size=500,
+    chunk_overlap=50
+)
+chunks = text_splitter.split_documents(documents)
+text_only_chunks = [chunk.page_content for chunk in chunks]  # keep only each chunk's text; the "source" metadata is not carried along
 from urllib.parse import urljoin, urlparse
 data = []
 data.extend(context_data)  # context_data is defined outside this view
+ data.extend([item for item in text_only_chunks if item not in data])  # the list is built before extend runs, so this only skips chunks already present from context_data; repeats within text_only_chunks are kept. NOTE(review): this added line starts with a space while its neighbours sit at column 0 — if app.py really has that, Python raises IndentationError; confirm
 # data.extend([item for item in chunked_texts if item not in data])
 vectorstore = Chroma(
+    collection_name="GBV_data_set",  # the removed side of this diff used "GBV_set"
     embedding_function=embed_model,
 )
     global template
     template = """
+        You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Do not use any information outside what is specifically provided in the context section.
+        **Previous conversation:** {conversation_history}
+        **Context information:** {context}
+        **User's Question:** {question}
+        When responding follow these guidelines:
+        1. **Strict Context Adherence**
+           - Only use information that appears in the provided {context}
+           - If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response
+           - Do not use general knowledge or information not present in the context
+        2. **Emotional Intelligence**
+           - Validate feelings without judgment based solely on context information
+           - Offer reassurance only using language and approaches mentioned in the context
+           - Adjust your tone based on the emotional state conveyed while staying true to context
+        3. **Personalized Communication**
+           - Avoid contractions (e.g., use I am instead of I'm)
+           - Use language patterns and terminology found in the context
+           - Balance warmth with professionalism as demonstrated in the context
+        4. **Conversation Management**
+           - Refer to {conversation_history} to maintain continuity and avoid repetition
+           - Keep responses concise unless greater detail is explicitly requested
+           - Use clear paragraph breaks for readability
+        5. **Information Delivery**
+           - Extract only relevant information from {context} that directly addresses the question
+           - Present information in accessible, non-technical language
+           - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
+        6. **Safety and Ethics**
+           - Only recommend resources or approaches explicitly mentioned in the context
+           - Do not generate any speculative content or advice not supported by the context
+           - If the context contains safety information, prioritize sharing that information
+        Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials.
     """