Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -39,46 +39,42 @@ for f, file in enumerate(data_files, 1):
|
|
39 |
|
40 |
|
41 |
|
|
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
#
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
# # Append extracted chunks to the list
|
75 |
-
# text_chunks.extend(chunks)
|
76 |
-
# else:
|
77 |
-
# print(f"No text found in the PDF: {file}")
|
78 |
-
|
79 |
-
# except Exception as e:
|
80 |
-
# print(f"Error reading the PDF {file}: {e}")
|
81 |
-
|
82 |
|
83 |
|
84 |
from urllib.parse import urljoin, urlparse
|
@@ -235,7 +231,7 @@ def clean_body_content(html_content):
|
|
235 |
|
236 |
data = []
|
237 |
data.extend(context_data)
|
238 |
-
|
239 |
# data.extend([item for item in chunked_texts if item not in data])
|
240 |
|
241 |
|
@@ -246,7 +242,7 @@ from langchain_chroma import Chroma
|
|
246 |
|
247 |
|
248 |
vectorstore = Chroma(
|
249 |
-
collection_name="
|
250 |
embedding_function=embed_model,
|
251 |
)
|
252 |
|
@@ -561,49 +557,46 @@ def chatbot_interface():
|
|
561 |
global template
|
562 |
|
563 |
template = """
|
564 |
-
|
565 |
-
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
- Include gentle reminders about professional help when discussing serious issues
|
605 |
-
|
606 |
-
Your response should balance emotional support with practical guidance.
|
607 |
"""
|
608 |
|
609 |
|
|
|
39 |
|
40 |
|
41 |
|
42 |
+
import os
|
43 |
+
import PyPDF2
|
44 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
45 |
+
from langchain.schema import Document
|
46 |
|
47 |
+
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that produced text, joined with newlines.
        An empty string is returned when no page yields text or when the
        file cannot be opened or parsed.
    """
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            page_texts = (page.extract_text() for page in reader.pages)
            # Join with a newline (not "") so the last word of one page is
            # not fused with the first word of the next; pages that yield
            # no text are skipped, so a PDF without text still gives "".
            return "\n".join(text for text in page_texts if text)
    except Exception as e:
        # Deliberate best-effort: report the failure and return "" so the
        # caller can skip this file instead of aborting the whole run.
        print(f"Error with {pdf_path}: {e}")
        return ""
|
56 |
+
|
57 |
+
# Folder that holds the source PDFs.
# NOTE(review): this looks like a Colab Google Drive mount path - confirm it
# exists in the environment where this script actually runs.
folder_path = "/content/drive/MyDrive/Ijwi_folder"

# os.listdir() raises FileNotFoundError for a missing folder, which would
# abort the whole script at import time; fall back to "no PDFs" instead.
if os.path.isdir(folder_path):
    # Sorted so documents (and therefore chunks) come out in a stable order;
    # os.listdir() returns entries in arbitrary order.
    pdf_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")
    )
else:
    print(f"PDF folder not found, skipping PDF ingestion: {folder_path}")
    pdf_files = []

# Read each PDF into one Document, tagged with its file name as the source.
documents = []
for file in pdf_files:
    print(f"Processing: {file}")
    pdf_path = os.path.join(folder_path, file)
    text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf returns "" on failure or when nothing was
    # extracted; such files are left out.
    if text:
        documents.append(Document(page_content=text, metadata={"source": file}))

# Split the documents into chunks of at most 500 characters with a
# 50-character overlap, trying the separators in the listed order.
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)

# Plain-text view of the chunks (metadata dropped) for later use in the file.
text_only_chunks = [chunk.page_content for chunk in chunks]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
|
80 |
from urllib.parse import urljoin, urlparse
|
|
|
231 |
|
232 |
data = []
|
233 |
data.extend(context_data)
|
234 |
+
data.extend([item for item in text_only_chunks if item not in data])
|
235 |
# data.extend([item for item in chunked_texts if item not in data])
|
236 |
|
237 |
|
|
|
242 |
|
243 |
|
244 |
vectorstore = Chroma(
|
245 |
+
collection_name="GBV_data_set",
|
246 |
embedding_function=embed_model,
|
247 |
)
|
248 |
|
|
|
557 |
global template
|
558 |
|
559 |
template = """
|
560 |
+
You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Do not use any information outside what is specifically provided in the context section.
|
561 |
+
|
562 |
+
**Previous conversation:** {conversation_history}
|
563 |
+
**Context information:** {context}
|
564 |
+
**User's Question:** {question}
|
565 |
+
|
566 |
+
When responding follow these guidelines:
|
567 |
+
|
568 |
+
1. **Strict Context Adherence**
|
569 |
+
- Only use information that appears in the provided {context}
|
570 |
+
- If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response
|
571 |
+
- Do not use general knowledge or information not present in the context
|
572 |
+
|
573 |
+
2. **Emotional Intelligence**
|
574 |
+
- Validate feelings without judgment based solely on context information
|
575 |
+
- Offer reassurance only using language and approaches mentioned in the context
|
576 |
+
- Adjust your tone based on the emotional state conveyed while staying true to context
|
577 |
+
|
578 |
+
3. **Personalized Communication**
|
579 |
+
- Avoid contractions (e.g., use I am instead of I'm)
|
580 |
+
- Use language patterns and terminology found in the context
|
581 |
+
- Balance warmth with professionalism as demonstrated in the context
|
582 |
+
|
583 |
+
4. **Conversation Management**
|
584 |
+
- Refer to {conversation_history} to maintain continuity and avoid repetition
|
585 |
+
- Keep responses concise unless greater detail is explicitly requested
|
586 |
+
- Use clear paragraph breaks for readability
|
587 |
+
|
588 |
+
5. **Information Delivery**
|
589 |
+
- Extract only relevant information from {context} that directly addresses the question
|
590 |
+
- Present information in accessible, non-technical language
|
591 |
+
- When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
|
592 |
+
|
593 |
+
|
594 |
+
6. **Safety and Ethics**
|
595 |
+
- Only recommend resources or approaches explicitly mentioned in the context
|
596 |
+
- Do not generate any speculative content or advice not supported by the context
|
597 |
+
- If the context contains safety information, prioritize sharing that information
|
598 |
+
|
599 |
+
Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials.
|
|
|
|
|
|
|
600 |
"""
|
601 |
|
602 |
|