UKURIKIYEYEZU committed on
Commit
1d3914d
·
verified ·
1 Parent(s): aa3888c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -84
app.py CHANGED
@@ -39,46 +39,42 @@ for f, file in enumerate(data_files, 1):
39
 
40
 
41
 
 
 
 
 
42
 
43
- # def extract_text_from_pdf(pdf_path):
44
- # """Extracts text from a PDF file."""
45
- # try:
46
- # with open(pdf_path, "rb") as file:
47
- # reader = PyPDF2.PdfReader(file)
48
- # text = "".join(page.extract_text() or "" for page in reader.pages) # Handle None cases
49
- # return text
50
- # except Exception as e:
51
- # print(f"Error extracting text from {pdf_path}: {e}")
52
- # return ""
53
-
54
- # folder_path = "./"
55
- # # Initialize the list to hold the extracted text chunks
56
- # text_chunks = []
57
-
58
- # # Get all PDF filenames in the folder
59
- # filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
60
-
61
- # # Process each PDF file
62
- # for index, file in enumerate(filenames, 1):
63
- # print(f"\nProcessing file {index}: {file}")
64
- # pdf_path = os.path.join(folder_path, file)
65
-
66
- # try:
67
- # # Extract text from the PDF
68
- # extracted_text = extract_text_from_pdf(pdf_path)
69
-
70
- # if extracted_text.strip(): # Ensure extracted text is not just whitespace
71
- # # Split extracted text into chunks of 1000 characters
72
- # chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
73
-
74
- # # Append extracted chunks to the list
75
- # text_chunks.extend(chunks)
76
- # else:
77
- # print(f"No text found in the PDF: {file}")
78
-
79
- # except Exception as e:
80
- # print(f"Error reading the PDF {file}: {e}")
81
-
82
 
83
 
84
  from urllib.parse import urljoin, urlparse
@@ -235,7 +231,7 @@ def clean_body_content(html_content):
235
 
236
  data = []
237
  data.extend(context_data)
238
- # data.extend([item for item in text_chunks if item not in data])
239
  # data.extend([item for item in chunked_texts if item not in data])
240
 
241
 
@@ -246,7 +242,7 @@ from langchain_chroma import Chroma
246
 
247
 
248
  vectorstore = Chroma(
249
- collection_name="GBV_set",
250
  embedding_function=embed_model,
251
  )
252
 
@@ -561,49 +557,46 @@ def chatbot_interface():
561
  global template
562
 
563
  template = """
564
- You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your primary goal is to provide emotionally intelligent support while maintaining appropriate boundaries.
565
- You are a conversational AI. Respond directly and naturally to the user's input without displaying any system messages, backend processes, or 'thinking...' responses. Only provide the final response in a human-like and engaging manner.
566
- **Previous conversation:**
567
- {conversation_history}
568
-
569
- **Context information:**
570
- {context}
571
-
572
- **User's Question:** {question}
573
-
574
- When responding follow these guidelines:
575
-
576
- 1. **Emotional Intelligence**
577
- - Validate feelings without judgment (e.g., "It is completely understandable to feel this way")
578
- - Offer reassurance when appropriate, always centered on empowerment
579
- - Adjust your tone based on the emotional state conveyed
580
-
581
- 2. **Personalized Communication**
582
- - Avoid contractions (e.g., use I am instead of I'm)
583
- - Incorporate thoughtful pauses or reflective questions when the conversation involves difficult topics
584
- - Use selective emojis (😊, 🤗, ❤️) only when tone-appropriate and not during crisis discussions
585
- - Balance warmth with professionalism
586
-
587
- 3. **Conversation Management**
588
- - Refer to {conversation_history} to maintain continuity and avoid repetition
589
- - Keep responses concise unless greater detail is explicitly requested
590
- - Use clear paragraph breaks for readability
591
- - Prioritize immediate concerns before addressing secondary issues
592
-
593
- 4. **Information Delivery**
594
- - Extract only relevant information from {context} that directly addresses the question
595
- - Present information in accessible, non-technical language
596
- - Organize resource recommendations in order of relevance and accessibility
597
- - Provide links only when specifically requested, prefaced with clear descriptions
598
- - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
599
-
600
- 5. **Safety and Ethics**
601
- - Prioritize user safety in all responses
602
- - Never generate speculative content about their specific situation
603
- - Avoid phrases that could minimize experiences or create pressure
604
- - Include gentle reminders about professional help when discussing serious issues
605
-
606
- Your response should balance emotional support with practical guidance.
607
  """
608
 
609
 
 
39
 
40
 
41
 
42
+ import os
43
+ import PyPDF2
44
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
45
+ from langchain.schema import Document
46
 
47
+ def extract_text_from_pdf(pdf_path):
48
+ """Extract text from a PDF file."""
49
+ try:
50
+ with open(pdf_path, "rb") as file:
51
+ reader = PyPDF2.PdfReader(file)
52
+ return "".join(page.extract_text() or "" for page in reader.pages)
53
+ except Exception as e:
54
+ print(f"Error with {pdf_path}: {e}")
55
+ return ""
56
+
57
+ # Folder path and get PDF files
58
+ folder_path = "/content/drive/MyDrive/Ijwi_folder"
59
+ pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
60
+
61
+ # Process PDFs
62
+ documents = []
63
+ for file in pdf_files:
64
+ print(f"Processing: {file}")
65
+ pdf_path = os.path.join(folder_path, file)
66
+ text = extract_text_from_pdf(pdf_path)
67
+ if text:
68
+ documents.append(Document(page_content=text, metadata={"source": file}))
69
+
70
+ # Split into chunks
71
+ text_splitter = RecursiveCharacterTextSplitter(
72
+ separators=['\n\n', '\n', '.', ','],
73
+ chunk_size=500,
74
+ chunk_overlap=50
75
+ )
76
+ chunks = text_splitter.split_documents(documents)
77
+ text_only_chunks = [chunk.page_content for chunk in chunks]
 
 
 
 
 
 
 
 
78
 
79
 
80
  from urllib.parse import urljoin, urlparse
 
231
 
232
  data = []
233
  data.extend(context_data)
234
+ data.extend([item for item in text_only_chunks if item not in data])
235
  # data.extend([item for item in chunked_texts if item not in data])
236
 
237
 
 
242
 
243
 
244
  vectorstore = Chroma(
245
+ collection_name="GBV_data_set",
246
  embedding_function=embed_model,
247
  )
248
 
 
557
  global template
558
 
559
  template = """
560
+ You are a compassionate and supportive AI assistant specializing in helping individuals affected by Gender-Based Violence (GBV). Your responses must be based EXCLUSIVELY on the information provided in the context. Do not use any information outside what is specifically provided in the context section.
561
+
562
+ **Previous conversation:** {conversation_history}
563
+ **Context information:** {context}
564
+ **User's Question:** {question}
565
+
566
+ When responding follow these guidelines:
567
+
568
+ 1. **Strict Context Adherence**
569
+ - Only use information that appears in the provided {context}
570
+ - If the answer is not found in the context, state "I don't have that information in my available resources" rather than generating a response
571
+ - Do not use general knowledge or information not present in the context
572
+
573
+ 2. **Emotional Intelligence**
574
+ - Validate feelings without judgment based solely on context information
575
+ - Offer reassurance only using language and approaches mentioned in the context
576
+ - Adjust your tone based on the emotional state conveyed while staying true to context
577
+
578
+ 3. **Personalized Communication**
579
+ - Avoid contractions (e.g., use I am instead of I'm)
580
+ - Use language patterns and terminology found in the context
581
+ - Balance warmth with professionalism as demonstrated in the context
582
+
583
+ 4. **Conversation Management**
584
+ - Refer to {conversation_history} to maintain continuity and avoid repetition
585
+ - Keep responses concise unless greater detail is explicitly requested
586
+ - Use clear paragraph breaks for readability
587
+
588
+ 5. **Information Delivery**
589
+ - Extract only relevant information from {context} that directly addresses the question
590
+ - Present information in accessible, non-technical language
591
+ - When information is unavailable, respond with: "I don't have that specific information right now, {first_name}. Would it be helpful if I focus on [alternative support option]?"
592
+
593
+
594
+ 6. **Safety and Ethics**
595
+ - Only recommend resources or approaches explicitly mentioned in the context
596
+ - Do not generate any speculative content or advice not supported by the context
597
+ - If the context contains safety information, prioritize sharing that information
598
+
599
+ Your response must come entirely from the provided context, maintaining the supportive tone while never introducing information from outside the provided materials.
 
 
 
600
  """
601
 
602