UKURIKIYEYEZU committed
Commit b24f7f9 (verified)
1 Parent(s): d26ee43

Update app.py

Files changed (1)
  1. app.py +60 -60
app.py CHANGED
@@ -39,44 +39,44 @@ for f, file in enumerate(data_files, 1):



-def extract_text_from_pdf(pdf_path):
-    """Extracts text from a PDF file."""
-    try:
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            text = "".join(page.extract_text() or "" for page in reader.pages) # Handle None cases
-        return text
-    except Exception as e:
-        print(f"Error extracting text from {pdf_path}: {e}")
-        return ""
+# def extract_text_from_pdf(pdf_path):
+# """Extracts text from a PDF file."""
+# try:
+# with open(pdf_path, "rb") as file:
+# reader = PyPDF2.PdfReader(file)
+# text = "".join(page.extract_text() or "" for page in reader.pages) # Handle None cases
+# return text
+# except Exception as e:
+# print(f"Error extracting text from {pdf_path}: {e}")
+# return ""

-folder_path = "./"
-# Initialize the list to hold the extracted text chunks
-text_chunks = []
+# folder_path = "./"
+# # Initialize the list to hold the extracted text chunks
+# text_chunks = []

-# Get all PDF filenames in the folder
-filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
+# # Get all PDF filenames in the folder
+# filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

-# Process each PDF file
-for index, file in enumerate(filenames, 1):
-    print(f"\nProcessing file {index}: {file}")
-    pdf_path = os.path.join(folder_path, file)
+# # Process each PDF file
+# for index, file in enumerate(filenames, 1):
+# print(f"\nProcessing file {index}: {file}")
+# pdf_path = os.path.join(folder_path, file)

-    try:
-        # Extract text from the PDF
-        extracted_text = extract_text_from_pdf(pdf_path)
+# try:
+# # Extract text from the PDF
+# extracted_text = extract_text_from_pdf(pdf_path)

-        if extracted_text.strip(): # Ensure extracted text is not just whitespace
-            # Split extracted text into chunks of 1000 characters
-            chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
+# if extracted_text.strip(): # Ensure extracted text is not just whitespace
+# # Split extracted text into chunks of 1000 characters
+# chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]

-            # Append extracted chunks to the list
-            text_chunks.extend(chunks)
-        else:
-            print(f"No text found in the PDF: {file}")
+# # Append extracted chunks to the list
+# text_chunks.extend(chunks)
+# else:
+# print(f"No text found in the PDF: {file}")

-    except Exception as e:
-        print(f"Error reading the PDF {file}: {e}")
+# except Exception as e:
+# print(f"Error reading the PDF {file}: {e}")



@@ -195,47 +195,47 @@ def clean_body_content(html_content):



-if __name__ == "__main__":
-    website = [
-        #"https://www.rib.gov.rw/index.php?id=371",
-        "https://haguruka.org.rw/our-work/"
-    ]
-    all_content = scrape_websites(website)
+# if __name__ == "__main__":
+# website = [
+# #"https://www.rib.gov.rw/index.php?id=371",
+# "https://haguruka.org.rw/our-work/"
+# ]
+# all_content = scrape_websites(website)

-    # Temporary list to store (url, content) tuples
-    temp_list = []
+# # Temporary list to store (url, content) tuples
+# temp_list = []

-    # Process and store each URL with its content
-    for url, content in all_content.items():
-        temp_list.append((url, content))
+# # Process and store each URL with its content
+# for url, content in all_content.items():
+# temp_list.append((url, content))



-    processed_texts = []
+# processed_texts = []

-    # Process each element in the temporary list
-    for element in temp_list:
-        if isinstance(element, tuple):
-            url, content = element # Unpack the tuple
-            processed_texts.append(f"url: {url}, content: {content}")
-        elif isinstance(element, str):
-            processed_texts.append(element)
-        else:
-            processed_texts.append(str(element))
+# # Process each element in the temporary list
+# for element in temp_list:
+# if isinstance(element, tuple):
+# url, content = element # Unpack the tuple
+# processed_texts.append(f"url: {url}, content: {content}")
+# elif isinstance(element, str):
+# processed_texts.append(element)
+# else:
+# processed_texts.append(str(element))

-    def chunk_string(s, chunk_size=2000):
-        return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+# def chunk_string(s, chunk_size=2000):
+# return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]

-    # List to store the chunks
-    chunked_texts = []
+# # List to store the chunks
+# chunked_texts = []

-    for text in processed_texts:
-        chunked_texts.extend(chunk_string(text))
+# for text in processed_texts:
+# chunked_texts.extend(chunk_string(text))

 data = []
 data.extend(context_data)
-data.extend([item for item in text_chunks if item not in data])
-data.extend([item for item in chunked_texts if item not in data])
+# data.extend([item for item in text_chunks if item not in data])
+# data.extend([item for item in chunked_texts if item not in data])



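For context, a minimal sketch of what the data-assembly step in app.py reduces to after this commit. The names data and context_data come from the diff's context lines; the literal values below are placeholders for illustration only, not taken from the repository.

# Minimal sketch of the post-commit data assembly in app.py.
# context_data is filled earlier in app.py from the data_files loop; the values
# below are placeholder stand-ins for illustration only.
context_data = ["chunk loaded from data_files 1", "chunk loaded from data_files 2"]

data = []
data.extend(context_data)
# With the PDF-ingestion and web-scraping blocks commented out, text_chunks and
# chunked_texts are no longer produced, so nothing further is appended to data.
print(len(data))  # -> 2 with the placeholder values above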