Update app.py
app.py
CHANGED
@@ -39,44 +39,44 @@ for f, file in enumerate(data_files, 1):



-def extract_text_from_pdf(pdf_path):
-    """Extracts text from a PDF file."""
-    try:
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            text = "".join(page.extract_text() or "" for page in reader.pages)  # Handle None cases
-        return text
-    except Exception as e:
-        print(f"Error extracting text from {pdf_path}: {e}")
-        return ""
+# def extract_text_from_pdf(pdf_path):
+#     """Extracts text from a PDF file."""
+#     try:
+#         with open(pdf_path, "rb") as file:
+#             reader = PyPDF2.PdfReader(file)
+#             text = "".join(page.extract_text() or "" for page in reader.pages)  # Handle None cases
+#         return text
+#     except Exception as e:
+#         print(f"Error extracting text from {pdf_path}: {e}")
+#         return ""

-folder_path = "./"
-# Initialize the list to hold the extracted text chunks
-text_chunks = []
+# folder_path = "./"
+# # Initialize the list to hold the extracted text chunks
+# text_chunks = []

-# Get all PDF filenames in the folder
-filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
+# # Get all PDF filenames in the folder
+# filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

-# Process each PDF file
-for index, file in enumerate(filenames, 1):
-    print(f"\nProcessing file {index}: {file}")
-    pdf_path = os.path.join(folder_path, file)
+# # Process each PDF file
+# for index, file in enumerate(filenames, 1):
+#     print(f"\nProcessing file {index}: {file}")
+#     pdf_path = os.path.join(folder_path, file)

-    try:
-        # Extract text from the PDF
-        extracted_text = extract_text_from_pdf(pdf_path)
+#     try:
+#         # Extract text from the PDF
+#         extracted_text = extract_text_from_pdf(pdf_path)

-        if extracted_text.strip():  # Ensure extracted text is not just whitespace
-            # Split extracted text into chunks of 2000 characters
-            chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
+#         if extracted_text.strip():  # Ensure extracted text is not just whitespace
+#             # Split extracted text into chunks of 2000 characters
+#             chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]

-            # Append extracted chunks to the list
-            text_chunks.extend(chunks)
-        else:
-            print(f"No text found in the PDF: {file}")
+#             # Append extracted chunks to the list
+#             text_chunks.extend(chunks)
+#         else:
+#             print(f"No text found in the PDF: {file}")

-    except Exception as e:
-        print(f"Error reading the PDF {file}: {e}")
+#     except Exception as e:
+#         print(f"Error reading the PDF {file}: {e}")



@@ -195,47 +195,47 @@ def clean_body_content(html_content):



-if __name__ == "__main__":
-    website = [
-        #"https://www.rib.gov.rw/index.php?id=371",
-        "https://haguruka.org.rw/our-work/"
-    ]
-    all_content = scrape_websites(website)
+# if __name__ == "__main__":
+#     website = [
+#         #"https://www.rib.gov.rw/index.php?id=371",
+#         "https://haguruka.org.rw/our-work/"
+#     ]
+#     all_content = scrape_websites(website)

-    # Temporary list to store (url, content) tuples
-    temp_list = []
+#     # Temporary list to store (url, content) tuples
+#     temp_list = []

-    # Process and store each URL with its content
-    for url, content in all_content.items():
-        temp_list.append((url, content))
+#     # Process and store each URL with its content
+#     for url, content in all_content.items():
+#         temp_list.append((url, content))



-processed_texts = []
+# processed_texts = []

-# Process each element in the temporary list
-for element in temp_list:
-    if isinstance(element, tuple):
-        url, content = element  # Unpack the tuple
-        processed_texts.append(f"url: {url}, content: {content}")
-    elif isinstance(element, str):
-        processed_texts.append(element)
-    else:
-        processed_texts.append(str(element))
+# # Process each element in the temporary list
+# for element in temp_list:
+#     if isinstance(element, tuple):
+#         url, content = element  # Unpack the tuple
+#         processed_texts.append(f"url: {url}, content: {content}")
+#     elif isinstance(element, str):
+#         processed_texts.append(element)
+#     else:
+#         processed_texts.append(str(element))

-def chunk_string(s, chunk_size=2000):
-    return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+# def chunk_string(s, chunk_size=2000):
+#     return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]

-# List to store the chunks
-chunked_texts = []
+# # List to store the chunks
+# chunked_texts = []

-for text in processed_texts:
-    chunked_texts.extend(chunk_string(text))
+# for text in processed_texts:
+#     chunked_texts.extend(chunk_string(text))

 data = []
 data.extend(context_data)
-data.extend([item for item in text_chunks if item not in data])
-data.extend([item for item in chunked_texts if item not in data])
+# data.extend([item for item in text_chunks if item not in data])
+# data.extend([item for item in chunked_texts if item not in data])



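For reference, here is the PDF ingestion path this commit disables, reassembled from the commented-out hunks above into one standalone sketch. It assumes PyPDF2 is installed and the PDFs sit next to the script; the 2000-character chunk size matches the original code, and `chunk_string` is reused for the splitting that the first hunk did inline.

import os

import PyPDF2


def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF, returning "" on failure."""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # extract_text() may return None for image-only pages
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""


def chunk_string(s, chunk_size=2000):
    """Split a string into fixed-size character chunks."""
    return [s[i:i + chunk_size] for i in range(0, len(s), chunk_size)]


text_chunks = []
pdf_names = sorted(f for f in os.listdir(".") if f.lower().endswith(".pdf"))
for index, name in enumerate(pdf_names, 1):
    print(f"\nProcessing file {index}: {name}")
    text = extract_text_from_pdf(name)
    if text.strip():
        text_chunks.extend(chunk_string(text))
    else:
        print(f"No text found in the PDF: {name}")

Re-enabling ingestion would then amount to extending `data` with these chunks, deduplicating as the commented-out `data.extend(...)` lines did.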