UKURIKIYEYEZU committed
Commit b24f7f9 (verified)
1 Parent(s): d26ee43

Update app.py

Files changed (1)
  1. app.py +60 -60
app.py CHANGED
@@ -39,44 +39,44 @@ for f, file in enumerate(data_files, 1):



-def extract_text_from_pdf(pdf_path):
-    """Extracts text from a PDF file."""
-    try:
-        with open(pdf_path, "rb") as file:
-            reader = PyPDF2.PdfReader(file)
-            text = "".join(page.extract_text() or "" for page in reader.pages) # Handle None cases
-        return text
-    except Exception as e:
-        print(f"Error extracting text from {pdf_path}: {e}")
-        return ""
+# def extract_text_from_pdf(pdf_path):
+# """Extracts text from a PDF file."""
+# try:
+# with open(pdf_path, "rb") as file:
+# reader = PyPDF2.PdfReader(file)
+# text = "".join(page.extract_text() or "" for page in reader.pages) # Handle None cases
+# return text
+# except Exception as e:
+# print(f"Error extracting text from {pdf_path}: {e}")
+# return ""

-folder_path = "./"
-# Initialize the list to hold the extracted text chunks
-text_chunks = []
+# folder_path = "./"
+# # Initialize the list to hold the extracted text chunks
+# text_chunks = []

-# Get all PDF filenames in the folder
-filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]
+# # Get all PDF filenames in the folder
+# filenames = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

-# Process each PDF file
-for index, file in enumerate(filenames, 1):
-    print(f"\nProcessing file {index}: {file}")
-    pdf_path = os.path.join(folder_path, file)
+# # Process each PDF file
+# for index, file in enumerate(filenames, 1):
+# print(f"\nProcessing file {index}: {file}")
+# pdf_path = os.path.join(folder_path, file)

-    try:
-        # Extract text from the PDF
-        extracted_text = extract_text_from_pdf(pdf_path)
+# try:
+# # Extract text from the PDF
+# extracted_text = extract_text_from_pdf(pdf_path)

-        if extracted_text.strip(): # Ensure extracted text is not just whitespace
-            # Split extracted text into chunks of 1000 characters
-            chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]
+# if extracted_text.strip(): # Ensure extracted text is not just whitespace
+# # Split extracted text into chunks of 1000 characters
+# chunks = [extracted_text[i:i+2000] for i in range(0, len(extracted_text), 2000)]

-            # Append extracted chunks to the list
-            text_chunks.extend(chunks)
-        else:
-            print(f"No text found in the PDF: {file}")
+# # Append extracted chunks to the list
+# text_chunks.extend(chunks)
+# else:
+# print(f"No text found in the PDF: {file}")

-    except Exception as e:
-        print(f"Error reading the PDF {file}: {e}")
+# except Exception as e:
+# print(f"Error reading the PDF {file}: {e}")



@@ -195,47 +195,47 @@ def clean_body_content(html_content):



-if __name__ == "__main__":
-    website = [
-        #"https://www.rib.gov.rw/index.php?id=371",
-        "https://haguruka.org.rw/our-work/"
-    ]
-    all_content = scrape_websites(website)
+# if __name__ == "__main__":
+# website = [
+# #"https://www.rib.gov.rw/index.php?id=371",
+# "https://haguruka.org.rw/our-work/"
+# ]
+# all_content = scrape_websites(website)

-    # Temporary list to store (url, content) tuples
-    temp_list = []
+# # Temporary list to store (url, content) tuples
+# temp_list = []

-    # Process and store each URL with its content
-    for url, content in all_content.items():
-        temp_list.append((url, content))
+# # Process and store each URL with its content
+# for url, content in all_content.items():
+# temp_list.append((url, content))



-    processed_texts = []
+# processed_texts = []

-    # Process each element in the temporary list
-    for element in temp_list:
-        if isinstance(element, tuple):
-            url, content = element # Unpack the tuple
-            processed_texts.append(f"url: {url}, content: {content}")
-        elif isinstance(element, str):
-            processed_texts.append(element)
-        else:
-            processed_texts.append(str(element))
+# # Process each element in the temporary list
+# for element in temp_list:
+# if isinstance(element, tuple):
+# url, content = element # Unpack the tuple
+# processed_texts.append(f"url: {url}, content: {content}")
+# elif isinstance(element, str):
+# processed_texts.append(element)
+# else:
+# processed_texts.append(str(element))

-    def chunk_string(s, chunk_size=2000):
-        return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]
+# def chunk_string(s, chunk_size=2000):
+# return [s[i:i+chunk_size] for i in range(0, len(s), chunk_size)]

-    # List to store the chunks
-    chunked_texts = []
+# # List to store the chunks
+# chunked_texts = []

-    for text in processed_texts:
-        chunked_texts.extend(chunk_string(text))
+# for text in processed_texts:
+# chunked_texts.extend(chunk_string(text))

 data = []
 data.extend(context_data)
-data.extend([item for item in text_chunks if item not in data])
-data.extend([item for item in chunked_texts if item not in data])
+# data.extend([item for item in text_chunks if item not in data])
+# data.extend([item for item in chunked_texts if item not in data])



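For context, a minimal sketch of what the data-assembly step in app.py reduces to after this commit. The names data and context_data come from the diff's context lines; the literal values below are placeholders for illustration only, not taken from the repository.

# Minimal sketch of the post-commit data assembly in app.py.
# context_data is filled earlier in app.py from the data_files loop; the values
# below are placeholder stand-ins for illustration only.
context_data = ["chunk loaded from data_files 1", "chunk loaded from data_files 2"]

data = []
data.extend(context_data)
# With the PDF-ingestion and web-scraping blocks commented out, text_chunks and
# chunked_texts are no longer produced, so nothing further is appended to data.
print(len(data))  # -> 2 with the placeholder values above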