thechaiexperiment committed on
Commit
8473822
·
verified ·
1 Parent(s): e7b3f0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -0
app.py CHANGED
@@ -192,6 +192,46 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
192
  print(f"Error loading embeddings: {e}")
193
  return None
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  init_success = load_models() and load_data()
196
 
197
  def translate_text(text, source_to_target='ar_to_en'):
 
192
  print(f"Error loading embeddings: {e}")
193
  return None
194
 
195
def load_documents_data(folder_path: str = 'downloaded_articles/downloaded_articles') -> bool:
    """Load the text of every HTML article in a folder into ``data['df']``.

    Each ``.html`` file directly inside *folder_path* is parsed with
    BeautifulSoup and its text is stored as one row with the columns
    ``file_name`` and ``content``. Files that cannot be read or parsed are
    reported and skipped, so one bad article does not abort the whole load.

    Args:
        folder_path: Directory containing the HTML articles.

    Returns:
        True if at least one document was loaded into ``data['df']``.
        False if the folder does not exist, contains no ``.html`` files,
        none of the files could be parsed, or an unexpected error occurred
        (in that last case ``data['df']`` is reset to an empty DataFrame).
    """
    try:
        print("Loading documents data...")

        # isdir() is already False for a missing path, so no separate
        # exists() check is needed.
        if not os.path.isdir(folder_path):
            print(f"Error: Folder '{folder_path}' not found")
            return False

        # os.listdir() yields entries in arbitrary order; sort them so the
        # resulting DataFrame rows are in a reproducible order.
        html_files = sorted(
            name for name in os.listdir(folder_path) if name.endswith('.html')
        )
        if not html_files:
            print(f"No HTML files found in folder '{folder_path}'")
            return False

        documents = []
        for file_name in html_files:
            file_path = os.path.join(folder_path, file_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    soup = BeautifulSoup(file, 'html.parser')
                    # Keep only the text nodes, one per line.
                    text = soup.get_text(separator='\n').strip()
                documents.append({"file_name": file_name, "content": text})
            except Exception as e:
                # Best effort: report the failing file and continue.
                print(f"Error reading file {file_name}: {e}")

        data['df'] = pd.DataFrame(documents)

        if data['df'].empty:
            print("No valid documents loaded.")
            return False

        print(f"Successfully loaded {len(data['df'])} document records.")
        return True
    except Exception as e:
        print(f"Error loading documents data: {e}")
        # Leave a well-defined (empty) frame behind for later readers.
        data['df'] = pd.DataFrame()
        return False
233
+
234
+
235
  # Run the loaders and record whether both succeeded. `and` short-circuits,
  # so load_data() is only called when load_models() returns a truthy value.
  # NOTE(review): load_documents_data() is not part of this expression —
  # confirm it is invoked elsewhere if its data is required.
  init_success = load_models() and load_data()
236
 
237
  def translate_text(text, source_to_target='ar_to_en'):