Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -192,6 +192,46 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
|
|
192 |
print(f"Error loading embeddings: {e}")
|
193 |
return None
|
194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
195 |
# Initialization chain: load_models() is called first, and load_data() is only
# called when load_models() returns a truthy value (short-circuit `and`).
# init_success therefore holds a falsy value if either step reported failure.
# NOTE(review): presumably both helpers return bool — confirm at their definitions.
init_success = load_models() and load_data()
|
196 |
|
197 |
def translate_text(text, source_to_target='ar_to_en'):
|
|
|
192 |
print(f"Error loading embeddings: {e}")
|
193 |
return None
|
194 |
|
195 |
+
def load_documents_data(folder_path: str = 'downloaded_articles/downloaded_articles') -> bool:
    """Load HTML articles from a folder into ``data['df']``.

    Every file directly inside ``folder_path`` whose name ends with ``.html``
    is parsed with BeautifulSoup, and its text is stored as one row with the
    columns ``file_name`` and ``content``. Files are processed in sorted name
    order so the resulting row order is reproducible (``os.listdir`` returns
    entries in arbitrary order).

    Args:
        folder_path: Directory that contains the HTML articles.

    Returns:
        True if at least one document was loaded into ``data['df']``.
        False if the folder is missing, contains no ``.html`` files, none of
        the files could be read, or an unexpected error occurred. Errors are
        caught and reported with ``print`` rather than propagated.
    """
    try:
        print("Loading documents data...")
        # isdir() is already False for a path that does not exist, so a
        # separate exists() check is not needed.
        if not os.path.isdir(folder_path):
            print(f"Error: Folder '{folder_path}' not found")
            return False

        # Sort the names: os.listdir() order is arbitrary and would otherwise
        # make the DataFrame row order differ between runs/platforms.
        html_files = sorted(
            name for name in os.listdir(folder_path) if name.endswith('.html')
        )
        if not html_files:
            print(f"No HTML files found in folder '{folder_path}'")
            return False

        documents = []
        for file_name in html_files:
            file_path = os.path.join(folder_path, file_name)
            # Best effort: a single unreadable or malformed file must not
            # abort the whole load, so report it and continue with the rest.
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    # The parser consumes the whole file while constructing
                    # the soup, so the handle can be closed right after.
                    soup = BeautifulSoup(file, 'html.parser')
                text = soup.get_text(separator='\n').strip()
                documents.append({"file_name": file_name, "content": text})
            except Exception as e:
                print(f"Error reading file {file_name}: {e}")

        # Publish the result (possibly empty) through the shared `data` dict.
        data['df'] = pd.DataFrame(documents)

        if data['df'].empty:
            print("No valid documents loaded.")
            return False

        print(f"Successfully loaded {len(data['df'])} document records.")
        return True
    except Exception as e:
        # Last-resort boundary: leave an empty DataFrame behind so callers
        # never see a partially built one.
        print(f"Error loading documents data: {e}")
        data['df'] = pd.DataFrame()
        return False
|
233 |
+
|
234 |
+
|
235 |
# Initialization chain: load_models() is called first, and load_data() is only
# called when load_models() returns a truthy value (short-circuit `and`).
# init_success therefore holds a falsy value if either step reported failure.
# NOTE(review): presumably both helpers return bool — confirm at their definitions.
init_success = load_models() and load_data()
|
236 |
|
237 |
def translate_text(text, source_to_target='ar_to_en'):
|