Spaces:

thechaiexperiment
/

TeaRAG

Sleeping

App Files Files Community

thechaiexperiment committed on Jan 22

Commit

8473822

verified ·

1 Parent(s): e7b3f0b

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -0

app.py CHANGED Viewed

@@ -192,6 +192,46 @@ def load_recipes_embeddings() -> Optional[Dict[str, np.ndarray]]:
         print(f"Error loading embeddings: {e}")
         return None
 init_success = load_models() and load_data()
 def translate_text(text, source_to_target='ar_to_en'):

         print(f"Error loading embeddings: {e}")
         return None
def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
    """Load article text from the HTML files in a folder into ``data['df']``.

    Every ``*.html`` file directly inside ``folder_path`` is parsed with
    BeautifulSoup and its text is stored as one row of a DataFrame with the
    columns ``file_name`` and ``content``. The DataFrame is written to the
    ``data`` mapping (defined outside this function) under the key ``'df'``.

    Files that cannot be read or parsed are reported and skipped; they do not
    abort the load.

    Args:
        folder_path: Directory containing the HTML articles.

    Returns:
        True if at least one document was loaded. False if the folder is
        missing, holds no HTML files, no file could be parsed, or an
        unexpected error occurred (in which case ``data['df']`` is reset to
        an empty DataFrame).
    """
    try:
        print("Loading documents data...")
        # isdir() is already False for a missing path, so one check suffices.
        if not os.path.isdir(folder_path):
            print(f"Error: Folder '{folder_path}' not found")
            return False

        # Sorted so the row order does not depend on the directory listing order.
        html_files = sorted(
            f for f in os.listdir(folder_path) if f.endswith('.html')
        )
        if not html_files:
            print(f"No HTML files found in folder '{folder_path}'")
            return False

        documents = []
        for file_name in html_files:
            file_path = os.path.join(folder_path, file_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    soup = BeautifulSoup(file, 'html.parser')
                text = soup.get_text(separator='\n').strip()
                documents.append({"file_name": file_name, "content": text})
            except Exception as e:
                # Best effort: one bad article must not abort the whole load.
                print(f"Error reading file {file_name}: {e}")

        # Build the DataFrame once, after *all* files have been processed.
        # Doing this inside the loop would return after the first file.
        data['df'] = pd.DataFrame(documents)
        if data['df'].empty:
            print("No valid documents loaded.")
            return False

        print(f"Successfully loaded {len(data['df'])} document records.")
        return True
    except Exception as e:
        print(f"Error loading documents data: {e}")
        data['df'] = pd.DataFrame()
        return False
 init_success = load_models() and load_data()
 def translate_text(text, source_to_target='ar_to_en'):