Rulga committed on
Commit
c2a566f
·
1 Parent(s): 58849bb

Enhance document loading with alternative method using BeautifulSoup and update requirements for requests and beautifulsoup4

Browse files
Files changed (2) hide show
  1. api/fastapi_server.py +25 -3
  2. requirements.txt +2 -1
api/fastapi_server.py CHANGED
@@ -30,6 +30,8 @@ import traceback
30
  from typing import Dict, List, Optional
31
  from pydantic import BaseModel
32
  from huggingface_hub import Repository, snapshot_download
 
 
33
 
34
  # Initialize environment variables
35
  load_dotenv()
@@ -232,15 +234,35 @@ def build_knowledge_base():
232
  # Create folder in advance
233
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
234
 
 
 
 
 
235
  # Load documents with detailed logging
236
  for url in URLS:
237
  try:
238
  print(f"Attempting to load {url}")
239
- loader = WebBaseLoader(url)
 
 
 
 
 
240
  docs = loader.load()
241
  print(f"Successfully loaded {url}, got {len(docs)} documents")
242
- documents.extend(docs)
243
- print(f"Loaded {url}")
 
 
 
 
 
 
 
 
 
 
 
244
  except Exception as e:
245
  print(f"Failed to load {url}: {str(e)}")
246
  print(f"Full error: {traceback.format_exc()}")
 
30
  from typing import Dict, List, Optional
31
  from pydantic import BaseModel
32
  from huggingface_hub import Repository, snapshot_download
33
+ import requests
34
+ from bs4 import BeautifulSoup
35
 
36
  # Initialize environment variables
37
  load_dotenv()
 
234
  # Create folder in advance
235
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
236
 
237
+ headers = {
238
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
239
+ }
240
+
241
  # Load documents with detailed logging
242
  for url in URLS:
243
  try:
244
  print(f"Attempting to load {url}")
245
+ loader = WebBaseLoader(
246
+ web_paths=[url],
247
+ header_template=headers,
248
+ requests_per_second=2,
249
+ timeout=30
250
+ )
251
  docs = loader.load()
252
  print(f"Successfully loaded {url}, got {len(docs)} documents")
253
+ if docs:
254
+ documents.extend(docs)
255
+ else:
256
+ # Попробуем альтернативный метод загрузки
257
+ response = requests.get(url, headers=headers, timeout=30)
258
+ response.raise_for_status()
259
+ soup = BeautifulSoup(response.text, 'html.parser')
260
+ # Получаем основной контент, исключая навигацию и футер
261
+ main_content = ' '.join([p.text for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])])
262
+ if main_content:
263
+ from langchain_core.documents import Document
264
+ documents.append(Document(page_content=main_content, metadata={"source": url}))
265
+ print(f"Loaded {url} using alternative method")
266
  except Exception as e:
267
  print(f"Failed to load {url}: {str(e)}")
268
  print(f"Full error: {traceback.format_exc()}")
requirements.txt CHANGED
@@ -13,4 +13,5 @@ huggingface_hub>=0.19.0
13
  jinja2>=3.0.0
14
  aiofiles>=0.8.0
15
  python-multipart>=0.0.6
16
- requests
 
 
13
  jinja2>=3.0.0
14
  aiofiles>=0.8.0
15
  python-multipart>=0.0.6
16
+ beautifulsoup4>=4.12.0
17
+ requests>=2.31.0