Enhance document loading with an alternative method using BeautifulSoup and update requirements for requests and beautifulsoup4
- api/fastapi_server.py +25 -3
- requirements.txt +2 -1
api/fastapi_server.py
CHANGED
@@ -30,6 +30,8 @@ import traceback
 from typing import Dict, List, Optional
 from pydantic import BaseModel
 from huggingface_hub import Repository, snapshot_download
+import requests
+from bs4 import BeautifulSoup
 
 # Initialize environment variables
 load_dotenv()
@@ -232,15 +234,35 @@ def build_knowledge_base():
     # Create folder in advance
     os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
 
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    }
+
     # Load documents with detailed logging
     for url in URLS:
         try:
            print(f"Attempting to load {url}")
-            loader = WebBaseLoader(
+            loader = WebBaseLoader(
+                web_paths=[url],
+                header_template=headers,
+                requests_per_second=2,
+                timeout=30
+            )
             docs = loader.load()
             print(f"Successfully loaded {url}, got {len(docs)} documents")
-
-
+            if docs:
+                documents.extend(docs)
+            else:
+                # Try an alternative loading method
+                response = requests.get(url, headers=headers, timeout=30)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.text, 'html.parser')
+                # Extract the main content, excluding navigation and footer
+                main_content = ' '.join([p.text for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])])
+                if main_content:
+                    from langchain_core.documents import Document
+                    documents.append(Document(page_content=main_content, metadata={"source": url}))
+                    print(f"Loaded {url} using alternative method")
         except Exception as e:
             print(f"Failed to load {url}: {str(e)}")
             print(f"Full error: {traceback.format_exc()}")
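The fallback branch is the core of this change: when WebBaseLoader returns no documents, the page is fetched directly and reduced to text from content-bearing tags. The following is a minimal standalone sketch of that path using only the dependencies added here; the helper name load_page_fallback and the example URL are illustrative, not part of the server code.

# Standalone sketch of the fallback loader: fetch a page with browser-like
# headers, keep text from content-bearing tags, wrap it in a langchain Document.
from typing import Optional

import requests
from bs4 import BeautifulSoup
from langchain_core.documents import Document

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
}

def load_page_fallback(url: str) -> Optional[Document]:
    """Fetch a URL directly and return a Document, or None if no text is found."""
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Collect text only from tags that usually carry content, skipping nav/footer chrome.
    tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]
    main_content = " ".join(el.get_text(strip=True) for el in soup.find_all(tags))
    if not main_content:
        return None
    return Document(page_content=main_content, metadata={"source": url})

if __name__ == "__main__":
    doc = load_page_fallback("https://example.com")  # placeholder URL
    if doc is not None:
        print(doc.metadata["source"], len(doc.page_content), "characters")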
requirements.txt
CHANGED
@@ -13,4 +13,5 @@ huggingface_hub>=0.19.0
 jinja2>=3.0.0
 aiofiles>=0.8.0
 python-multipart>=0.0.6
-
+beautifulsoup4>=4.12.0
+requests>=2.31.0