Rulga committed on
Commit
c2a566f
·
1 Parent(s): 58849bb

Enhance document loading with alternative method using BeautifulSoup and update requirements for requests and beautifulsoup4

Browse files
Files changed (2) hide show
  1. api/fastapi_server.py +25 -3
  2. requirements.txt +2 -1
api/fastapi_server.py CHANGED
@@ -30,6 +30,8 @@ import traceback
30
  from typing import Dict, List, Optional
31
  from pydantic import BaseModel
32
  from huggingface_hub import Repository, snapshot_download
 
 
33
 
34
  # Initialize environment variables
35
  load_dotenv()
@@ -232,15 +234,35 @@ def build_knowledge_base():
232
  # Create folder in advance
233
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
234
 
 
 
 
 
235
  # Load documents with detailed logging
236
  for url in URLS:
237
  try:
238
  print(f"Attempting to load {url}")
239
- loader = WebBaseLoader(url)
 
 
 
 
 
240
  docs = loader.load()
241
  print(f"Successfully loaded {url}, got {len(docs)} documents")
242
- documents.extend(docs)
243
- print(f"Loaded {url}")
 
 
 
 
 
 
 
 
 
 
 
244
  except Exception as e:
245
  print(f"Failed to load {url}: {str(e)}")
246
  print(f"Full error: {traceback.format_exc()}")
 
30
  from typing import Dict, List, Optional
31
  from pydantic import BaseModel
32
  from huggingface_hub import Repository, snapshot_download
33
+ import requests
34
+ from bs4 import BeautifulSoup
35
 
36
  # Initialize environment variables
37
  load_dotenv()
 
234
  # Create folder in advance
235
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
236
 
237
+ headers = {
238
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
239
+ }
240
+
241
  # Load documents with detailed logging
242
  for url in URLS:
243
  try:
244
  print(f"Attempting to load {url}")
245
+ loader = WebBaseLoader(
246
+ web_paths=[url],
247
+ header_template=headers,
248
+ requests_per_second=2,
249
+ timeout=30
250
+ )
251
  docs = loader.load()
252
  print(f"Successfully loaded {url}, got {len(docs)} documents")
253
+ if docs:
254
+ documents.extend(docs)
255
+ else:
256
+ # Попробуем альтернативный метод загрузки
257
+ response = requests.get(url, headers=headers, timeout=30)
258
+ response.raise_for_status()
259
+ soup = BeautifulSoup(response.text, 'html.parser')
260
+ # Получаем основной контент, исключая навигацию и футер
261
+ main_content = ' '.join([p.text for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])])
262
+ if main_content:
263
+ from langchain_core.documents import Document
264
+ documents.append(Document(page_content=main_content, metadata={"source": url}))
265
+ print(f"Loaded {url} using alternative method")
266
  except Exception as e:
267
  print(f"Failed to load {url}: {str(e)}")
268
  print(f"Full error: {traceback.format_exc()}")
requirements.txt CHANGED
@@ -13,4 +13,5 @@ huggingface_hub>=0.19.0
13
  jinja2>=3.0.0
14
  aiofiles>=0.8.0
15
  python-multipart>=0.0.6
16
- requests
 
 
13
  jinja2>=3.0.0
14
  aiofiles>=0.8.0
15
  python-multipart>=0.0.6
16
+ beautifulsoup4>=4.12.0
17
+ requests>=2.31.0