Rulga committed
Commit 2aa225e · Parent(s): f631be3

cl min tid

Files changed (1): app.py (+132, −94)
app.py CHANGED
@@ -1,9 +1,10 @@
 import os
-import time
 import sys
 import json
 import traceback
 import warnings
+import asyncio
+import aiohttp
 from datetime import datetime
 from typing import Optional, List, Dict
 import logging
@@ -15,16 +16,14 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 
-import requests
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, BackgroundTasks
 from pydantic import BaseModel
 from langchain_groq import ChatGroq
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import WebBaseLoader, BSHTMLLoader
 from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.tracers import ConsoleCallbackHandler
@@ -42,6 +41,8 @@ app = FastAPI(title="Status Law Assistant API")
 
 # Knowledge base configuration
 KB_CONFIG_PATH = "vector_store/kb_config.json"
+CACHE_DIR = "cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
 
 def get_kb_config():
     if os.path.exists(KB_CONFIG_PATH):
@@ -84,21 +85,6 @@ URLS = [
     "https://status.law/faq"
 ]
 
-# Check write permissions
-try:
-    if not os.path.exists(VECTOR_STORE_PATH):
-        os.makedirs(VECTOR_STORE_PATH)
-    test_file_path = os.path.join(VECTOR_STORE_PATH, 'test_write.txt')
-    with open(test_file_path, 'w') as f:
-        f.write('test')
-    os.remove(test_file_path)
-    print(f"Write permissions OK for {VECTOR_STORE_PATH}")
-except Exception as e:
-    print(f"WARNING: No write permissions for {VECTOR_STORE_PATH}: {str(e)}")
-    print("Current working directory:", os.getcwd())
-    print("User:", os.getenv('USER'))
-    sys.exit(1)
-
 # Enhanced logging
 class CustomCallbackHandler(ConsoleCallbackHandler):
     def on_chain_end(self, run):
@@ -127,56 +113,96 @@ def init_models():
             api_key=os.getenv("GROQ_API_KEY"),
             callback_manager=callback_manager
         )
+        # Use a smaller model for embeddings
         embeddings = HuggingFaceEmbeddings(
-            model_name="intfloat/multilingual-e5-large-instruct"
+            model_name="intfloat/multilingual-e5-small-instruct"
         )
         return llm, embeddings
     except Exception as e:
         raise Exception(f"Model initialization failed: {str(e)}")
 
-def check_url_availability(url: str) -> bool:
+async def fetch_url(session, url):
+    cache_file = os.path.join(CACHE_DIR, f"{url.replace('/', '_').replace(':', '_')}.html")
+
+    # Check the cache first
+    if os.path.exists(cache_file):
+        with open(cache_file, 'r', encoding='utf-8') as f:
+            return url, f.read()
+
     try:
-        response = requests.get(url, verify=False, timeout=10)
-        return response.status_code == 200
+        async with session.get(url, ssl=False, timeout=30) as response:
+            if response.status == 200:
+                content = await response.text()
+                # Save to the cache
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    f.write(content)
+                return url, content
+            else:
+                logger.warning(f"Failed to load {url}, status code: {response.status}")
+                return url, None
     except Exception as e:
-        print(f"Error checking {url}: {str(e)}")
-        return False
+        logger.error(f"Error fetching {url}: {str(e)}")
+        return url, None
 
-def load_url_content(url: str) -> List[Document]:
-    try:
-        response = requests.get(url, verify=False, timeout=30)
-        if response.status_code != 200:
-            print(f"Failed to load {url}, status code: {response.status_code}")
-            return []
-
-        soup = BeautifulSoup(response.text, 'html.parser')
+def process_html_content(url, html_content):
+    if not html_content:
+        return None
 
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Remove script and style elements
+    for script in soup(["script", "style"]):
+        script.decompose()
 
-        # Get text content
-        text = soup.get_text()
+    # Get text content
+    text = soup.get_text()
 
-        # Clean up text
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
+    # Clean up text
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+    text = ' '.join(chunk for chunk in chunks if chunk)
+
+    if not text.strip():
+        return None
 
-        return [Document(page_content=text, metadata={"source": url})]
-    except Exception as e:
-        print(f"Error processing {url}: {str(e)}")
-        return []
+    return Document(page_content=text, metadata={"source": url})
 
-def build_knowledge_base(embeddings):
+async def load_all_urls(urls_to_process):
+    documents = []
+
+    async with aiohttp.ClientSession() as session:
+        tasks = [fetch_url(session, url) for url in urls_to_process]
+        results = await asyncio.gather(*tasks)
+
+        for url, content in results:
+            if content:
+                doc = process_html_content(url, content)
+                if doc:
+                    documents.append(doc)
+                    logger.info(f"Successfully processed content from {url}")
+                else:
+                    logger.warning(f"No useful content extracted from {url}")
+            else:
+                logger.warning(f"Failed to load content from {url}")
+
+    return documents
+
+async def build_knowledge_base_async(embeddings, force_rebuild=False):
+    """
+    Asynchronous knowledge base construction.
+    The force_rebuild parameter forces all URLs to be refreshed.
+    """
     try:
         logger.info("Starting knowledge base construction...")
         kb_config = get_kb_config()
-        documents = []
-        os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
 
         # Determine which URLs to process
-        urls_to_process = [url for url in URLS if url not in kb_config["processed_urls"]]
+        if force_rebuild:
+            urls_to_process = URLS
+            kb_config["processed_urls"] = []
+            logger.info("Forcing rebuild of entire knowledge base")
+        else:
+            urls_to_process = [url for url in URLS if url not in kb_config["processed_urls"]]
 
         if not urls_to_process:
             logger.info("No new URLs to process")
@@ -184,41 +210,27 @@ def build_knowledge_base(embeddings):
 
         logger.info(f"Processing {len(urls_to_process)} new URLs")
 
-        available_urls = [url for url in urls_to_process if check_url_availability(url)]
-        logger.info(f"Accessible URLs: {len(available_urls)} out of {len(urls_to_process)}")
-
-        for url in available_urls:
-            try:
-                logger.info(f"Processing {url}")
-                docs = load_url_content(url)
-                if docs:
-                    documents.extend(docs)
-                    kb_config["processed_urls"].append(url)
-                    logger.info(f"Successfully loaded content from {url}")
-                else:
-                    logger.warning(f"No content extracted from {url}")
-            except Exception as e:
-                logger.error(f"Failed to process {url}: {str(e)}")
-                continue
+        documents = await load_all_urls(urls_to_process)
 
         if not documents:
-            if kb_config["processed_urls"]:
+            if kb_config["processed_urls"] and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
                 logger.info("No new documents to add, loading existing vector store")
                 return FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
             raise Exception("No documents were successfully loaded!")
 
         logger.info(f"Total new documents loaded: {len(documents)}")
 
+        # Increase the chunk size
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=50
+            chunk_size=2500,  # increased chunk size
+            chunk_overlap=100
         )
         logger.info("Splitting documents into chunks...")
         chunks = text_splitter.split_documents(documents)
        logger.info(f"Created {len(chunks)} chunks")
 
-        # If a knowledge base already exists, add to it
-        if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
+        # If a knowledge base already exists and this is not a full rebuild, add to it
+        if not force_rebuild and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
             logger.info("Loading existing vector store...")
             vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
             logger.info("Adding new documents to existing vector store...")
@@ -231,6 +243,10 @@ def build_knowledge_base(embeddings):
         vector_store.save_local(folder_path=VECTOR_STORE_PATH, index_name="index")
 
         # Update the configuration
+        for url in urls_to_process:
+            if url not in kb_config["processed_urls"]:
+                kb_config["processed_urls"].append(url)
+
         kb_config["version"] += 1
         kb_config["last_update"] = datetime.now().isoformat()
         save_kb_config(kb_config)
@@ -244,10 +260,14 @@ def build_knowledge_base(embeddings):
         raise Exception(f"Knowledge base creation failed: {str(e)}")
 
 # Initialize models and knowledge base on startup
-try:
-    llm, embeddings = init_models()
-    vector_store = None
+llm, embeddings = init_models()
+vector_store = None
 
+@app.on_event("startup")
+async def startup_event():
+    global vector_store
+
+    # Only load an existing knowledge base at startup; don't build a new one
     if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
         try:
             vector_store = FAISS.load_local(
@@ -257,28 +277,26 @@ try:
             )
             logger.info("Successfully loaded existing knowledge base")
         except Exception as e:
-            logger.warning(f"Could not load existing knowledge base, will create new one: {str(e)}")
+            logger.warning(f"Could not load existing knowledge base: {str(e)}")
             vector_store = None
     else:
-        logger.info("No existing knowledge base found, will create new one")
-
-    if vector_store is None:
-        logger.info("Building new knowledge base...")
-        vector_store = build_knowledge_base(embeddings)
-        logger.info("Knowledge base built successfully")
+        logger.warning("No existing knowledge base found, please use /rebuild-kb endpoint to create one")
 
-except Exception as e:
-    logger.error(f"Critical initialization error: {str(e)}")
-    logger.error(traceback.format_exc())
-    raise
-
-# API endpoints
 # API endpoints
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(request: ChatRequest):
+    global vector_store
+
+    # Check whether the knowledge base is initialized
+    if vector_store is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Knowledge base not initialized. Please use /rebuild-kb endpoint first."
+        )
+
     try:
         # Retrieve context
-        context_docs = vector_store.similarity_search(request.message)
+        context_docs = vector_store.similarity_search(request.message, k=3)  # limit the number of retrieved documents
         context_text = "\n".join([d.page_content for d in context_docs])
 
         # Generate response
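A side note on the startup hook added above: @app.on_event("startup") works, but newer FastAPI releases deprecate it in favor of a lifespan handler. A sketch of the same load-only startup logic in that style, assuming the surrounding module's VECTOR_STORE_PATH, embeddings, and FAISS names:

    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        global vector_store
        # Load an existing index if present; never build one at startup
        if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
            vector_store = FAISS.load_local(
                VECTOR_STORE_PATH, embeddings,
                allow_dangerous_deserialization=True
            )
        yield  # the application serves requests here; shutdown cleanup would follow

    app = FastAPI(title="Status Law Assistant API", lifespan=lifespan)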
@@ -315,19 +333,39 @@ async def chat_endpoint(request: ChatRequest):
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/rebuild-kb")
-async def rebuild_knowledge_base():
+async def rebuild_knowledge_base(background_tasks: BackgroundTasks, force: bool = False):
+    """
+    Rebuild knowledge base in the background
+
+    - force: if True, rebuilds the entire knowledge base from scratch
+    """
+    global vector_store
+
     try:
-        global vector_store
-        vector_store = build_knowledge_base(embeddings)
-        return {"status": "success", "message": "Knowledge base rebuilt successfully"}
+        # Run in the background
+        background_tasks.add_task(_rebuild_kb_task, force)
+        action = "rebuild" if force else "update"
+        return {"status": "success", "message": f"Knowledge base {action} started in background"}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
+async def _rebuild_kb_task(force: bool = False):
+    """Background task for updating the knowledge base"""
+    global vector_store
+    try:
+        vector_store = await build_knowledge_base_async(embeddings, force_rebuild=force)
+        logger.info("Knowledge base rebuild completed successfully")
+    except Exception as e:
+        logger.error(f"Knowledge base rebuild failed: {str(e)}")
+
 @app.get("/kb-status")
 async def get_kb_status():
     """Get current knowledge base status"""
+    global vector_store
+
     kb_config = get_kb_config()
     return {
+        "initialized": vector_store is not None,
         "version": kb_config["version"],
         "total_urls": len(URLS),
         "processed_urls": len(kb_config["processed_urls"]),
@@ -356,4 +394,4 @@ def log_interaction(user_input: str, bot_response: str, context: str):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
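Taken together, the new endpoints make initialization explicit: nothing is built at startup, /rebuild-kb schedules the work, and /chat returns 503 until the store exists. A hypothetical client session against a locally running instance (the base URL is assumed from the uvicorn.run call above; requests is not a dependency of this app):

    import time
    import requests

    BASE = "http://localhost:8000"  # assumed local uvicorn instance

    # Schedule a full rebuild in the background (force=True re-fetches every URL)
    print(requests.post(f"{BASE}/rebuild-kb", params={"force": True}).json())

    # Poll /kb-status until the vector store is initialized
    while not requests.get(f"{BASE}/kb-status").json()["initialized"]:
        time.sleep(5)

    # /chat now answers instead of returning 503
    resp = requests.post(f"{BASE}/chat", json={"message": "How can I contact Status Law?"})
    print(resp.status_code, resp.json())

Since _rebuild_kb_task is a coroutine, Starlette's BackgroundTasks awaits it on the running event loop after the response is sent; note that the embedding and FAISS indexing inside it are CPU-bound, so a long rebuild can still stall other requests while it runs.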
 