cl min tid
app.py CHANGED
@@ -1,9 +1,10 @@
 import os
-import time
 import sys
 import json
 import traceback
 import warnings
+import asyncio
+import aiohttp
 from datetime import datetime
 from typing import Optional, List, Dict
 import logging
@@ -15,16 +16,14 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 
-import requests
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, BackgroundTasks
 from pydantic import BaseModel
 from langchain_groq import ChatGroq
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import WebBaseLoader, BSHTMLLoader
 from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.tracers import ConsoleCallbackHandler
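Note: the new process_html_content helper later in this commit constructs Document objects, yet no Document import is visible in the diff. Unless it is already imported in the unchanged lines, the new code would also need something like the following (assumed import path; not part of the commit):

    # Assumed missing import; langchain_core exposes the Document class
    # consumed by FAISS.from_documents / add_documents.
    from langchain_core.documents import Document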
@@ -42,6 +41,8 @@ app = FastAPI(title="Status Law Assistant API")
 
 # Knowledge base configuration
 KB_CONFIG_PATH = "vector_store/kb_config.json"
+CACHE_DIR = "cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
 
 def get_kb_config():
     if os.path.exists(KB_CONFIG_PATH):
@@ -84,21 +85,6 @@ URLS = [
     "https://status.law/faq"
 ]
 
-# Check write permissions
-try:
-    if not os.path.exists(VECTOR_STORE_PATH):
-        os.makedirs(VECTOR_STORE_PATH)
-    test_file_path = os.path.join(VECTOR_STORE_PATH, 'test_write.txt')
-    with open(test_file_path, 'w') as f:
-        f.write('test')
-    os.remove(test_file_path)
-    print(f"Write permissions OK for {VECTOR_STORE_PATH}")
-except Exception as e:
-    print(f"WARNING: No write permissions for {VECTOR_STORE_PATH}: {str(e)}")
-    print("Current working directory:", os.getcwd())
-    print("User:", os.getenv('USER'))
-    sys.exit(1)
-
 # Enhanced logging
 class CustomCallbackHandler(ConsoleCallbackHandler):
     def on_chain_end(self, run):
@@ -127,56 +113,96 @@ def init_models():
             api_key=os.getenv("GROQ_API_KEY"),
             callback_manager=callback_manager
         )
+        # Use a smaller model for the embeddings
         embeddings = HuggingFaceEmbeddings(
-            model_name="intfloat/multilingual-e5-…"
+            model_name="intfloat/multilingual-e5-small-instruct"
         )
         return llm, embeddings
     except Exception as e:
         raise Exception(f"Model initialization failed: {str(e)}")
 
-def …
+async def fetch_url(session, url):
+    cache_file = os.path.join(CACHE_DIR, f"{url.replace('/', '_').replace(':', '_')}.html")
+
+    # Check the cache first
+    if os.path.exists(cache_file):
+        with open(cache_file, 'r', encoding='utf-8') as f:
+            return url, f.read()
+
     try:
-        …
+        async with session.get(url, ssl=False, timeout=30) as response:
+            if response.status == 200:
+                content = await response.text()
+                # Save to the cache
+                with open(cache_file, 'w', encoding='utf-8') as f:
+                    f.write(content)
+                return url, content
+            else:
+                logger.warning(f"Failed to load {url}, status code: {response.status}")
+                return url, None
     except Exception as e:
-        …
-        return
+        logger.error(f"Error fetching {url}: {str(e)}")
+        return url, None
 
-def load_url_content(url):
-    …
-    if response.status_code != 200:
-        print(f"Failed to load {url}, status code: {response.status_code}")
-        return []
-
-    soup = BeautifulSoup(response.text, 'html.parser')
-    …
-    text = soup.get_text()
-    …
-    except Exception as e:
-        print(f"Error processing {url}: {str(e)}")
-        return []
+def process_html_content(url, html_content):
+    if not html_content:
+        return None
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Remove script and style elements
+    for script in soup(["script", "style"]):
+        script.decompose()
+
+    # Get text content
+    text = soup.get_text()
+
+    # Clean up text
+    lines = (line.strip() for line in text.splitlines())
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = ' '.join(chunk for chunk in chunks if chunk)
+
+    if not text.strip():
+        return None
+
+    return Document(page_content=text, metadata={"source": url})
+
+async def load_all_urls(urls_to_process):
+    documents = []
+
+    async with aiohttp.ClientSession() as session:
+        tasks = [fetch_url(session, url) for url in urls_to_process]
+        results = await asyncio.gather(*tasks)
+
+        for url, content in results:
+            if content:
+                doc = process_html_content(url, content)
+                if doc:
+                    documents.append(doc)
+                    logger.info(f"Successfully processed content from {url}")
+                else:
+                    logger.warning(f"No useful content extracted from {url}")
+            else:
+                logger.warning(f"Failed to load content from {url}")
+
+    return documents
 
-def build_knowledge_base(embeddings):
+async def build_knowledge_base_async(embeddings, force_rebuild=False):
+    """
+    Build the knowledge base asynchronously.
+    force_rebuild forces a refresh of all URLs.
+    """
     try:
         logger.info("Starting knowledge base construction...")
         kb_config = get_kb_config()
-        documents = []
-        os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
 
         # Determine which URLs to process
-        urls_to_process = …
+        if force_rebuild:
+            urls_to_process = URLS
+            kb_config["processed_urls"] = []
+            logger.info("Forcing rebuild of entire knowledge base")
+        else:
+            urls_to_process = [url for url in URLS if url not in kb_config["processed_urls"]]
 
         if not urls_to_process:
             logger.info("No new URLs to process")
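As an aside, a minimal sketch of driving the new async loaders outside FastAPI (hypothetical module name; assumes the Document import noted earlier; not part of the commit):

    import asyncio
    from app import load_all_urls  # module name is an assumption

    # fetch_url caches each page on disk: https://status.law/faq is stored as
    # cache/https___status.law_faq.html after the two replace() calls.
    docs = asyncio.run(load_all_urls(["https://status.law/faq"]))
    print(f"{len(docs)} documents loaded")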
@@ -184,41 +210,27 @@ def build_knowledge_base(embeddings):
 
         logger.info(f"Processing {len(urls_to_process)} new URLs")
 
-        available_urls = …
-        logger.info(f"Accessible URLs: {len(available_urls)} out of {len(urls_to_process)}")
-
-        for url in available_urls:
-            try:
-                logger.info(f"Processing {url}")
-                docs = load_url_content(url)
-                if docs:
-                    documents.extend(docs)
-                    kb_config["processed_urls"].append(url)
-                    logger.info(f"Successfully loaded content from {url}")
-                else:
-                    logger.warning(f"No content extracted from {url}")
-            except Exception as e:
-                logger.error(f"Failed to process {url}: {str(e)}")
-                continue
+        documents = await load_all_urls(urls_to_process)
 
         if not documents:
-            if kb_config["processed_urls"]:
+            if kb_config["processed_urls"] and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
                 logger.info("No new documents to add, loading existing vector store")
                 return FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
             raise Exception("No documents were successfully loaded!")
 
         logger.info(f"Total new documents loaded: {len(documents)}")
 
+        # Use larger chunks
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=…,
-            chunk_overlap=…
+            chunk_size=2500,  # increased chunk size
+            chunk_overlap=100
         )
         logger.info("Splitting documents into chunks...")
         chunks = text_splitter.split_documents(documents)
         logger.info(f"Created {len(chunks)} chunks")
 
-        # If an existing store is present
-        if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
+        # If a knowledge base already exists and this is not a full rebuild, append to it
+        if not force_rebuild and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
             logger.info("Loading existing vector store...")
             vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
             logger.info("Adding new documents to existing vector store...")
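The splitter change is easy to check in isolation; a small sketch with dummy text (illustrative only):

    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # Same settings as the commit: ~2500-character chunks with a
    # 100-character overlap between consecutive chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=100)
    pieces = splitter.split_text("lorem ipsum dolor sit amet " * 500)
    print(f"{len(pieces)} chunks, longest is {max(len(p) for p in pieces)} chars")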
@@ -231,6 +243,10 @@ def build_knowledge_base(embeddings):
         vector_store.save_local(folder_path=VECTOR_STORE_PATH, index_name="index")
 
         # Update the configuration
+        for url in urls_to_process:
+            if url not in kb_config["processed_urls"]:
+                kb_config["processed_urls"].append(url)
+
         kb_config["version"] += 1
         kb_config["last_update"] = datetime.now().isoformat()
         save_kb_config(kb_config)
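With this change, kb_config.json tracks every URL folded into the store. As far as this diff shows, its shape is roughly the following (field values are illustrative, and the unchanged get_kb_config default may hold more keys):

    {
        "version": 3,
        "last_update": "2024-01-01T12:00:00",
        "processed_urls": ["https://status.law/faq"]
    }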
@@ -244,10 +260,14 @@ def build_knowledge_base(embeddings):
         raise Exception(f"Knowledge base creation failed: {str(e)}")
 
 # Initialize models and knowledge base on startup
-try:
-    llm, embeddings = init_models()
-    vector_store = None
+llm, embeddings = init_models()
+vector_store = None
 
+@app.on_event("startup")
+async def startup_event():
+    global vector_store
+
+    # Only load an existing knowledge base on startup; do not build a new one
     if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
         try:
             vector_store = FAISS.load_local(
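A side note on the startup hook: @app.on_event("startup") still works but is deprecated in current FastAPI in favor of a lifespan handler. An equivalent sketch (alternative API, same behavior; not what the commit uses):

    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # load the existing FAISS index here, exactly as startup_event does
        yield  # the application serves requests between startup and shutdown

    app = FastAPI(title="Status Law Assistant API", lifespan=lifespan)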
@@ -257,28 +277,26 @@ try:
             )
             logger.info("Successfully loaded existing knowledge base")
         except Exception as e:
-            logger.warning(f"Could not load existing knowledge base…")
+            logger.warning(f"Could not load existing knowledge base: {str(e)}")
             vector_store = None
     else:
-        logger.…
-
-    if vector_store is None:
-        logger.info("Building new knowledge base...")
-        vector_store = build_knowledge_base(embeddings)
-        logger.info("Knowledge base built successfully")
+        logger.warning("No existing knowledge base found, please use /rebuild-kb endpoint to create one")
 
-except Exception as e:
-    logger.error(f"Critical initialization error: {str(e)}")
-    logger.error(traceback.format_exc())
-    raise
-
-# API endpoints
 # API endpoints
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(request: ChatRequest):
+    global vector_store
+
+    # Check that the knowledge base is initialized
+    if vector_store is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Knowledge base not initialized. Please use /rebuild-kb endpoint first."
+        )
+
     try:
         # Retrieve context
-        context_docs = vector_store.similarity_search(request.message)
+        context_docs = vector_store.similarity_search(request.message, k=3)  # limit the number of documents
         context_text = "\n".join([d.page_content for d in context_docs])
 
         # Generate response
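For completeness, a client-side sketch of the guarded endpoint (hypothetical host and port; the body uses the message field of ChatRequest, the only field visible in this diff):

    import asyncio
    import aiohttp

    async def ask(question: str):
        async with aiohttp.ClientSession() as session:
            async with session.post("http://localhost:8000/chat",
                                    json={"message": question}) as resp:
                # 503 until /rebuild-kb has populated the vector store
                print(resp.status, await resp.json())

    asyncio.run(ask("What services does Status Law offer?"))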
@@ -315,19 +333,39 @@ async def chat_endpoint(request: ChatRequest):
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/rebuild-kb")
-async def rebuild_knowledge_base():
+async def rebuild_knowledge_base(background_tasks: BackgroundTasks, force: bool = False):
+    """
+    Rebuild the knowledge base in the background.
+
+    - force: if True, rebuilds the entire knowledge base from scratch
+    """
+    global vector_store
+
     try:
-        …
+        # Run the rebuild in the background
+        background_tasks.add_task(_rebuild_kb_task, force)
+        action = "rebuild" if force else "update"
+        return {"status": "success", "message": f"Knowledge base {action} started in background"}
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
+async def _rebuild_kb_task(force: bool = False):
+    """Background task that refreshes the knowledge base"""
+    global vector_store
+    try:
+        vector_store = await build_knowledge_base_async(embeddings, force_rebuild=force)
+        logger.info("Knowledge base rebuild completed successfully")
+    except Exception as e:
+        logger.error(f"Knowledge base rebuild failed: {str(e)}")
+
 @app.get("/kb-status")
 async def get_kb_status():
     """Get current knowledge base status"""
+    global vector_store
+
     kb_config = get_kb_config()
     return {
+        "initialized": vector_store is not None,
         "version": kb_config["version"],
         "total_urls": len(URLS),
         "processed_urls": len(kb_config["processed_urls"]),
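And a matching sketch for kicking off and checking a rebuild (hypothetical host and port; illustrative only):

    import asyncio
    import aiohttp

    async def rebuild_and_check():
        async with aiohttp.ClientSession() as session:
            # force=true clears processed_urls and re-fetches every URL
            async with session.post(
                    "http://localhost:8000/rebuild-kb?force=true") as resp:
                print(await resp.json())  # {"status": "success", ...}
            # the rebuild runs as a BackgroundTask; poll /kb-status for progress
            async with session.get("http://localhost:8000/kb-status") as resp:
                print(await resp.json())  # now includes "initialized"

    asyncio.run(rebuild_and_check())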
@@ -356,4 +394,4 @@ def log_interaction(user_input: str, bot_response: str, context: str):
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)