update
- components/llm/common.py +2 -1
- components/llm/deepinfra_api.py +48 -30
- routes/llm.py +29 -7
components/llm/common.py
CHANGED
@@ -72,8 +72,9 @@ class LlmApi:
 class Message(BaseModel):
     role: str
     content: str
-    searchResults: str
+    searchResults: Optional[str] = ''
     searchEntities: Optional[List[str]] = []
+    reasoning: Optional[str] = ''

 class ChatRequest(BaseModel):
     history: List[Message]
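
A minimal sketch of how the updated Message model behaves after this change. The class definitions mirror the new version of common.py above; the example values and variable names are illustrative only.

from typing import List, Optional
from pydantic import BaseModel

class Message(BaseModel):
    role: str
    content: str
    searchResults: Optional[str] = ''
    searchEntities: Optional[List[str]] = []
    reasoning: Optional[str] = ''

class ChatRequest(BaseModel):
    history: List[Message]

# Both new optional fields default to an empty string, so callers that only
# pass role and content keep working unchanged.
plain = Message(role="user", content="What changed in this release?")
assert plain.searchResults == '' and plain.reasoning == ''

# Callers that have retrieval output or query-expansion notes can attach them.
enriched = Message(
    role="user",
    content="What changed in this release?",
    searchResults="[Source] - retrieved passage",
    reasoning="query-expansion debug output",
)
request = ChatRequest(history=[plain, enriched])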
components/llm/deepinfra_api.py
CHANGED
@@ -257,7 +257,7 @@ class DeepInfraApi(LlmApi):
        logging.error(f"Request failed: status code {response.status_code}")
        logging.error(response.text)

-    async def predict_chat_stream(self, request: ChatRequest, system_prompt, params: LlmPredictParams, max_retries: int =
+    async def predict_chat_stream(self, request: ChatRequest, system_prompt, params: LlmPredictParams, max_retries: int = 5, retry_delay: float = 2) -> str:
        """
        Executes a request to the API with streaming output (SSE) support and returns the result.

@@ -271,7 +271,9 @@ class DeepInfraApi(LlmApi):
        Returns:
            str: The generated text.
        """
-
+
+        timeout = httpx.Timeout(connect=30.0, read=None, pool=None, write=None, timeout=None)
+        async with httpx.AsyncClient(timeout=timeout) as client:
            request = self.create_chat_request(request, system_prompt, params)
            request["stream"] = True

@@ -312,45 +314,61 @@ class DeepInfraApi(LlmApi):


    async def get_predict_chat_generator(self, request: ChatRequest, system_prompt: str,
-
+                                         params: LlmPredictParams, max_retries: int = 5, retry_delay: float = 2) -> AsyncGenerator[str, None]:
        """
-        Executes a streaming request to the API and returns tokens as they
+        Executes a streaming request to the API and returns tokens as they are generated, with reconnection on errors.

        Args:
            request (ChatRequest): Chat history.
            system_prompt (str): System prompt.
            params (LlmPredictParams): Prediction parameters.
+            max_retries (int): Maximum number of reconnection attempts.
+            retry_delay (float): Delay between attempts, in seconds.

        Yields:
            str: LLM response tokens.
        """
+        print(request.history)
        timeout = httpx.Timeout(connect=30.0, read=None, pool=None, write=None, timeout=None)
-       [26 removed lines of the previous streaming implementation; their contents are not shown in the diff view]
+        attempt = 0
+
+        while attempt < max_retries + 1:
+            try:
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    request_data = self.create_chat_request(request, system_prompt, params)
+                    request_data["stream"] = True
+
+                    async with client.stream(
+                        "POST",
+                        f"{self.params.url}/v1/openai/chat/completions",
+                        json=request_data,
+                        headers=super().create_headers(),
+                    ) as response:
+                        if response.status_code != 200:
+                            error_content = await response.aread()
+                            raise Exception(f"API error: {error_content.decode('utf-8')}")
+
+                        async for line in response.aiter_lines():
+                            if line.startswith("data: "):
+                                try:
+                                    data = json.loads(line[len("data: "):].strip())
+                                    if data == "[DONE]":
+                                        return  # finish generation successfully
+                                    if "choices" in data and data["choices"]:
+                                        token_value = data["choices"][0].get("delta", {}).get("content", "")
+                                        if token_value:
+                                            yield token_value
+                                except json.JSONDecodeError:
+                                    continue
+                        return  # stream processed successfully
+
+            except Exception as e:
+                attempt += 1
+                if attempt == max_retries + 1:
+                    raise Exception(f"predict_chat_stream failed after {max_retries} retries: {str(e)}")
+                # wait before the next attempt
+                await asyncio.sleep(retry_delay)
+

    async def predict(self, prompt: str, system_prompt: str) -> str:
        """
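
The new get_predict_chat_generator body wraps the whole SSE read in a reconnect loop: each attempt opens a fresh httpx stream, yields delta.content tokens as they arrive, and stops on the [DONE] sentinel; any exception consumes one attempt until max_retries is exhausted. Below is a minimal, self-contained sketch of that pattern against a generic OpenAI-style SSE endpoint; the url, payload and headers arguments are placeholders rather than the project's actual configuration, and the [DONE] check happens before JSON parsing because the sentinel is not valid JSON.

import asyncio
import json
from typing import AsyncGenerator

import httpx


async def stream_tokens(url: str, payload: dict, headers: dict,
                        max_retries: int = 5, retry_delay: float = 2) -> AsyncGenerator[str, None]:
    # No read timeout: an SSE response can legitimately stay open for a long time.
    timeout = httpx.Timeout(connect=30.0, read=None, write=None, pool=None)
    attempt = 0
    while attempt < max_retries + 1:
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                async with client.stream("POST", url, json=payload, headers=headers) as response:
                    if response.status_code != 200:
                        raise RuntimeError((await response.aread()).decode("utf-8"))
                    async for line in response.aiter_lines():
                        if not line.startswith("data: "):
                            continue
                        chunk = line[len("data: "):].strip()
                        if chunk == "[DONE]":
                            return  # generation finished
                        try:
                            data = json.loads(chunk)
                        except json.JSONDecodeError:
                            continue  # keep-alive or partial line
                        choices = data.get("choices") or []
                        token = choices[0].get("delta", {}).get("content") if choices else None
                        if token:
                            yield token
            return  # stream closed without [DONE]; treat as finished
        except Exception as e:
            attempt += 1
            if attempt == max_retries + 1:
                raise RuntimeError(f"streaming failed after {max_retries} retries") from e
            await asyncio.sleep(retry_delay)  # wait before reconnecting

Because tokens are yielded as they arrive, a reconnect in the middle of a response restarts generation from scratch, so a consumer can see duplicated output; the diff's retry-the-whole-request approach carries the same trade-off.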
routes/llm.py
CHANGED
@@ -82,12 +82,21 @@ def try_insert_search_results(
        return True
    return False

+def try_insert_reasoning(
+    chat_request: ChatRequest, reasoning: str
+):
+    for msg in reversed(chat_request.history):
+        if msg.role == "user":
+            msg.reasoning = reasoning
+
def collapse_history_to_first_message(chat_request: ChatRequest) -> ChatRequest:
    """
    Collapses the history into the first message and returns a new ChatRequest object.
    Format:
-    <search-results>[Source] - text</search-results>
    role: message text
+    <reasoning>[Source] - text</reasoning>
+    <search-results>[Source] - text</search-results>
+
    """
    if not chat_request.history:
        return ChatRequest(history=[])
@@ -95,12 +104,15 @@ def collapse_history_to_first_message(chat_request: ChatRequest) -> ChatRequest:
    # Collect the history into a single string
    collapsed_content = []
    for msg in chat_request.history:
-        # Add search results, if present
-        if msg.searchResults:
-            collapsed_content.append(f"<search-results>{msg.searchResults}</search-results>")
        # Add the message text with its role
        if msg.content.strip():
-            collapsed_content.append(f"{msg.role}: {msg.content.strip()}")
+            collapsed_content.append(f"{msg.role.strip()}: {msg.content.strip()}")
+        # Add reasoning, if present
+        if msg.reasoning.strip():
+            collapsed_content.append(f"<reasoning>{msg.reasoning}</reasoning>")
+        # Add search results, if present
+        if msg.searchResults.strip():
+            collapsed_content.append(f"<search-results>{msg.searchResults}</search-results>")

    # Build the final text with line breaks
    new_content = "\n".join(collapsed_content)
@@ -122,9 +134,19 @@ async def sse_generator(request: ChatRequest, llm_api: DeepInfraApi, system_prom
    Generator for streaming the LLM response via SSE.
    """
    try:
-        qe_result = await dialogue_service.get_qe_result(request.history)
+        qe_result = await dialogue_service.get_qe_result(request.history)
+        try_insert_reasoning(request, qe_result.debug_message)
+
+        # qe_debug_event = {
+        #     "event": "debug",
+        #     "data": {
+        #         "text": qe_result.debug_message
+        #     }
+        # }
+        # yield f"data: {json.dumps(qe_debug_event, ensure_ascii=False)}\n\n"
+
        qe_event = {
-            "event": "
+            "event": "reasoning",
            "data": {
                "text": qe_result.debug_message
            }
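
A quick illustration of the reworked collapse order in routes/llm.py: for each message the role-prefixed text comes first, followed by a <reasoning> block and then a <search-results> block. The helper below re-implements only the string-building loop shown in the diff over a stand-in dataclass; the message contents are made up.

from dataclasses import dataclass


@dataclass
class Msg:
    # Stand-in for the Message model in components/llm/common.py.
    role: str
    content: str
    reasoning: str = ''
    searchResults: str = ''


def collapse(history: list) -> str:
    # Same per-message ordering as the new loop: text, then reasoning, then search results.
    collapsed = []
    for msg in history:
        if msg.content.strip():
            collapsed.append(f"{msg.role.strip()}: {msg.content.strip()}")
        if msg.reasoning.strip():
            collapsed.append(f"<reasoning>{msg.reasoning}</reasoning>")
        if msg.searchResults.strip():
            collapsed.append(f"<search-results>{msg.searchResults}</search-results>")
    return "\n".join(collapsed)


history = [
    Msg(role="user", content="first question", searchResults="[Source] - retrieved passage"),
    Msg(role="assistant", content="first answer"),
    Msg(role="user", content="follow-up question", reasoning="query-expansion debug output"),
]
print(collapse(history))
# user: first question
# <search-results>[Source] - retrieved passage</search-results>
# assistant: first answer
# user: follow-up question
# <reasoning>query-expansion debug output</reasoning>

Calling .strip() directly on reasoning and searchResults is safe only because the new fields in common.py default to '' rather than None; a client that explicitly sends null for either field would still hit an AttributeError in that loop.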