update
- components/llm/common.py +2 -1
- components/llm/deepinfra_api.py +48 -30
- routes/llm.py +29 -7
components/llm/common.py
CHANGED
@@ -72,8 +72,9 @@ class LlmApi:
 class Message(BaseModel):
     role: str
     content: str
-    searchResults: str
+    searchResults: Optional[str] = ''
     searchEntities: Optional[List[str]] = []
+    reasoning: Optional[str] = ''

 class ChatRequest(BaseModel):
     history: List[Message]
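
A minimal sketch of how the updated Message model behaves after this change. The class definitions mirror the new version of common.py above; the example values and variable names are illustrative only.

from typing import List, Optional
from pydantic import BaseModel

class Message(BaseModel):
    role: str
    content: str
    searchResults: Optional[str] = ''
    searchEntities: Optional[List[str]] = []
    reasoning: Optional[str] = ''

class ChatRequest(BaseModel):
    history: List[Message]

# Both new optional fields default to an empty string, so callers that only
# pass role and content keep working unchanged.
plain = Message(role="user", content="What changed in this release?")
assert plain.searchResults == '' and plain.reasoning == ''

# Callers that have retrieval output or query-expansion notes can attach them.
enriched = Message(
    role="user",
    content="What changed in this release?",
    searchResults="[Source] - retrieved passage",
    reasoning="query-expansion debug output",
)
request = ChatRequest(history=[plain, enriched])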
components/llm/deepinfra_api.py
CHANGED
@@ -257,7 +257,7 @@ class DeepInfraApi(LlmApi):
        logging.error(f"Request failed: status code {response.status_code}")
        logging.error(response.text)

-    async def predict_chat_stream(self, request: ChatRequest, system_prompt, params: LlmPredictParams, max_retries: int =
+    async def predict_chat_stream(self, request: ChatRequest, system_prompt, params: LlmPredictParams, max_retries: int = 5, retry_delay: float = 2) -> str:
        """
        Executes a request to the API with streaming output (SSE) support and returns the result.

@@ -271,7 +271,9 @@ class DeepInfraApi(LlmApi):
        Returns:
            str: The generated text.
        """
-
+
+        timeout = httpx.Timeout(connect=30.0, read=None, pool=None, write=None, timeout=None)
+        async with httpx.AsyncClient(timeout=timeout) as client:
            request = self.create_chat_request(request, system_prompt, params)
            request["stream"] = True

@@ -312,45 +314,61 @@ class DeepInfraApi(LlmApi):


    async def get_predict_chat_generator(self, request: ChatRequest, system_prompt: str,
-
+                                         params: LlmPredictParams, max_retries: int = 5, retry_delay: float = 2) -> AsyncGenerator[str, None]:
        """
-        Executes a streaming request to the API and returns tokens as they
+        Executes a streaming request to the API and returns tokens as they are generated, with reconnection on errors.

        Args:
            request (ChatRequest): Chat history.
            system_prompt (str): System prompt.
            params (LlmPredictParams): Prediction parameters.
+            max_retries (int): Maximum number of reconnection attempts.
+            retry_delay (float): Delay between attempts, in seconds.

        Yields:
            str: LLM response tokens.
        """
+        print(request.history)
        timeout = httpx.Timeout(connect=30.0, read=None, pool=None, write=None, timeout=None)
-       [26 removed lines of the previous streaming implementation; their contents are not shown in the diff view]
+        attempt = 0
+
+        while attempt < max_retries + 1:
+            try:
+                async with httpx.AsyncClient(timeout=timeout) as client:
+                    request_data = self.create_chat_request(request, system_prompt, params)
+                    request_data["stream"] = True
+
+                    async with client.stream(
+                        "POST",
+                        f"{self.params.url}/v1/openai/chat/completions",
+                        json=request_data,
+                        headers=super().create_headers(),
+                    ) as response:
+                        if response.status_code != 200:
+                            error_content = await response.aread()
+                            raise Exception(f"API error: {error_content.decode('utf-8')}")
+
+                        async for line in response.aiter_lines():
+                            if line.startswith("data: "):
+                                try:
+                                    data = json.loads(line[len("data: "):].strip())
+                                    if data == "[DONE]":
+                                        return  # finish generation successfully
+                                    if "choices" in data and data["choices"]:
+                                        token_value = data["choices"][0].get("delta", {}).get("content", "")
+                                        if token_value:
+                                            yield token_value
+                                except json.JSONDecodeError:
+                                    continue
+                        return  # stream processed successfully
+
+            except Exception as e:
+                attempt += 1
+                if attempt == max_retries + 1:
+                    raise Exception(f"predict_chat_stream failed after {max_retries} retries: {str(e)}")
+                # wait before the next attempt
+                await asyncio.sleep(retry_delay)
+

    async def predict(self, prompt: str, system_prompt: str) -> str:
        """
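
The new get_predict_chat_generator body wraps the whole SSE read in a reconnect loop: each attempt opens a fresh httpx stream, yields delta.content tokens as they arrive, and stops on the [DONE] sentinel; any exception consumes one attempt until max_retries is exhausted. Below is a minimal, self-contained sketch of that pattern against a generic OpenAI-style SSE endpoint; the url, payload and headers arguments are placeholders rather than the project's actual configuration, and the [DONE] check happens before JSON parsing because the sentinel is not valid JSON.

import asyncio
import json
from typing import AsyncGenerator

import httpx


async def stream_tokens(url: str, payload: dict, headers: dict,
                        max_retries: int = 5, retry_delay: float = 2) -> AsyncGenerator[str, None]:
    # No read timeout: an SSE response can legitimately stay open for a long time.
    timeout = httpx.Timeout(connect=30.0, read=None, write=None, pool=None)
    attempt = 0
    while attempt < max_retries + 1:
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                async with client.stream("POST", url, json=payload, headers=headers) as response:
                    if response.status_code != 200:
                        raise RuntimeError((await response.aread()).decode("utf-8"))
                    async for line in response.aiter_lines():
                        if not line.startswith("data: "):
                            continue
                        chunk = line[len("data: "):].strip()
                        if chunk == "[DONE]":
                            return  # generation finished
                        try:
                            data = json.loads(chunk)
                        except json.JSONDecodeError:
                            continue  # keep-alive or partial line
                        choices = data.get("choices") or []
                        token = choices[0].get("delta", {}).get("content") if choices else None
                        if token:
                            yield token
            return  # stream closed without [DONE]; treat as finished
        except Exception as e:
            attempt += 1
            if attempt == max_retries + 1:
                raise RuntimeError(f"streaming failed after {max_retries} retries") from e
            await asyncio.sleep(retry_delay)  # wait before reconnecting

Because tokens are yielded as they arrive, a reconnect in the middle of a response restarts generation from scratch, so a consumer can see duplicated output; the diff's retry-the-whole-request approach carries the same trade-off.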
routes/llm.py
CHANGED
@@ -82,12 +82,21 @@ def try_insert_search_results(
        return True
    return False

+def try_insert_reasoning(
+    chat_request: ChatRequest, reasoning: str
+):
+    for msg in reversed(chat_request.history):
+        if msg.role == "user":
+            msg.reasoning = reasoning
+
def collapse_history_to_first_message(chat_request: ChatRequest) -> ChatRequest:
    """
    Collapses the history into the first message and returns a new ChatRequest object.
    Format:
-    <search-results>[Source] - text</search-results>
    role: message text
+    <reasoning>[Source] - text</reasoning>
+    <search-results>[Source] - text</search-results>
+
    """
    if not chat_request.history:
        return ChatRequest(history=[])
@@ -95,12 +104,15 @@ def collapse_history_to_first_message(chat_request: ChatRequest) -> ChatRequest:
    # Collect the history into a single string
    collapsed_content = []
    for msg in chat_request.history:
-        # Add search results, if present
-        if msg.searchResults:
-            collapsed_content.append(f"<search-results>{msg.searchResults}</search-results>")
        # Add the message text with its role
        if msg.content.strip():
-            collapsed_content.append(f"{msg.role}: {msg.content.strip()}")
+            collapsed_content.append(f"{msg.role.strip()}: {msg.content.strip()}")
+        # Add reasoning, if present
+        if msg.reasoning.strip():
+            collapsed_content.append(f"<reasoning>{msg.reasoning}</reasoning>")
+        # Add search results, if present
+        if msg.searchResults.strip():
+            collapsed_content.append(f"<search-results>{msg.searchResults}</search-results>")

    # Build the final text with line breaks
    new_content = "\n".join(collapsed_content)
@@ -122,9 +134,19 @@ async def sse_generator(request: ChatRequest, llm_api: DeepInfraApi, system_prom
    Generator for streaming the LLM response via SSE.
    """
    try:
-        qe_result = await dialogue_service.get_qe_result(request.history)
+        qe_result = await dialogue_service.get_qe_result(request.history)
+        try_insert_reasoning(request, qe_result.debug_message)
+
+        # qe_debug_event = {
+        #     "event": "debug",
+        #     "data": {
+        #         "text": qe_result.debug_message
+        #     }
+        # }
+        # yield f"data: {json.dumps(qe_debug_event, ensure_ascii=False)}\n\n"
+
        qe_event = {
-            "event": "
+            "event": "reasoning",
            "data": {
                "text": qe_result.debug_message
            }
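
A quick illustration of the reworked collapse order in routes/llm.py: for each message the role-prefixed text comes first, followed by a <reasoning> block and then a <search-results> block. The helper below re-implements only the string-building loop shown in the diff over a stand-in dataclass; the message contents are made up.

from dataclasses import dataclass


@dataclass
class Msg:
    # Stand-in for the Message model in components/llm/common.py.
    role: str
    content: str
    reasoning: str = ''
    searchResults: str = ''


def collapse(history: list) -> str:
    # Same per-message ordering as the new loop: text, then reasoning, then search results.
    collapsed = []
    for msg in history:
        if msg.content.strip():
            collapsed.append(f"{msg.role.strip()}: {msg.content.strip()}")
        if msg.reasoning.strip():
            collapsed.append(f"<reasoning>{msg.reasoning}</reasoning>")
        if msg.searchResults.strip():
            collapsed.append(f"<search-results>{msg.searchResults}</search-results>")
    return "\n".join(collapsed)


history = [
    Msg(role="user", content="first question", searchResults="[Source] - retrieved passage"),
    Msg(role="assistant", content="first answer"),
    Msg(role="user", content="follow-up question", reasoning="query-expansion debug output"),
]
print(collapse(history))
# user: first question
# <search-results>[Source] - retrieved passage</search-results>
# assistant: first answer
# user: follow-up question
# <reasoning>query-expansion debug output</reasoning>

Calling .strip() directly on reasoning and searchResults is safe only because the new fields in common.py default to '' rather than None; a client that explicitly sends null for either field would still hit an AttributeError in that loop.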