Hjgugugjhuhjggg committed on
Commit 61dff03 (verified)
1 Parent(s): ecaa77c

Create app.py

Files changed (1)
  1. app.py +258 -0
app.py ADDED
@@ -0,0 +1,258 @@
+ import os
+ import subprocess
+ import threading
+ import time
+ import asyncio
+ import uvicorn
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from langchain.prompts import PromptTemplate
+ from langchain_community.chat_models import ChatOllama
+ from langchain_community.tools import DuckDuckGoSearchRun
+ from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
+ from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
+ from langgraph.graph import END, StateGraph
+ from typing_extensions import TypedDict
+ from fastapi.responses import StreamingResponse
+ from ollama import AsyncClient
+ import gc
+ import psutil
+ import torch
+ from functools import lru_cache
+
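+ # Overview: this app starts a local Ollama server, pulls a GGUF model, and
+ # exposes a FastAPI endpoint that routes each question through a small
+ # LangGraph state machine (optional DuckDuckGo web search, then generation).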
+ # Make sure the required libraries are installed:
+ # pip install ollama langchain langchain_community langgraph
+
+ # Path to the Ollama binary
+ OLLAMA = os.path.expanduser("~/ollama")
+ if not os.path.exists(OLLAMA):
+     print("Ollama not found, downloading...")
+     subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
+     os.chmod(OLLAMA, 0o755)
+
+ # Run the Ollama server in the background
+ def ollama_service_thread():
+     print("Starting the Ollama service")
+     subprocess.run("~/ollama serve", shell=True)  # Blocks for as long as the server runs
+
+ # Start the Ollama service thread
+ print("Creating and starting the Ollama service thread")
+ OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread, daemon=True)
+ OLLAMA_SERVICE_THREAD.start()
+
+ # Wait for Ollama to start
+ print("Waiting for Ollama to start...")
+ time.sleep(10)
+
+ # Pull the model from Hugging Face if it is not available locally
+ def download_ollama_model(model_name='hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'):
+     try:
+         print(f"Downloading model: {model_name}")
+         subprocess.run([OLLAMA, "pull", model_name], check=True)  # Use the downloaded binary
+     except subprocess.CalledProcessError as e:
+         print(f"Failed to download the model: {e}")
+         raise
+
+ # Download the Ollama model in the main thread
+ download_ollama_model("hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S")
+
+ # Create the FastAPI application
+ app = FastAPI()
+
+ # Data model for queries received by the API
+ class QueryRequest(BaseModel):
+     query: str
+
+ # Ollama language model
+ local_llm = 'hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'
+ llama3 = ChatOllama(model=local_llm)
+
+ # Web search tool backed by DuckDuckGo
+ wrapper = DuckDuckGoSearchAPIWrapper(max_results=1)
+ web_search_tool = DuckDuckGoSearchRun(api_wrapper=wrapper)
+
+ # Cache previous search results with a bounded LRU cache.
+ # The cached function is synchronous (lru_cache does not work with coroutines);
+ # async callers run it in a worker thread.
+ @lru_cache(maxsize=1024)
+ def cached_search(query):
+     return web_search_tool.invoke(query)
+
+ # Prompts for generation and routing
+ generate_prompt = PromptTemplate(
+     template="""
+ <|begin_of_text|>
+ <|start_header_id|>system<|end_header_id|>
+ You are an AI assistant for Research Question Tasks that synthesizes web search results.
+ Strictly use the following pieces of web search context to answer the question. If you don't know the answer, just say that you don't know.
+ Keep the answer concise, but provide all of the details you can in the form of a research report.
+ Only make direct references to material if provided in the context.
+ <|eot_id|>
+ <|start_header_id|>user<|end_header_id|>
+ Question: {question}
+ Web Search Context: {context}
+ Answer:
+ <|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>""",
+     input_variables=["question", "context"],
+ )
+
+ generate_chain = generate_prompt | llama3 | StrOutputParser()
+
+ router_prompt = PromptTemplate(
+     template="""
+ <|begin_of_text|>
+ <|start_header_id|>system<|end_header_id|>
+ You are an expert at routing a user question to either the generation stage or web search.
+ Use the web search for questions that require more context for a better answer, or recent events.
+ Otherwise, you can skip and go straight to the generation phase to respond.
+ You do not need to be stringent with the keywords in the question related to these topics.
+ Give a binary choice 'web_search' or 'generate' based on the question.
+ Return the JSON with a single key 'choice' with no preamble or explanation.
+ Question to route: {question}
+ <|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>""",
+     input_variables=["question"],
+ )
+
+ question_router = router_prompt | llama3 | JsonOutputParser()
+
+ query_prompt = PromptTemplate(
+     template="""
+ <|begin_of_text|>
+ <|start_header_id|>system<|end_header_id|>
+ You are an expert at crafting web search queries for research questions.
+ More often than not, a user will ask a basic question that they wish to learn more about; however, it might not be in the best format.
+ Reword their query to be the most effective web search string possible.
+ Return the JSON with a single key 'query' with no preamble or explanation.
+ Question to transform: {question}
+ <|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>""",
+     input_variables=["question"],
+ )
+
+ query_chain = query_prompt | llama3 | JsonOutputParser()
+
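+ # Note: question_router and query_chain use JsonOutputParser, so they rely on
+ # the model returning bare JSON such as {"choice": "web_search"} or
+ # {"query": "..."}; malformed output will raise a parsing error.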
+ # Define the graph state
+ class GraphState(TypedDict):
+     question: str
+     generation: str
+     search_query: str
+     context: str
+
+ # Processing nodes
+ async def generate(state):
+     print("Step: Generating Final Response")
+     question = state["question"]
+     context = state["context"]
+     generation = await generate_chain.ainvoke({"context": context, "question": question})
+     return {"generation": generation}
+
+ async def transform_query(state):
+     print("Step: Optimizing Query for Web Search")
+     question = state['question']
+     gen_query = await query_chain.ainvoke({"question": question})
+     search_query = gen_query.get("query", "")  # Make sure we read the expected key
+     return {"search_query": search_query}
+
+ async def web_search(state):
+     search_query = state['search_query']
+     print(f'Step: Searching the Web for: "{search_query}"')
+     try:
+         # Use the cache to avoid repeating identical searches; the blocking
+         # search runs in a worker thread so the event loop stays responsive
+         search_result = await asyncio.to_thread(cached_search, search_query)
+         if isinstance(search_result, str):  # A plain string is used as the context directly
+             print(f"Web search returned a string: {search_result}")
+             return {"context": search_result}
+         elif isinstance(search_result, dict):  # A dict is used as-is
+             return {"context": search_result}
+         else:
+             raise ValueError("Web search returned an invalid response")
+     except Exception as e:
+         print(f"Web search failed: {e}")
+         return {"context": ""}  # If the search fails, continue with an empty context
+
+ async def route_question(state):
+     print("Step: Routing Query")
+     question = state['question']
+     output = await question_router.ainvoke({"question": question})
+     if output.get('choice') == "web_search":
+         print("Step: Routing Query to Web Search")
+         return "websearch"
+     else:
+         # Default to generation when the router answers 'generate' or anything unexpected
+         print("Step: Routing Query to Generation")
+         return "generate"
+
+ # Build the state graph
+ workflow = StateGraph(GraphState)
+ workflow.add_node("websearch", web_search)
+ workflow.add_node("transform_query", transform_query)
+ workflow.add_node("generate", generate)
+
+ workflow.set_conditional_entry_point(
+     route_question,
+     {
+         "websearch": "transform_query",
+         "generate": "generate",
+     },
+ )
+
+ # Wire the remaining edges (transform the query, search, then generate) and
+ # compile the graph so it can be invoked
+ workflow.add_edge("transform_query", "websearch")
+ workflow.add_edge("websearch", "generate")
+ workflow.add_edge("generate", END)
+ app_graph = workflow.compile()
+
+ # CPU, RAM and GPU resource management
+ def release_resources():
+     try:
+         torch.cuda.empty_cache()
+         gc.collect()
+     except Exception as e:
+         print(f"Failed to release resources: {e}")
+
+ def resource_manager():
+     MAX_RAM_PERCENT = 80  # Adjust as needed
+     MAX_CPU_PERCENT = 80
+     MAX_GPU_PERCENT = 80
+     MAX_RAM_MB = 4096  # Adjust to the available memory
+
+     while True:
+         try:
+             virtual_mem = psutil.virtual_memory()
+             current_ram_percent = virtual_mem.percent
+             current_ram_mb = virtual_mem.used / (1024 * 1024)  # Convert to MB
+
+             if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
+                 release_resources()
+
+             current_cpu_percent = psutil.cpu_percent()
+             if current_cpu_percent > MAX_CPU_PERCENT:
+                 psutil.Process(os.getpid()).nice(10)  # Lower the process priority
+
+             if torch.cuda.is_available():
+                 gpu = torch.cuda.current_device()
+                 total_gpu_mem = torch.cuda.get_device_properties(gpu).total_memory
+                 gpu_mem_percent = torch.cuda.memory_allocated(gpu) / total_gpu_mem * 100
+                 if gpu_mem_percent > MAX_GPU_PERCENT:
+                     release_resources()
+
+         except Exception as e:
+             print(f"Resource manager error: {e}")
+
+         time.sleep(5)  # Check periodically instead of spinning
+
+ # Run the resource manager in a background thread so it does not block the server
+ threading.Thread(target=resource_manager, daemon=True).start()
+
+ # Process a query through the compiled graph
+ async def process_query_in_parallel(query):
+     try:
+         state = GraphState(question=query, generation="", search_query="", context="")
+         return await app_graph.ainvoke(state)
+     except Exception as e:
+         print(f"Error while running the graph: {e}")
+         raise
+
+ # API route for handling queries
+ @app.post("/query")
+ async def query_handler(request: QueryRequest):
+     try:
+         query = request.query
+         result = await process_query_in_parallel(query)  # Asynchronous call
+         return {"results": result}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ # Run the FastAPI server
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
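+
+ # Example request against the running server (hypothetical query text,
+ # assuming the default host/port configured above):
+ #   curl -X POST http://localhost:8000/query \
+ #        -H "Content-Type: application/json" \
+ #        -d '{"query": "Latest research on retrieval-augmented generation"}'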