Build error
Create app.py
app.py
ADDED
@@ -0,0 +1,258 @@
import os
import subprocess
import threading
import time
import asyncio
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langgraph.graph import END, StateGraph
from typing_extensions import TypedDict
from fastapi.responses import StreamingResponse
from ollama import AsyncClient
import gc
import psutil
import torch
from functools import lru_cache

# Make sure the Ollama client library is installed
# pip install ollama langchain langchain_community langgraph

# Path to the Ollama binary
OLLAMA = os.path.expanduser("~/ollama")
if not os.path.exists(OLLAMA):
    print("Ollama not found, downloading...")
    subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
    os.chmod(OLLAMA, 0o755)

# Run the Ollama server in the background
def ollama_service_thread():
    print("Starting the Ollama service")
    # `ollama serve` already runs until it is stopped, so no extra flag is needed
    subprocess.run(f"{OLLAMA} serve", shell=True)

# Start the Ollama service thread
print("Creating and starting the Ollama service thread")
OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread, daemon=True)
OLLAMA_SERVICE_THREAD.start()

# Wait for Ollama to start
print("Waiting for Ollama to start...")
time.sleep(10)

# Download the model from Hugging Face if it is not available locally
def download_ollama_model(model_name='hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'):
    try:
        print(f"Downloading model: {model_name}")
        # Use the downloaded binary; "ollama" is not necessarily on PATH
        subprocess.run([OLLAMA, "pull", model_name], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error downloading the model: {e}")
        raise

# Download the Ollama model in the main thread
download_ollama_model("hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S")

# Create the FastAPI application
app = FastAPI()

# Define the data model for queries received by the API
class QueryRequest(BaseModel):
    query: str

# Define the Ollama language model
local_llm = 'hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'
llama3 = ChatOllama(model=local_llm)

# Define the web search tool using DuckDuckGo
wrapper = DuckDuckGoSearchAPIWrapper(max_results=1)
web_search_tool = DuckDuckGoSearchRun(api_wrapper=wrapper)

# Cache previous search results with an LRU cache
# (lru_cache cannot wrap an async function: it would cache the coroutine object,
# which can only be awaited once, so the cached search stays synchronous)
@lru_cache(maxsize=1024)
def cached_search(query):
    return web_search_tool.invoke(query)

# Define the prompts for generation and routing
generate_prompt = PromptTemplate(
    template="""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an AI assistant for Research Question Tasks, that synthesizes web search results.
Strictly use the following pieces of web search context to answer the question. If you don't know the answer, just say that you don't know.
Keep the answer concise, but provide all of the details you can in the form of a research report.
Only make direct references to material if provided in the context.
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Question: {question}
Web Search Context: {context}
Answer:
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "context"],
)

generate_chain = generate_prompt | llama3 | StrOutputParser()

router_prompt = PromptTemplate(
    template="""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an expert at routing a user question to either the generation stage or web search.
Use the web search for questions that require more context for a better answer, or recent events.
Otherwise, you can skip and go straight to the generation phase to respond.
You do not need to be stringent with the keywords in the question related to these topics.
Give a binary choice 'web_search' or 'generate' based on the question.
Return the JSON with a single key 'choice' with no preamble or explanation.
Question to route: {question}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question"],
)

question_router = router_prompt | llama3 | JsonOutputParser()

query_prompt = PromptTemplate(
    template="""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are an expert at crafting web search queries for research questions.
More often than not, a user will ask a basic question that they wish to learn more about, however it might not be in the best format.
Reword their query to be the most effective web search string possible.
Return the JSON with a single key 'query' with no preamble or explanation.
Question to transform: {question}
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question"],
)

query_chain = query_prompt | llama3 | JsonOutputParser()

# Define the graph state
class GraphState(TypedDict):
    question: str
    generation: str
    search_query: str
    context: str

# Processing nodes
async def generate(state):
    print("Step: Generating Final Response")
    question = state["question"]
    context = state["context"]
    # Chains expose ainvoke for use inside async nodes
    generation = await generate_chain.ainvoke({"context": context, "question": question})
    return {"generation": generation}

async def transform_query(state):
    print("Step: Optimizing Query for Web Search")
    question = state['question']
    gen_query = await query_chain.ainvoke({"question": question})
    search_query = gen_query.get("query", "")  # Make sure we read the expected key
    return {"search_query": search_query}

async def web_search(state):
    search_query = state['search_query']
    print(f'Step: Searching the Web for: "{search_query}"')
    try:
        # Use the cache to avoid repeating identical searches; run the blocking
        # search in a worker thread so the event loop is not blocked
        search_result = await asyncio.to_thread(cached_search, search_query)
        if isinstance(search_result, str):  # A plain string is used as the context directly
            print(f"Web search returned a string: {search_result}")
            return {"context": search_result}
        elif isinstance(search_result, dict):  # A dict is passed through as-is
            return {"context": search_result}
        else:
            raise ValueError("Web search returned an unexpected result type")
    except Exception as e:
        print(f"Web search failed: {e}")
        return {"context": ""}  # If the search fails, continue with an empty context

async def route_question(state):
    print("Step: Routing Query")
    question = state['question']
    output = await question_router.ainvoke({"question": question})
    if output.get('choice') == "web_search":
        print("Step: Routing Query to Web Search")
        return "websearch"
    elif output.get('choice') == 'generate':
        print("Step: Routing Query to Generation")
        return "generate"
    # Fall back to generation if the router returns anything unexpected
    return "generate"

# Build the state graph
workflow = StateGraph(GraphState)
workflow.add_node("websearch", web_search)
workflow.add_node("transform_query", transform_query)
workflow.add_node("generate", generate)

workflow.set_conditional_entry_point(
    route_question,
    {
        "websearch": "transform_query",
        "generate": "generate",
    },
)

# Wire the remaining edges and compile the graph so it can be invoked
workflow.add_edge("transform_query", "websearch")
workflow.add_edge("websearch", "generate")
workflow.add_edge("generate", END)
local_agent = workflow.compile()

# CPU, RAM and GPU resource management
def release_resources():
    try:
        torch.cuda.empty_cache()
        gc.collect()
    except Exception as e:
        print(f"Failed to release resources: {e}")

def resource_manager():
    MAX_RAM_PERCENT = 80  # Adjust as needed
    MAX_CPU_PERCENT = 80
    MAX_GPU_PERCENT = 80
    MAX_RAM_MB = 4096  # Adjust to the available memory

    while True:
        try:
            virtual_mem = psutil.virtual_memory()
            current_ram_percent = virtual_mem.percent
            current_ram_mb = virtual_mem.used / (1024 * 1024)  # Convert to MB

            if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
                release_resources()

            current_cpu_percent = psutil.cpu_percent()
            if current_cpu_percent > MAX_CPU_PERCENT:
                # Lower the process priority; nice() without an argument only reads it
                psutil.Process(os.getpid()).nice(10)

            if torch.cuda.is_available():
                gpu = torch.cuda.current_device()
                # Compare allocated GPU memory as a percentage of the total
                gpu_total = torch.cuda.get_device_properties(gpu).total_memory
                gpu_percent = torch.cuda.memory_allocated(gpu) / gpu_total * 100

                if gpu_percent > MAX_GPU_PERCENT:
                    release_resources()

        except Exception as e:
            print(f"Error in the resource manager: {e}")

        time.sleep(5)  # Avoid busy-waiting

# Run the resource manager in a background thread so it does not block the server startup
threading.Thread(target=resource_manager, daemon=True).start()

# Process the query in parallel
async def process_query_in_parallel(query):
    try:
        state = GraphState(question=query, generation="", search_query="", context="")
        # Invoke the compiled graph asynchronously
        return await local_agent.ainvoke(state)
    except Exception as e:
        print(f"Error while running the graph: {e}")
        raise

# API route for handling queries
@app.post("/query")
async def query_handler(request: QueryRequest):
    try:
        query = request.query
        result = await process_query_in_parallel(query)  # Asynchronous call
        return {"results": result}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run the FastAPI server
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
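
Once the Space builds and Uvicorn is listening on port 8000, the /query endpoint can be exercised with a small client. A minimal sketch, assuming the server is reachable at localhost:8000, the `requests` package is installed, and an example question of your own:

import requests

# Hypothetical smoke test for the /query endpoint defined above.
payload = {"query": "What is LangGraph?"}
response = requests.post("http://localhost:8000/query", json=payload, timeout=300)
response.raise_for_status()
# The handler wraps the final graph state under the "results" key.
print(response.json()["results"])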