Spaces: Build error
Update app.py
app.py CHANGED
@@ -1,11 +1,14 @@
 import os
 import subprocess
-import threading
-import time
 import asyncio
-import
+import time
+import threading
+import gc
+import psutil
+import torch
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+import uvicorn
 from langchain.prompts import PromptTemplate
 from langchain_community.chat_models import ChatOllama
 from langchain_community.tools import DuckDuckGoSearchRun
@@ -13,57 +16,54 @@ from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
 from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
 from langgraph.graph import END, StateGraph
 from typing_extensions import TypedDict
-from fastapi.responses import StreamingResponse
-from ollama import AsyncClient
-import gc
-import psutil
-import torch
-from functools import lru_cache

 # Asegúrate de tener la librería Ollama instalada
 # pip install ollama langchain langchain_community langgraph

-#
+# Configuración de Ollama y su servicio
 OLLAMA = os.path.expanduser("~/ollama")
-if not os.path.exists(OLLAMA):
-    print("Ollama no encontrado, descargando...")
-    subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
-    os.chmod(OLLAMA, 0o755)

-
-
+def download_ollama_model(model_name='hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'):
+    try:
+        if not os.path.exists(OLLAMA):
+            print("Ollama no encontrado, descargando...")
+            subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
+            os.chmod(OLLAMA, 0o755)
+
+        print(f"Descargando el modelo: {model_name}")
+        subprocess.run(["~/ollama", "pull", model_name], check=True)
+    except subprocess.CalledProcessError as e:
+        print(f"Error al descargar el modelo: {e}")
+        raise
+
+# Función asíncrona para manejar la descarga del modelo
+async def async_download_ollama_model():
+    await asyncio.to_thread(download_ollama_model)
+
+# Iniciar el servidor Ollama en un hilo
+def ollama_service_thread():
     print("Iniciando el servicio de Ollama")
-    subprocess.run("~/ollama serve
+    subprocess.run("~/ollama serve", shell=True)

-#
-
-OLLAMA_SERVICE_THREAD = threading.Thread(target=asyncio.run, args=(ollama_service_thread(),))
+# Crear un hilo para iniciar Ollama
+OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
 OLLAMA_SERVICE_THREAD.start()

-# Esperar a que Ollama
+# Esperar a que Ollama esté listo
 print("Esperando a que Ollama inicie...")
 time.sleep(10)

 # Descargar el modelo de Hugging Face si no está disponible
-
-try:
-    print(f"Descargando el modelo: {model_name}")
-    subprocess.run(["ollama", "pull", model_name], check=True)
-except subprocess.CalledProcessError as e:
-    print(f"Error al descargar el modelo: {e}")
-    raise
-
-# Descargar el modelo de Ollama en el hilo principal
-download_ollama_model("hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S")
+asyncio.run(async_download_ollama_model())

-# Crear
+# Crear instancia de FastAPI
 app = FastAPI()

-#
+# Definición del modelo de datos para recibir las consultas en la API
 class QueryRequest(BaseModel):
     query: str

-# Definir el modelo de lenguaje de Ollama
+# Definir el modelo de lenguaje de Ollama (sin 'temperature')
 local_llm = 'hf.co/MaziyarPanahi/Llama-3.2-3B-Instruct-uncensored-GGUF:IQ1_S'
 llama3 = ChatOllama(model=local_llm)

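The startup above relies on a fixed time.sleep(10) before the model pull runs. A more robust alternative is to poll Ollama's local HTTP endpoint until it responds; the sketch below assumes the default port 11434 and could be called right after OLLAMA_SERVICE_THREAD.start():

    import time
    import urllib.request

    def wait_for_ollama(url="http://127.0.0.1:11434", timeout=60):
        # Poll the Ollama HTTP endpoint until it answers or the timeout expires.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                with urllib.request.urlopen(url, timeout=2):
                    return
            except OSError:
                time.sleep(1)
        raise RuntimeError("Ollama did not become ready in time")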
@@ -71,12 +71,7 @@ llama3 = ChatOllama(model=local_llm)
 wrapper = DuckDuckGoSearchAPIWrapper(max_results=1)
 web_search_tool = DuckDuckGoSearchRun(api_wrapper=wrapper)

-#
-@lru_cache(maxsize=1024)  # Cache de tamaño limitado para resultados previos
-async def cached_search(query):
-    return await web_search_tool.invoke(query)
-
-# Definir los prompts para generación y enrutamiento
+# Definición de los prompts para generación y enrutamiento
 generate_prompt = PromptTemplate(
     template="""
 <|begin_of_text|>
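The removed cached_search wrapper combined functools.lru_cache with an async function; lru_cache caches the coroutine object itself, so a cache hit hands back an already-awaited coroutine and fails on the second await, which is presumably why it was dropped. If caching is still wanted, a synchronous wrapper works with the same decorator; a sketch reusing the web_search_tool defined above:

    from functools import lru_cache

    @lru_cache(maxsize=1024)
    def cached_search(query: str) -> str:
        # DuckDuckGoSearchRun.invoke takes the query string and returns the result text.
        return web_search_tool.invoke(query)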
@@ -139,26 +134,25 @@ class GraphState(TypedDict):
     context: str

 # Nodos de procesamiento
-
+def generate(state):
     print("Step: Generating Final Response")
     question = state["question"]
     context = state["context"]
-    generation =
+    generation = generate_chain.invoke({"context": context, "question": question})
     return {"generation": generation}

-
+def transform_query(state):
     print("Step: Optimizing Query for Web Search")
     question = state['question']
-    gen_query =
+    gen_query = query_chain.invoke({"question": question})
     search_query = gen_query.get("query", "")  # Asegurarnos de que estamos obteniendo la clave correcta
     return {"search_query": search_query}

-
+def web_search(state):
     search_query = state['search_query']
     print(f'Step: Searching the Web for: "{search_query}"')
     try:
-
-        search_result = await cached_search(search_query)
+        search_result = web_search_tool.invoke(search_query)
         if isinstance(search_result, str):  # Si la respuesta es una cadena, la convertimos en un diccionario
             print(f"Respuesta de búsqueda web es cadena: {search_result}")
             return {"context": search_result}
@@ -170,10 +164,10 @@ async def web_search(state):
         print(f"Web search failed: {e}")
         return None  # Si la búsqueda falla, no devuelve contexto

-
+def route_question(state):
     print("Step: Routing Query")
     question = state['question']
-    output =
+    output = question_router.invoke({"question": question})
     if output.get('choice') == "web_search":
         print("Step: Routing Query to Web Search")
         return "websearch"
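route_question expects question_router to return JSON with a "choice" key; the router itself is built in an unchanged part of the file from a prompt, the llama3 model and JsonOutputParser. A hedged sketch of that wiring (the prompt text here is illustrative, not the one in app.py):

    router_prompt = PromptTemplate(
        template='Return a JSON object with a single key "choice" set to "web_search" or "generate" for this question: {question}',
        input_variables=["question"],
    )
    question_router = router_prompt | llama3 | JsonOutputParser()
    # question_router.invoke({"question": "..."}) -> {"choice": "web_search"} or {"choice": "generate"}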
@@ -194,8 +188,28 @@ workflow.set_conditional_entry_point(
         "generate": "generate",
     },
 )
+workflow.add_edge("transform_query", "websearch")
+workflow.add_edge("websearch", "generate")
+workflow.add_edge("generate", END)
+
+# Compilar el agente
+local_agent = workflow.compile()
+
+# Función para ejecutar el agente
+def run_agent_parallel(query):
+    output = local_agent.invoke({"question": query})
+    if "generation" not in output:  # Si la búsqueda web falló y no hubo respuesta de generación
+        print("Web search failed, using Ollama model directly.")
+        return generate({"question": query, "context": ""})["generation"]  # Generar directamente
+    return output['generation']
+
+# Lógica del servidor FastAPI
+@app.post("/query")
+async def query_endpoint(request: QueryRequest):
+    query = request.query
+    return {"response": run_agent_parallel(query)}

-#
+# Lógica de recursos
 def release_resources():
     try:
         torch.cuda.empty_cache()
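query_endpoint is declared async but run_agent_parallel is a blocking call, so each request holds the event loop for the whole agent run. One way to keep the endpoint responsive is to push the call onto a worker thread with asyncio.to_thread, which the file already uses for the model download; a sketch of that variant plus an example request:

    @app.post("/query")
    async def query_endpoint(request: QueryRequest):
        # Run the blocking LangGraph agent in a worker thread so the event loop stays free.
        result = await asyncio.to_thread(run_agent_parallel, request.query)
        return {"response": result}

    # Example request against the server started at the bottom of the file:
    #   curl -X POST http://localhost:8000/query -H "Content-Type: application/json" -d '{"query": "hola"}'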
@@ -204,16 +218,16 @@ def release_resources():
         print(f"Failed to release resources: {e}")

 def resource_manager():
-    MAX_RAM_PERCENT = 1
+    MAX_RAM_PERCENT = 1
     MAX_CPU_PERCENT = 1
     MAX_GPU_PERCENT = 1
-    MAX_RAM_MB = 1
+    MAX_RAM_MB = 1

     while True:
         try:
             virtual_mem = psutil.virtual_memory()
             current_ram_percent = virtual_mem.percent
-            current_ram_mb = virtual_mem.used / (1 * 1)  #
+            current_ram_mb = virtual_mem.used / (1 * 1)  # Convert to MB

             if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
                 release_resources()
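psutil reports virtual_memory().used in bytes, so dividing by (1 * 1) leaves the value in bytes despite the "Convert to MB" comment, and limits of 1 % RAM / 1 MB will trip release_resources() on practically every pass. A sketch with the conversion spelled out (the limit values are placeholders, not tuned for this Space):

    MAX_RAM_PERCENT = 90                               # placeholder limit, percent of total RAM
    MAX_RAM_MB = 14 * 1024                             # placeholder limit in MB

    virtual_mem = psutil.virtual_memory()
    current_ram_percent = virtual_mem.percent
    current_ram_mb = virtual_mem.used / (1024 * 1024)  # bytes -> MB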
@@ -224,35 +238,15 @@ def resource_manager():

             if torch.cuda.is_available():
                 gpu = torch.cuda.current_device()
-                gpu_mem = torch.cuda.
+                gpu_mem = torch.cuda.memory_percent(gpu)

                 if gpu_mem > MAX_GPU_PERCENT:
                     release_resources()

         except Exception as e:
-            print(f"Error
+            print(f"Error in resource manager: {e}")

 resource_manager()

-# Procesar la consulta en paralelo
-async def process_query_in_parallel(query):
-    try:
-        state = GraphState(question=query, generation="", search_query="", context="")
-        return await workflow.invoke(state)
-    except Exception as e:
-        print(f"Error en la ejecución paralela: {e}")
-        raise
-
-# Ruta de la API para manejar consultas
-@app.post("/query")
-async def query_handler(request: QueryRequest):
-    try:
-        query = request.query
-        result = await process_query_in_parallel(query)  # Llamada asíncrona
-        return {"results": result}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-# Ejecutar el servidor FastAPI
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)
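Two details in this last hunk stand out. torch.cuda does not ship a memory_percent helper, so the new gpu_mem line would raise AttributeError as soon as a GPU is visible; a usage percentage can instead be derived from torch.cuda.mem_get_info(), which returns (free_bytes, total_bytes). Also, resource_manager() is an infinite loop called at module level, so execution never reaches the uvicorn.run() call below it; running the monitor in a daemon thread is one option. A sketch of both, using only modules the file already imports:

    def gpu_memory_percent(device=None):
        # mem_get_info returns (free_bytes, total_bytes) for the given CUDA device.
        free, total = torch.cuda.mem_get_info(device)
        return 100.0 * (total - free) / total

    # Start the monitor without blocking server startup.
    threading.Thread(target=resource_manager, daemon=True).start()

    if __name__ == "__main__":
        uvicorn.run(app, host="0.0.0.0", port=8000)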