Update app.py

app.py CHANGED

@@ -1,5 +1,5 @@
 from pydantic import BaseModel
-from
+from llama_cpp_agent import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import re
 import httpx
@@ -14,7 +14,6 @@ import uvicorn
 from threading import Thread
 
 load_dotenv()
-
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
 global_data = {
@@ -30,7 +29,7 @@ global_data = {
         'mask': 'mask_token'
     },
     'model_metadata': {},
-    'max_tokens':
+    'max_tokens': 256,
    'tokenizers': {},
    'model_params': {},
    'model_size': {},
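The change above fills in the previously value-less 'max_tokens' entry with a concrete limit of 256. As a rough illustration only (none of this code appears in the commit), a shared settings dict like global_data is typically read at generation time and passed through to the model call:

# Sketch, not from app.py: capping generation with a shared 'max_tokens' setting.
settings = {'max_tokens': 256}

def bounded_generate(model, prompt):
    limit = settings.get('max_tokens', 256)
    # Assumes the model callable accepts a max_tokens keyword, as llama.cpp-style bindings usually do.
    return model(prompt, max_tokens=limit)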
@@ -103,10 +102,7 @@ class ModelManager:
             executor.submit(self.load_model, config)
         return self.models
 
-
 model_manager = ModelManager()
-
-
 global_data['models'] = model_manager.load_all_models()
 
 class ChatRequest(BaseModel):
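load_all_models hands each model config to a ThreadPoolExecutor, which is what the ThreadPoolExecutor/as_completed import at the top of the file is for. A self-contained sketch of that loading pattern, with a placeholder load_model standing in for the real ModelManager.load_model:

from concurrent.futures import ThreadPoolExecutor, as_completed

def load_model(config):
    # Placeholder: the real method would open the model file named in the config.
    return config['name'], object()

def load_all(configs):
    models = {}
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(load_model, cfg) for cfg in configs]
        for future in as_completed(futures):
            name, handle = future.result()
            models[name] = handle
    return models

models = load_all([{'name': 'model-a'}, {'name': 'model-b'}])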
@@ -135,14 +131,6 @@ def cache_response(func):
         return response
     return wrapper
 
-
-@cache_response
-def generate_model_response(model, inputs):
-    try:
-        response = model(inputs)
-    except Exception as e:
-        return ""
-
 @cache_response
 def generate_model_response(model, inputs):
     try:
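This hunk drops a duplicated generate_model_response: the deleted copy computed model(inputs) but never returned it and answered errors with an empty string, so only the later @cache_response-decorated definition survives. The body of cache_response is not shown anywhere in this diff; the following memoizing decorator is only a guess at its shape:

import functools

def cache_response(func):
    cache = {}

    @functools.wraps(func)
    def wrapper(model, inputs):
        key = (id(model), inputs)  # assumes inputs is hashable, e.g. a prompt string
        if key not in cache:
            cache[key] = func(model, inputs)
        return cache[key]
    return wrapper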
@@ -179,11 +167,17 @@ app = FastAPI()
 
 @app.post("/generate")
 async def generate(request: ChatRequest):
-
-
+    try:
+        response = await process_message(request.message)
+        return JSONResponse(content={"response": response})
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})
 
 def run_uvicorn():
-
+    try:
+        uvicorn.run(app, host="0.0.0.0", port=7860)
+    except Exception as e:
+        print(f"Error al ejecutar uvicorn: {e}")
 
 iface = gr.Interface(
     fn=process_message,
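After this change, POST /generate awaits process_message and always replies with JSON, either {"response": ...} on success or {"error": ...} on failure. A client-side sketch using httpx, which app.py already imports; the host and port assume the uvicorn.run call added in the same hunk:

import httpx

payload = {"message": "Hello, which models are loaded?"}
reply = httpx.post("http://localhost:7860/generate", json=payload, timeout=60.0)
data = reply.json()
if "response" in data:
    print(data["response"])
else:
    print("server error:", data.get("error"))

Note that the except branch builds JSONResponse without an explicit status_code, so failures still come back as HTTP 200 and clients have to inspect the body rather than the status code.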
@@ -193,10 +187,10 @@ iface = gr.Interface(
     description="Enter a message and get responses from multiple LLMs using CPU."
 )
 
-
 def run_gradio():
     iface.launch(server_port=7862, prevent_thread_lock=True)
 
 if __name__ == "__main__":
     Thread(target=run_uvicorn).start()
     Thread(target=run_gradio).start()
+    asyncio.get_event_loop().run_forever()
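The last hunk keeps the main thread alive after the uvicorn and Gradio threads are started. Below is a self-contained sketch of the same two-server layout, reusing the 7860/7862 ports from the diff; a plain threading.Event().wait() stands in for asyncio.get_event_loop().run_forever(), blocking the main thread to the same effect without relying on get_event_loop() outside a running loop, which newer Python versions discourage:

import threading

import gradio as gr
import uvicorn
from fastapi import FastAPI

app = FastAPI()

def echo(message):
    # Stand-in for process_message.
    return f"echo: {message}"

iface = gr.Interface(fn=echo, inputs="text", outputs="text")

def run_uvicorn():
    uvicorn.run(app, host="0.0.0.0", port=7860)

def run_gradio():
    iface.launch(server_port=7862, prevent_thread_lock=True)

if __name__ == "__main__":
    threading.Thread(target=run_uvicorn).start()
    threading.Thread(target=run_gradio).start()
    threading.Event().wait()  # block forever so the worker threads keep serving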