Spaces:
Sleeping
Sleeping
Aleksandr Maiorov
committed on
Commit
·
6fb3190
1
Parent(s):
9bbb532
v0.2.0
Browse files
- запуск llama-server из докера
- удалена загрузка модели через llama python
- Dockerfile +8 -3
- app.py +1 -99
Dockerfile
CHANGED
@@ -1,17 +1,22 @@
|
|
1 |
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
# you will also find guides on how best to write your Dockerfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
FROM python:3.9
|
5 |
-
|
6 |
RUN useradd -m -u 1000 user
|
7 |
USER user
|
8 |
ENV PATH="/home/user/.local/bin:$PATH"
|
9 |
-
|
10 |
WORKDIR /app
|
11 |
|
12 |
COPY --chown=user ./requirements.txt requirements.txt
|
13 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
14 |
-
RUN pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
15 |
|
16 |
COPY --chown=user . /app
|
|
|
17 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
1 |
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
# you will also find guides on how best to write your Dockerfile
|
3 |
+
FROM ghcr.io/ggerganov/llama.cpp:server
|
4 |
+
ENV LLAMA_ARG_MODEL_URL=https://huggingface.co/Vikhrmodels/Vikhr-Qwen-2.5-1.5B-Instruct-GGUF/resolve/main/Vikhr-Qwen-2.5-1.5b-Instruct-Q8_0.gguf \
|
5 |
+
LLAMA_ARG_CTX_SIZE=4096 \
|
6 |
+
LLAMA_ARG_N_PARALLEL=2 \
|
7 |
+
LLAMA_ARG_ENDPOINT_METRICS=1 \
|
8 |
+
LLAMA_ARG_PORT=7860
|
9 |
+
#RUN ./llama-server -mu https://huggingface.co/Vikhrmodels/Vikhr-Qwen-2.5-1.5B-Instruct-GGUF/resolve/main/Vikhr-Qwen-2.5-1.5b-Instruct-Q8_0.gguf -c 2048 --port 7860 --host 0.0.0.0
|
10 |
|
11 |
FROM python:3.9
|
|
|
12 |
RUN useradd -m -u 1000 user
|
13 |
USER user
|
14 |
ENV PATH="/home/user/.local/bin:$PATH"
|
|
|
15 |
WORKDIR /app
|
16 |
|
17 |
COPY --chown=user ./requirements.txt requirements.txt
|
18 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
|
|
19 |
|
20 |
COPY --chown=user . /app
|
21 |
+
|
22 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
CHANGED
@@ -1,13 +1,8 @@
|
|
1 |
import logging
|
2 |
-
from typing import Union, Optional, SupportsIndex
|
3 |
from fastapi import FastAPI
|
4 |
-
from llama_cpp import Llama
|
5 |
|
6 |
app = FastAPI()
|
7 |
|
8 |
-
CHAT_TEMPLATE = '<|system|> {system_prompt}<|end|><|user|> {prompt}<|end|><|assistant|>'.strip()
|
9 |
-
SYSTEM_PROMPT = ''
|
10 |
-
|
11 |
logging.basicConfig(
|
12 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
13 |
level=logging.INFO
|
@@ -16,100 +11,7 @@ logger = logging.getLogger(__name__)
|
|
16 |
|
17 |
logger.info("Запускаемся... 🥳🥳🥳")
|
18 |
|
19 |
-
REPO_ID = "Vikhrmodels/QVikhr-2.5-1.5B-Instruct-SMPO_GGUF"
|
20 |
-
FILENAME = "QVikhr-2.5-1.5B-Instruct-SMPO-Q8_0.gguf"
|
21 |
-
|
22 |
-
# Инициализация модели
|
23 |
-
try:
|
24 |
-
logger.info(f"Загрузка модели {FILENAME}...")
|
25 |
-
|
26 |
-
# загрузка модели для локального хранилища
|
27 |
-
# llm = Llama(
|
28 |
-
# model_path=f"./models/{model_name}.gguf",
|
29 |
-
# verbose=False,
|
30 |
-
# n_gpu_layers=-1,
|
31 |
-
# n_ctx=1512,
|
32 |
-
# temperature=0.3,
|
33 |
-
# num_return_sequences=1,
|
34 |
-
# no_repeat_ngram_size=2,
|
35 |
-
# top_k=50,
|
36 |
-
# top_p=0.95,
|
37 |
-
# )
|
38 |
-
|
39 |
-
# if not llm:
|
40 |
-
LLM = Llama.from_pretrained(
|
41 |
-
repo_id=REPO_ID,
|
42 |
-
filename=FILENAME,
|
43 |
-
n_gpu_layers=-1,
|
44 |
-
n_ctx=1512,
|
45 |
-
temperature=0.3,
|
46 |
-
num_return_sequences=1,
|
47 |
-
no_repeat_ngram_size=2,
|
48 |
-
top_k=50,
|
49 |
-
top_p=0.95,
|
50 |
-
)
|
51 |
-
|
52 |
-
except Exception as e:
|
53 |
-
logger.error(f"Ошибка загрузки модели: {str(e)}")
|
54 |
-
raise
|
55 |
-
|
56 |
-
|
57 |
-
# составление промта для модели
|
58 |
-
def create_prompt(text: str) -> Union[str, None]:
|
59 |
-
try:
|
60 |
-
user_input = text
|
61 |
-
logger.info(f"Получено сообщение: {user_input}")
|
62 |
-
|
63 |
-
|
64 |
-
# Генерация шаблона
|
65 |
-
return CHAT_TEMPLATE.format(
|
66 |
-
system_prompt=SYSTEM_PROMPT or 'Ответ должен быть точным, кратким и с юмором.',
|
67 |
-
prompt=user_input,
|
68 |
-
)
|
69 |
-
except Exception as e:
|
70 |
-
logger.error(e)
|
71 |
-
|
72 |
-
|
73 |
-
def generate_response(prompt: str) -> Optional[str]:
|
74 |
-
try:
|
75 |
-
# Обработка текстового сообщения
|
76 |
-
output = LLM(
|
77 |
-
prompt,
|
78 |
-
max_tokens=64,
|
79 |
-
stop=["<|end|>"],
|
80 |
-
)
|
81 |
-
|
82 |
-
logger.info('Output:')
|
83 |
-
logger.info(output)
|
84 |
-
|
85 |
-
response = output['choices'][0]['text']
|
86 |
-
|
87 |
-
# Отправка ответа
|
88 |
-
if response:
|
89 |
-
return response
|
90 |
-
|
91 |
-
return 'Произошла ошибка при обработке запроса'
|
92 |
-
|
93 |
-
except Exception as e:
|
94 |
-
logger.error(f"Ошибка обработки сообщения: {str(e)}")
|
95 |
-
|
96 |
|
97 |
@app.get("/")
|
98 |
def greet_json():
|
99 |
-
return {"Hello": "World!"}
|
100 |
-
|
101 |
-
@app.put("/system-prompt")
|
102 |
-
async def set_system_prompt(text: str):
|
103 |
-
# Генерация ответа с помощью модели
|
104 |
-
logger.info('post/system-prompt')
|
105 |
-
global SYSTEM_PROMPT
|
106 |
-
SYSTEM_PROMPT = text
|
107 |
-
|
108 |
-
|
109 |
-
@app.post("/predict")
|
110 |
-
async def predict(text: str):
|
111 |
-
# Генерация ответа с помощью модели
|
112 |
-
logger.info('post/predict')
|
113 |
-
prompt = create_prompt(text)
|
114 |
-
response = generate_response(prompt)
|
115 |
-
return {"response": response}
|
|
|
1 |
import logging
|
|
|
2 |
from fastapi import FastAPI
|
|
|
3 |
|
4 |
app = FastAPI()
|
5 |
|
|
|
|
|
|
|
6 |
logging.basicConfig(
|
7 |
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
8 |
level=logging.INFO
|
|
|
11 |
|
12 |
logger.info("Запускаемся... 🥳🥳🥳")
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
@app.get("/")
|
16 |
def greet_json():
|
17 |
+
return {"Hello": "World!"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|