Aleksandr Maiorov committed on
Commit
6fb3190
·
1 Parent(s): 9bbb532

- запуск llama-server из докера
- удалена загрузка модели через llama python

Files changed (2) hide show
  1. Dockerfile +8 -3
  2. app.py +1 -99
Dockerfile CHANGED
@@ -1,17 +1,22 @@
1
  # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
 
 
 
 
 
 
 
3
 
4
  FROM python:3.9
5
-
6
  RUN useradd -m -u 1000 user
7
  USER user
8
  ENV PATH="/home/user/.local/bin:$PATH"
9
-
10
  WORKDIR /app
11
 
12
  COPY --chown=user ./requirements.txt requirements.txt
13
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
- RUN pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
15
 
16
  COPY --chown=user . /app
 
17
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
  # you will also find guides on how best to write your Dockerfile
3
+ FROM ghcr.io/ggerganov/llama.cpp:server
4
+ ENV LLAMA_ARG_MODEL_URL=https://huggingface.co/Vikhrmodels/Vikhr-Qwen-2.5-1.5B-Instruct-GGUF/resolve/main/Vikhr-Qwen-2.5-1.5b-Instruct-Q8_0.gguf \
5
+ LLAMA_ARG_CTX_SIZE=4096 \
6
+ LLAMA_ARG_N_PARALLEL=2 \
7
+ LLAMA_ARG_ENDPOINT_METRICS=1 \
8
+ LLAMA_ARG_PORT=7860
9
+ #RUN ./llama-server -mu https://huggingface.co/Vikhrmodels/Vikhr-Qwen-2.5-1.5B-Instruct-GGUF/resolve/main/Vikhr-Qwen-2.5-1.5b-Instruct-Q8_0.gguf -c 2048 --port 7860 --host 0.0.0.0
10
 
11
  FROM python:3.9
 
12
  RUN useradd -m -u 1000 user
13
  USER user
14
  ENV PATH="/home/user/.local/bin:$PATH"
 
15
  WORKDIR /app
16
 
17
  COPY --chown=user ./requirements.txt requirements.txt
18
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
19
 
20
  COPY --chown=user . /app
21
+
22
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,13 +1,8 @@
1
  import logging
2
- from typing import Union, Optional, SupportsIndex
3
  from fastapi import FastAPI
4
- from llama_cpp import Llama
5
 
6
  app = FastAPI()
7
 
8
- CHAT_TEMPLATE = '<|system|> {system_prompt}<|end|><|user|> {prompt}<|end|><|assistant|>'.strip()
9
- SYSTEM_PROMPT = ''
10
-
11
  logging.basicConfig(
12
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
13
  level=logging.INFO
@@ -16,100 +11,7 @@ logger = logging.getLogger(__name__)
16
 
17
  logger.info("Запускаемся... 🥳🥳🥳")
18
 
19
- REPO_ID = "Vikhrmodels/QVikhr-2.5-1.5B-Instruct-SMPO_GGUF"
20
- FILENAME = "QVikhr-2.5-1.5B-Instruct-SMPO-Q8_0.gguf"
21
-
22
- # Инициализация модели
23
- try:
24
- logger.info(f"Загрузка модели {FILENAME}...")
25
-
26
- # загрузка модели для локального хранилища
27
- # llm = Llama(
28
- # model_path=f"./models/{model_name}.gguf",
29
- # verbose=False,
30
- # n_gpu_layers=-1,
31
- # n_ctx=1512,
32
- # temperature=0.3,
33
- # num_return_sequences=1,
34
- # no_repeat_ngram_size=2,
35
- # top_k=50,
36
- # top_p=0.95,
37
- # )
38
-
39
- # if not llm:
40
- LLM = Llama.from_pretrained(
41
- repo_id=REPO_ID,
42
- filename=FILENAME,
43
- n_gpu_layers=-1,
44
- n_ctx=1512,
45
- temperature=0.3,
46
- num_return_sequences=1,
47
- no_repeat_ngram_size=2,
48
- top_k=50,
49
- top_p=0.95,
50
- )
51
-
52
- except Exception as e:
53
- logger.error(f"Ошибка загрузки модели: {str(e)}")
54
- raise
55
-
56
-
57
- # составление промта для модели
58
- def create_prompt(text: str) -> Union[str, None]:
59
- try:
60
- user_input = text
61
- logger.info(f"Получено сообщение: {user_input}")
62
-
63
-
64
- # Генерация шаблона
65
- return CHAT_TEMPLATE.format(
66
- system_prompt=SYSTEM_PROMPT or 'Ответ должен быть точным, кратким и с юмором.',
67
- prompt=user_input,
68
- )
69
- except Exception as e:
70
- logger.error(e)
71
-
72
-
73
- def generate_response(prompt: str) -> Optional[str]:
74
- try:
75
- # Обработка текстового сообщения
76
- output = LLM(
77
- prompt,
78
- max_tokens=64,
79
- stop=["<|end|>"],
80
- )
81
-
82
- logger.info('Output:')
83
- logger.info(output)
84
-
85
- response = output['choices'][0]['text']
86
-
87
- # Отправка ответа
88
- if response:
89
- return response
90
-
91
- return 'Произошла ошибка при обработке запроса'
92
-
93
- except Exception as e:
94
- logger.error(f"Ошибка обработки сообщения: {str(e)}")
95
-
96
 
97
  @app.get("/")
98
  def greet_json():
99
- return {"Hello": "World!"}
100
-
101
- @app.put("/system-prompt")
102
- async def set_system_prompt(text: str):
103
- # Генерация ответа с помощью модели
104
- logger.info('post/system-prompt')
105
- global SYSTEM_PROMPT
106
- SYSTEM_PROMPT = text
107
-
108
-
109
- @app.post("/predict")
110
- async def predict(text: str):
111
- # Генерация ответа с помощью модели
112
- logger.info('post/predict')
113
- prompt = create_prompt(text)
114
- response = generate_response(prompt)
115
- return {"response": response}
 
1
  import logging
 
2
  from fastapi import FastAPI
 
3
 
4
  app = FastAPI()
5
 
 
 
 
6
  logging.basicConfig(
7
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
8
  level=logging.INFO
 
11
 
12
  logger.info("Запускаемся... 🥳🥳🥳")
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  @app.get("/")
16
  def greet_json():
17
+ return {"Hello": "World!"}