Commit ae1084a
Parent(s): d67f72a
Update README.md

README.md CHANGED
````diff
@@ -10,8 +10,8 @@ MistralLite is a fine-tuned [Mistral-7B-v0.1](https://huggingface.co/mistralai/M
 MistralLite evolves from [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1), and their similarities and differences are summarized below:
 |Model|Fine-tuned on long contexts| Max context length| RotaryEmbedding adaptation| Sliding Window Size|
 |----------|-------------:|------------:|-----------:|-----------:|
-| Mistral-7B-v0.1 |
-| MistralLite |
+| Mistral-7B-v0.1 | up to 8K tokens | 32K | rope_theta = 10000 | 4096 |
+| MistralLite | up to 16K tokens | 32K | **rope_theta = 1000000** | **16384** |
 
 ## Motivation of Developing MistralLite
 
````
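As a side note (not part of the diff), the `rope_theta` and sliding-window values in the table can be checked directly from each model's configuration. A minimal sketch, assuming a recent `transformers` release and that both repos are reachable on the Hugging Face Hub:

```python
from transformers import AutoConfig

# Compare the rotary-embedding base and sliding-window size of the two models.
base = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
lite = AutoConfig.from_pretrained("amazon/MistralLite")

print("Mistral-7B-v0.1:", base.rope_theta, base.sliding_window)  # expected per the table: 10000, 4096
print("MistralLite:    ", lite.rope_theta, lite.sliding_window)  # expected per the table: 1000000, 16384
```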
````diff
@@ -160,7 +160,6 @@ hub = {
     'HF_MODEL_ID':'amazon/MistralLite',
     'HF_TASK':'text-generation',
     'SM_NUM_GPUS':'1',
-    'HF_MODEL_QUANTIZE':'true'
 }
 
 model = HuggingFaceModel(
````
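For readers following along (this is an illustration, not part of the diff): a minimal sketch of how a `hub` environment like the one above is typically passed to `HuggingFaceModel` and deployed with the `sagemaker` Python SDK. The TGI image version and instance type below are assumptions rather than values taken from this README; the endpoint name simply matches the boto3 example later in the diff.

```python
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

role = sagemaker.get_execution_role()  # assumes a SageMaker execution role is available

hub = {
    'HF_MODEL_ID': 'amazon/MistralLite',
    'HF_TASK': 'text-generation',
    'SM_NUM_GPUS': '1',
}

# Text Generation Inference (TGI) container; the version here is an assumption.
image_uri = get_huggingface_llm_image_uri("huggingface", version="1.1.0")

model = HuggingFaceModel(env=hub, role=role, image_uri=image_uri)

# Instance type is an assumption; pick a GPU instance with enough memory for a 7B model.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name="MistralLite-2023-10-16-09-45-58",
)
```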
````diff
@@ -184,10 +183,16 @@ input_data = {
     "inputs": "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>",
     "parameters": {
         "do_sample": False,
-        "max_new_tokens":
+        "max_new_tokens": 400,
+        "return_full_text": False,
+        "typical_p": 0.2,
+        "temperature": None,
+        "truncate": None,
+        "seed": 1,
     }
 }
-predictor.predict(input_data)
+result = predictor.predict(input_data)[0]["generated_text"]
+print(result)
 ```
 or via [boto3](https://pypi.org/project/boto3/), and the example code is shown below:
 
````
````diff
@@ -207,15 +212,17 @@ def call_endpoint(client, prompt, endpoint_name, paramters):
 
 client = boto3.client("sagemaker-runtime")
 parameters = {
-
-
-
-
-
-
-
+    "do_sample": False,
+    "max_new_tokens": 400,
+    "return_full_text": False,
+    "typical_p": 0.2,
+    "temperature": None,
+    "truncate": None,
+    "seed": 1,
+}
+endpoint_name = "MistralLite-2023-10-16-09-45-58"
 prompt = "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>"
-result = call_endpoint(client, prompt, endpoint_name,
+result = call_endpoint(client, prompt, endpoint_name, parameters)
 print(result)
 ```
 
````
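The diff does not show the body of `call_endpoint`, so the following is a hedged sketch of what such a helper usually looks like with `boto3`'s `sagemaker-runtime` client. The payload shape mirrors the `parameters` dict above; the actual implementation in the README may differ.

```python
import json
import boto3

def call_endpoint(client, prompt, endpoint_name, parameters):
    # Package the prompt and generation parameters the way the TGI container expects.
    payload = {"inputs": prompt, "parameters": parameters}
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=json.dumps(payload),
        ContentType="application/json",
    )
    # The container returns a JSON list with one item containing "generated_text".
    output = json.loads(response["Body"].read().decode("utf-8"))
    return output[0]["generated_text"]
```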
````diff
@@ -227,11 +234,12 @@ Use TGI version 1.1.0 or later. The official Docker container is: `ghcr.io/huggi
 Example Docker parameters:
 
 ```shell
-docker run -d --gpus all --shm-size 1g -p 443:80 ghcr.io/huggingface/text-generation-inference:1.1.0 \
+docker run -d --gpus all --shm-size 1g -p 443:80 -v $(pwd)/models:/data ghcr.io/huggingface/text-generation-inference:1.1.0 \
     --model-id amazon/MistralLite \
     --max-input-length 8192 \
     --max-total-tokens 16384 \
-    --max-batch-prefill-tokens 16384
+    --max-batch-prefill-tokens 16384 \
+    --trust-remote-code
 ```
 
 ### Perform Inference ###
````
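As an aside (not part of the diff): once the container above is running, the endpoint can be smoke-tested over plain HTTP before switching to the Python client used in the next hunk. A minimal sketch, assuming the `requests` package and the `-p 443:80` port mapping above, so the server listens on host port 443:

```python
import requests

# TGI exposes a /generate endpoint that accepts the same "inputs"/"parameters"
# payload shape used in the SageMaker examples above.
url = "http://localhost:443/generate"
payload = {
    "inputs": "<|prompter|>What are the main challenges to support a long context for LLM?</s><|assistant|>",
    "parameters": {"do_sample": False, "max_new_tokens": 400},
}
resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["generated_text"])
```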
````diff
@@ -249,9 +257,9 @@ SERVER_HOST = "localhost"
 SERVER_URL = f"{SERVER_HOST}:{SERVER_PORT}"
 tgi_client = Client(f"http://{SERVER_URL}", timeout=60)
 
-def invoke_falconlite(prompt,
+def invoke_tgi(prompt,
                random_seed=1,
-               max_new_tokens=
+               max_new_tokens=400,
                print_stream=True,
                assist_role=True):
     if (assist_role):
````
````diff
@@ -261,10 +269,10 @@ def invoke_falconlite(prompt,
         prompt,
         do_sample=False,
         max_new_tokens=max_new_tokens,
-        typical_p=0.2,
         temperature=None,
         truncate=None,
         seed=random_seed,
+        typical_p=0.2,
     ):
         if hasattr(response, "token"):
             if not response.token.special:
````
````diff
@@ -275,7 +283,7 @@ def invoke_falconlite(prompt,
     return output
 
 prompt = "What are the main challenges to support a long context for LLM?"
-result = invoke_falconlite(prompt)
+result = invoke_tgi(prompt)
 ```
 
 **Important** - When using MistralLite for inference for the first time, it may require a brief 'warm-up' period that can take 10s of seconds. However, subsequent inferences should be faster and return results in a more timely manner. This warm-up period is normal and should not affect the overall performance of the system once the initialisation period has been completed.
````
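Because the diff only shows fragments of the streaming helper, here is a hedged reconstruction of a complete `invoke_tgi` built on the `text_generation` client. The argument order and token handling follow the fragments above; `SERVER_PORT = 443` and the prompt-template wrapping are assumptions based on the Docker port mapping and the `<|prompter|>`/`<|assistant|>` format used elsewhere in the README.

```python
from text_generation import Client

SERVER_PORT = 443          # assumption: matches the -p 443:80 mapping above
SERVER_HOST = "localhost"
SERVER_URL = f"{SERVER_HOST}:{SERVER_PORT}"
tgi_client = Client(f"http://{SERVER_URL}", timeout=60)

def invoke_tgi(prompt,
               random_seed=1,
               max_new_tokens=400,
               print_stream=True,
               assist_role=True):
    if (assist_role):
        # Wrap the raw question in the prompt template MistralLite was fine-tuned with.
        prompt = f"<|prompter|>{prompt}</s><|assistant|>"
    output = ""
    for response in tgi_client.generate_stream(
        prompt,
        do_sample=False,
        max_new_tokens=max_new_tokens,
        temperature=None,
        truncate=None,
        seed=random_seed,
        typical_p=0.2,
    ):
        if hasattr(response, "token"):
            if not response.token.special:
                # Accumulate (and optionally echo) each generated token as it streams in.
                output += response.token.text
                if print_stream:
                    print(response.token.text, end="", flush=True)
    return output

prompt = "What are the main challenges to support a long context for LLM?"
result = invoke_tgi(prompt)
```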