Commit: "Update README.md" — file changed: README.md
@@ -250,8 +250,11 @@ def seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations, add_assistant_p
 250     """
 251     TURN_TEMPLATE = "<|im_start|>{role}\n{content}<eos>\n"
 252     TURN_PREFIX = "<|im_start|>{role}\n"
 253     sample = None
 254     assistant_prefix_len = None
 255     for turn_id, turn in enumerate(conversations):
 256         prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
 257         turn_sample = tokenizer(
@@ -261,7 +264,12 @@ def seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations, add_assistant_p
 261         if turn['role'] == 'assistant':
 262             if assistant_prefix_len is None:
 263                 assistant_prefix_len = len(tokenizer.encode(TURN_PREFIX.format(role=turn['role']), add_special_tokens=False))
 264 -   [removed line — presumably the old token_type_ids masking assignment; content not preserved in this extraction]
 265         if sample is None:
 266             sample = turn_sample
 267         else:
@@ -282,9 +290,12 @@ def seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations, add_assistant_p
 282
 283     # ! testing
 284     sample = seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations)
 285 -   [removed line — content not preserved in this extraction]
 286 -   [removed line — content not preserved in this extraction]
 287
 288
 289     ```
 290
 250     """
 251     TURN_TEMPLATE = "<|im_start|>{role}\n{content}<eos>\n"
 252     TURN_PREFIX = "<|im_start|>{role}\n"
 253 +   TURN_SUFFIX = "<eos>\n"
 254 +   TURN_SUFFIX_TAKE = "<eos>"
 255     sample = None
 256     assistant_prefix_len = None
 257 +   assistant_suffix_len = None
 258     for turn_id, turn in enumerate(conversations):
 259         prompt = TURN_TEMPLATE.format(role=turn['role'], content=turn['content'])
 260         turn_sample = tokenizer(
 ...
 264         if turn['role'] == 'assistant':
 265             if assistant_prefix_len is None:
 266                 assistant_prefix_len = len(tokenizer.encode(TURN_PREFIX.format(role=turn['role']), add_special_tokens=False))
 267 +           if assistant_suffix_len is None:
 268 +               assistant_suffix_len = (
 269 +                   len(tokenizer.encode(TURN_SUFFIX.format(role=turn['role']), add_special_tokens=False)) -
 270 +                   len(tokenizer.encode(TURN_SUFFIX_TAKE, add_special_tokens=False))
 271 +               )
 272 +           turn_sample['token_type_ids'][assistant_prefix_len:-assistant_suffix_len] = [1] * (len(turn_sample['input_ids']) - assistant_prefix_len - assistant_suffix_len)
 273         if sample is None:
 274             sample = turn_sample
 275         else:
 290
 291     # ! testing
 292     sample = seallm_7b_v25_tokenize_multi_turns(tokenizer, conversations)
 293 +   tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
 294 +   pairs = [(x, y) for x, y in zip(tokens, sample['token_type_ids'])]
 295 +   print(pairs)
 296
 297 +   # source and special tokens is masked out (token_type 0), only assistant with <eos> is trained (token_type 1)
 298 +   # [('<bos>', 0), ('<', 0), ('|', 0), ..., ('assistant', 0), ('\n', 0), ('Hi', 1), ('▁there', 1), (',', 1), ('▁how', 1), ('▁can', 1), ('▁I', 1), ('▁help', 1), ('?', 1), ('<eos>', 1), ('\n', 0), ('<', 0), ...
 299
 300     ```
 301