import json

import torch
from tokenizers import Tokenizer

from model import BraLM, Vocab

# Byte-level BPE tokenizer (4k vocabulary) used to segment the English text.
bpe_tokenizer = Tokenizer.from_file("wiki_bpe_tokenizer_4000_bytelevel.json")
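# Vocab maps ordered token pairs, written as "prev->next" strings, to indices
# and back; decoding below works over these pairs rather than single tokens.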
def decode_en_sentence(head, max_token=32, do_sample=False):
    bpe_tokens = bpe_tokenizer.encode(head).tokens
    # A single-token prompt cannot form a "prev->next" pair, so return it unchanged.
    if len(bpe_tokens) < 2:
        return head
    # Map each pair of consecutive prompt tokens to its pair index.
    start = [vocab(bpe_tokens[i] + '->' + bpe_tokens[i + 1]) for i in range(len(bpe_tokens) - 1)]
    # Let the model extend the sequence by up to max_token additional pairs.
    ret = model.decode(start, vocab, max_token, do_sample)
    # Recover the pair strings and stitch them back together: the first token of
    # the first pair, then the second token of every pair.
    decode_tuple_list = [vocab.decode(p).split('->') for p in ret]
    decode_sentence = decode_tuple_list[0][0] + "".join([p[-1] for p in decode_tuple_list])
    return decode_sentence
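# Illustrative call (assumes sampling is supported by model.decode, to which
# do_sample is forwarded unchanged):
#   decode_en_sentence("In frogs, the hind legs are larger", max_token=24, do_sample=True)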

# Build the vocabulary from the saved node dictionary.
with open("./vocab_wiki_4k_en.json") as f:
    node_dict = json.load(f)
vocab = Vocab.from_node_dict(node_dict)

# Build the model, prepare its network over this vocabulary, and load the
# trained English checkpoint.
model = BraLM(hidden_size=32)
model.prepare_network(vocab)

state_dict = torch.load("model_en.bin", weights_only=True)
model.load_state_dict(state_dict)
model.to_device("cuda:6")
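# Note: "cuda:6" assumes a multi-GPU host; point to_device at whichever GPU is
# available on your machine.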

# Prompt to continue; the generation budget is 32 tokens minus the prompt length.
head = "In frogs, the hind legs are larger"
encoding = bpe_tokenizer.encode(head)
token_len = len(encoding.ids)
max_token = 32 - token_len

# "Ġ" is the byte-level BPE marker for a leading space; replace it to restore
# normal spacing in the printed output.
decode_sentence = decode_en_sentence(head, max_token).replace("Ġ", " ")

print(decode_sentence)