Spaces:
Build error
Build error
File size: 3,764 Bytes
a446b0b 0935f1b a446b0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import os
from tqdm import tqdm
from nltk import tokenize
import numpy as np
import pickle, torch
import comet.src.data.data as data
import comet.src.data.config as cfg
import comet.src.models.utils as model_utils
import comet.src.interactive.functions as interactive
class CSKFeatureExtractor:
    """Extract per-relation feature vectors for text using a pretrained model.

    The model is loaded from
    ``<dir>/comet/pretrained_models/atomic_pretrained_model.pickle`` and put in
    eval mode. For every utterance, each sentence is paired with each of the
    relation categories in ``ATOMIC_KEYS``; the model output at the final
    sequence position (the slot where the category token is written) is taken
    as that sentence's feature and the features are averaged over sentences.
    """

    # Relation categories queried for every sentence. The order here fixes the
    # order of the dictionaries returned by ``extract``.
    ATOMIC_KEYS = (
        "xIntent",
        "xAttr",
        "xNeed",
        "xWant",
        "xEffect",
        "xReact",
        "oWant",
        "oEffect",
        "oReact",
    )

    def __init__(self, dir=".", device=0):
        """Load the pretrained model, its data loader and its text encoder.

        Args:
            dir: Base directory containing the ``comet`` folder. (The name
                shadows the builtin but is kept for caller compatibility.)
            device: ``"cpu"`` to stay on the CPU; any other value is converted
                with ``int()`` and used as the CUDA device index.

        Note:
            This mutates the module-level ``cfg`` settings (``cfg.device`` and,
            on GPU, ``cfg.do_gpu``), which ``set_atomic_inputs`` reads later.
        """
        super(CSKFeatureExtractor, self).__init__()
        model_file = os.path.join(
            dir, "comet/pretrained_models/atomic_pretrained_model.pickle"
        )
        opt, state_dict = interactive.load_model_file(model_file)
        data_loader, text_encoder = interactive.load_data("atomic", opt, dir)

        self.opt = opt
        self.data_loader = data_loader
        self.text_encoder = text_encoder

        # Context length is the event part plus the effect part; the vocabulary
        # passed to the model is extended by one entry per context position.
        n_ctx = data_loader.max_event + data_loader.max_effect
        n_vocab = len(text_encoder.encoder) + n_ctx
        self.model = interactive.make_model(opt, n_vocab, n_ctx, state_dict)
        self.model.eval()

        if device != "cpu":
            cfg.device = int(device)
            cfg.do_gpu = True
            torch.cuda.set_device(cfg.device)
            self.model.cuda(cfg.device)
        else:
            cfg.device = "cpu"

    def set_atomic_inputs(self, input_event, category, data_loader, text_encoder):
        """Build a single-row model input for one event and one category.

        Args:
            input_event: Text of the event to encode.
            category: Relation name; its token is looked up in
                ``text_encoder.encoder`` as ``"<category>"``.
            data_loader: Provides ``max_event``, which sets the row width.
            text_encoder: Encoder used to tokenise ``input_event``.

        Returns:
            dict with ``"sequences"`` (a ``(1, max_event + 1)`` long tensor on
            ``cfg.device``) and ``"attention_mask"`` (built from that tensor by
            ``data.atomic_data.make_attention_mask``).

        Raises:
            KeyError: If ``"<category>"`` is not in ``text_encoder.encoder``.
        """
        width = data_loader.max_event + 1
        XMB = torch.zeros(1, width).long().to(cfg.device)
        prefix, _ = data.atomic_data.do_example(
            text_encoder, input_event, None, True, None
        )
        # Truncate events that are too long to fit in the row.
        if len(prefix) > width:
            prefix = prefix[:width]
        XMB[:, : len(prefix)] = torch.LongTensor(prefix)
        # The last slot always holds the category token, overwriting the final
        # event token when the event fills the whole row.
        XMB[:, -1] = torch.LongTensor([text_encoder.encoder["<{}>".format(category)]])
        return {
            "sequences": XMB,
            "attention_mask": data.atomic_data.make_attention_mask(XMB),
        }

    def _utterance_features(self, utterance):
        """Return the sentence-averaged features for one utterance.

        The result is indexed by category along its first axis, in
        ``ATOMIC_KEYS`` order.
        """
        # Drop every non-ASCII character before tokenising.
        input_event = utterance.encode("ascii", errors="ignore").decode("utf-8")
        # If the tokenizer finds no sentence (e.g. the text is empty after the
        # ASCII filter), fall back to the raw text so that every utterance
        # still yields exactly one feature array and outputs stay aligned with
        # inputs. Previously this case crashed when indexing a NaN scalar.
        # NOTE(review): assumes the encoder and model accept an empty event -
        # confirm against the comet package.
        sentences = tokenize.sent_tokenize(input_event) or [input_event]

        per_sentence = []
        # Inference only: skip autograd bookkeeping for the forward passes.
        with torch.no_grad():
            for sent in sentences:
                batches = [
                    self.set_atomic_inputs(
                        sent, category, self.data_loader, self.text_encoder
                    )
                    for category in self.ATOMIC_KEYS
                ]
                # One row per category, stacked into a single batch.
                XMB = torch.cat([batch["sequences"] for batch in batches])
                MMB = torch.cat([batch["attention_mask"] for batch in batches])
                XMB = model_utils.prepare_position_embeddings(
                    self.opt, self.data_loader.vocab_encoder, XMB.unsqueeze(-1)
                )
                h, _ = self.model(XMB.unsqueeze(1), sequence_mask=MMB)
                # Keep the output at the final position, which is where the
                # category token sits in the input row.
                per_sentence.append(h[:, -1, :].detach().cpu().numpy())

        return np.mean(np.array(per_sentence), axis=0)

    def extract(self, sentence):
        """Compute features for every utterance of every entry in ``sentence``.

        Args:
            sentence: Mapping from a key to an iterable of utterance strings.

        Returns:
            A list with one dict per entry of ``ATOMIC_KEYS`` (same order).
            Each dict maps every key of ``sentence`` to a list holding one
            feature vector per utterance, in input order.
        """
        per_category = [{} for _ in self.ATOMIC_KEYS]

        for item in tqdm(list(sentence.keys())):
            collected = [[] for _ in self.ATOMIC_KEYS]
            for utterance in sentence[item]:
                features = self._utterance_features(utterance)
                for k, bucket in enumerate(collected):
                    bucket.append(features[k])
            for k, mapping in enumerate(per_category):
                mapping[item] = collected[k]

        return per_category
|