File size: 878 Bytes
06a851e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from transformers import CamembertTokenizer


def get_tokenizer(model_name='camembert-base'):
    """Load and return a pretrained CamemBERT tokenizer.

    Parameters
    ----------
    model_name : str
        Hugging Face model identifier to load; defaults to 'camembert-base'.

    Returns
    -------
    CamembertTokenizer
        Tokenizer ready to encode text for the matching model.
    """
    return CamembertTokenizer.from_pretrained(model_name)


def tokenize_encode_corpus(tokenizer, descriptions, max_len):
    """Encode a corpus of texts with the given tokenizer.

    Every sequence is padded to exactly `max_len` tokens and truncated with
    the 'longest_first' strategy when it exceeds that length. Special tokens
    and attention masks are included in the output.

    Parameters
    ----------
    tokenizer : callable
        A Hugging Face tokenizer (or compatible callable).
    descriptions : str or list of str
        Text(s) to encode.
    max_len : int
        Fixed sequence length for padding/truncation.

    Returns
    -------
    The tokenizer's batch encoding (mapping with 'input_ids',
    'attention_mask', ...).
    """
    encoding_options = {
        'add_special_tokens': True,
        'padding': 'max_length',
        'truncation': 'longest_first',
        'max_length': max_len,
        'return_attention_mask': True,
    }
    return tokenizer(text=descriptions, **encoding_options)
    

def extract_inputs_masks(encoded_corpus):
    """Pull input ids and attention masks out of a tokenizer encoding.

    Parameters
    ----------
    encoded_corpus : mapping
        A batch encoding (e.g. the return value of `tokenize_encode_corpus`)
        expected to contain the keys 'input_ids' and 'attention_mask'.

    Returns
    -------
    tuple or None
        `(input_ids, attention_mask)` on success; `None` when either key is
        missing (the available keys are printed as a diagnostic).
    """
    try:
        input_ids = encoded_corpus['input_ids']
        attention_mask = encoded_corpus['attention_mask']
    except KeyError:
        # Narrowed from a bare `except:` so unrelated errors (including
        # KeyboardInterrupt/SystemExit) are no longer silently swallowed.
        print('Available keys are = ', encoded_corpus.keys())
        return None
    return input_ids, attention_mask