import os

from transformers import AutoTokenizer, CLIPTokenizer, CLIPTokenizerFast

from .build import *
from .transformer import *


def build_lang_encoder(config_encoder, tokenizer, verbose, **kwargs):
    """Instantiate the language encoder named in ``config_encoder['NAME']``.

    Args:
        config_encoder: Config mapping; must contain the registered encoder
            name under the ``'NAME'`` key.
        tokenizer: Tokenizer instance passed through to the encoder factory.
        verbose: Verbosity flag passed through to the encoder factory.
        **kwargs: Extra keyword arguments forwarded to the encoder factory.

    Returns:
        The constructed language-encoder model.

    Raises:
        ValueError: If ``NAME`` is not a registered language encoder
            (checked via ``is_lang_encoder`` from ``.build``).
    """
    model_name = config_encoder['NAME']

    if not is_lang_encoder(model_name):
        raise ValueError(f'Unknown model: {model_name}')

    return lang_encoders(model_name)(config_encoder, tokenizer, verbose, **kwargs)


def build_tokenizer(config_encoder):
    """Build a Hugging Face tokenizer selected by ``config_encoder['TOKENIZER']``.

    Recognized values:
        ``'clip'``       — slow ``CLIPTokenizer`` with the EOS token registered
                           as the CLS token (some downstream code expects a
                           ``cls_token`` on CLIP tokenizers).
        ``'clip-fast'``  — ``CLIPTokenizerFast`` converted from the slow
                           tokenizer (``from_slow=True``).
        ``'biomed-clip'``— BiomedBERT tokenizer from the Microsoft hub repo.
        anything else    — treated as a model id/path for ``AutoTokenizer``.

    For the CLIP variants, ``config_encoder['PRETRAINED_TOKENIZER']`` may
    override the default ``openai/clip-vit-base-patch32`` checkpoint.

    Returns:
        The constructed tokenizer instance.
    """
    # Silence the fork-related warning from the Rust tokenizers backend.
    os.environ['TOKENIZERS_PARALLELISM'] = 'true'

    tokenizer_name = config_encoder['TOKENIZER']

    if tokenizer_name == 'clip':
        pretrained_tokenizer = config_encoder.get(
            'PRETRAINED_TOKENIZER', 'openai/clip-vit-base-patch32'
        )
        tokenizer = CLIPTokenizer.from_pretrained(pretrained_tokenizer)
        # CLIP has no native CLS token; alias it to EOS for code that needs one.
        tokenizer.add_special_tokens({'cls_token': tokenizer.eos_token})
    elif tokenizer_name == 'clip-fast':
        pretrained_tokenizer = config_encoder.get(
            'PRETRAINED_TOKENIZER', 'openai/clip-vit-base-patch32'
        )
        tokenizer = CLIPTokenizerFast.from_pretrained(pretrained_tokenizer, from_slow=True)
    elif tokenizer_name == 'biomed-clip':
        tokenizer = AutoTokenizer.from_pretrained(
            "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
        )
    else:
        # Fall back to interpreting the value as a hub model id or local path.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    return tokenizer