import gradio as gr
from faiss import IndexFlatIP
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
input_embeddings = np.load("bert_input_embeddings.npy")

# Build an inner-product index over BERT's input (word-piece) embeddings.
index = IndexFlatIP(input_embeddings.shape[-1])
index.add(input_embeddings)

# Map token ids back to token strings for display.
vocab = {v: k for k, v in tokenizer.vocab.items()}
lookup_table = pd.Series(vocab).sort_index()


def get_first_subword(word):
    """Return the vocabulary id of `word`, or of its first sub-word piece."""
    try:
        return tokenizer.vocab[word]
    except KeyError:
        return tokenizer(word, add_special_tokens=False)["input_ids"][0]


def search(token_to_lookup, num_neighbors=100):
    """Return the tokens whose input embeddings are closest to the query token's."""
    i = get_first_subword(token_to_lookup)
    _, I = index.search(input_embeddings[i:i + 1], num_neighbors)
    hits = lookup_table.take(I[0])
    return hits.values


iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=1, placeholder="Enter token..."),
    outputs=gr.Textbox(label="Results"),
    examples=[
        ["##logy"],
        ["responded"],
    ],
)

iface.launch(enable_queue=True, debug=True, show_error=True)
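
# Note (optional, hedged sketch): IndexFlatIP ranks neighbors by raw inner product.
# If cosine-similarity neighbors are preferred instead, the embeddings can be
# L2-normalized before indexing. A minimal sketch, assuming the same
# bert_input_embeddings.npy as above; `normalized` and `cosine_index` are
# hypothetical names introduced here for illustration:
#
#   import faiss
#   normalized = input_embeddings.astype("float32")  # faiss expects float32
#   faiss.normalize_L2(normalized)                   # in-place row-wise L2 normalization
#   cosine_index = IndexFlatIP(normalized.shape[-1])
#   cosine_index.add(normalized)
#   # query vectors passed to cosine_index.search() must be normalized the same way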