Spaces:
Build error
Build error
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sentence_transformers.util import cos_sim | |
from sentence_transformers import SentenceTransformer | |
from bokeh.plotting import figure, output_notebook, show, save | |
from bokeh.io import output_file, show | |
from bokeh.models import ColumnDataSource, HoverTool | |
from sklearn.manifold import TSNE | |
def _cache_model_loader(loader):
    """Wrap *loader* in Streamlit's resource cache when this version has one."""
    # `st.cache_resource` is the current API and `st.experimental_singleton`
    # its predecessor on older releases; with neither available the loader is
    # returned unwrapped so the app still works, just without caching.
    for decorator_name in ("cache_resource", "experimental_singleton"):
        decorator = getattr(st, decorator_name, None)
        if decorator is not None:
            return decorator(loader)
    return loader


@_cache_model_loader
def load_model():
    """Return the Spanish paraphrase SentenceTransformer in evaluation mode.

    The loader is cached because Streamlit re-executes the script on every
    widget interaction; without caching, each button press would construct
    the model again from its checkpoint.

    Returns:
        The ``SentenceTransformer`` built from
        ``hackathon-pln-es/paraphrase-spanish-distilroberta`` after ``eval()``
        has been called on it.
    """
    model = SentenceTransformer('hackathon-pln-es/paraphrase-spanish-distilroberta')
    model.eval()
    return model
# Page heading, followed by the introductory paragraphs rendered in order.
st.title("Sentence Embedding for Spanish with Bertin")

_INTRO_PARAGRAPHS = (
    "Sentence embedding for spanish trained according to instructions in the paper [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/pdf/2004.09813.pdf) and the [documentation](https://www.sbert.net/examples/training/multilingual/README.html) accompanying its companion python package. We have used the strongest available pretrained English Bi-Encoder ([paraphrase-mpnet-base-v2](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models)) as a teacher model, and the pretrained Spanish [BERTIN](https://huggingface.co/bertin-project/bertin-roberta-base-spanish) as the student model.Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/paraphrase-spanish-distilroberta.",
    "Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.",
    "Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli Pinto y Mauricio Mazuecos.",
)
for _paragraph in _INTRO_PARAGRAPHS:
    st.write(_paragraph)

# One free-text box per sentence to compare; created in order so the first
# box appears above the second on the page.
sent1, sent2 = (
    st.text_area('Enter sentence 1'),
    st.text_area('Enter sentence 2'),
)
# Compute and display the similarity only on the rerun triggered by a click
# on the button; on any other rerun nothing is shown.
if st.button('Compute similarity'):
    # Whitespace-only input carries no sentence, so treat it as missing
    # instead of embedding it. The text is still encoded exactly as typed.
    if sent1.strip() and sent2.strip():
        model = load_model()
        encodings = model.encode([sent1, sent2])
        # cos_sim yields a 1x1 result for a single pair of embeddings;
        # .item() unwraps it into a plain Python float for formatting.
        sim = cos_sim(encodings[0], encodings[1]).item()
        st.text('Cosine Similarity: {0:.4f}'.format(sim))
    else:
        st.write('Missing a sentence')