|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
from sentence_transformers import SentenceTransformer |
|
from sklearn.metrics.pairwise import euclidean_distances |
|
|
|
|
|
# Table of patent abstracts together with their pre-computed embedding
# columns; read once when the module is loaded.
text_embeddings = pd.read_parquet('text_embeddings_abstract_generated_by_LLM.parquet')

# Sentence-transformer checkpoints offered to the user, in the order they
# appear in the dropdown.
_MODEL_NAMES = (
    'all-MiniLM-L6-v2',
    'intfloat/e5-large-v2',
    'intfloat/e5-small-v2',
    'thenlper/gte-large',
    'avsolatorio/GIST-large-Embedding-v0',
)

# Every checkpoint is loaded eagerly and keyed by its name.
model_options = {name: SentenceTransformer(name) for name in _MODEL_NAMES}

# Direct handles to the individual models for code that refers to one by name.
model_all_Mini = model_options['all-MiniLM-L6-v2']
model_e5_large_v2 = model_options['intfloat/e5-large-v2']
model_e5_small_v2 = model_options['intfloat/e5-small-v2']
model_gte_large = model_options['thenlper/gte-large']
model_GIST_large = model_options['avsolatorio/GIST-large-Embedding-v0']
|
|
|
|
|
# Largest all-MiniLM-L6-v2 Euclidean distance at which the query still counts
# as being close to at least one stored abstract.
_RELEVANCE_GATE_DISTANCE = 1.05

# Number of nearest abstracts returned to the user.
_TOP_K = 5

# Column holding the embeddings used for the relevance gate.
_GATE_EMBEDDING_COLUMN = 'embedding_all-MiniLM-L6-v2'


def _distances_to_query(embedding_column, query_embedding):
    """Return the Euclidean distance from each stored embedding to the query."""
    embedding_matrix = np.vstack(text_embeddings[embedding_column])
    return euclidean_distances(embedding_matrix, query_embedding).flatten()


def _format_patents(patents):
    """Render (patent no, title, abstract) triples as one HTML fragment."""
    return '<br><br>'.join(
        f"<strong>Patent No:</strong> {patent_no}<br>"
        f"<strong>Title:</strong> {title}<br>"
        f"<strong>Abstract:</strong> {abstract}<br>"
        for patent_no, title, abstract in patents
    )


def find_similar_texts(model_name, input_text):
    """Return the stored patent abstracts closest to ``input_text`` as HTML.

    The query is first embedded with all-MiniLM-L6-v2. If no stored abstract
    lies within ``_RELEVANCE_GATE_DISTANCE`` of it, the search is rejected.
    Otherwise the abstracts are ranked by Euclidean distance in the embedding
    space of the model chosen by the user and the ``_TOP_K`` nearest ones are
    formatted.

    Args:
        model_name: Key of ``model_options`` selecting the ranking model; the
            matching embeddings are read from the ``'embedding_' + model_name``
            column of ``text_embeddings``.
        input_text: Free-text description to search for.

    Returns:
        An HTML string listing the nearest patents, or a plain message when no
        model is selected, the description is empty, or nothing is close
        enough.

    Raises:
        KeyError: If ``model_name`` is not a key of ``model_options``.
    """
    if not model_name:
        return "You have forgotten to select a sentence-transformer."
    # An empty description has nothing to match; skip the model calls entirely.
    if not input_text or not input_text.strip():
        return "Please enter a description to search for."

    gate_embedding = model_all_Mini.encode(input_text).reshape(1, -1)
    gate_distances = _distances_to_query(_GATE_EMBEDDING_COLUMN, gate_embedding)
    if not (gate_distances < _RELEVANCE_GATE_DISTANCE).any():
        return "It seems there is no patent abstract close to your description."

    selected_model = model_options[model_name]
    embedding_column = 'embedding_' + model_name
    if selected_model is model_all_Mini and embedding_column == _GATE_EMBEDDING_COLUMN:
        # The gate already produced exactly these distances; do not encode twice.
        distances = gate_distances
    else:
        query_embedding = selected_model.encode(input_text).reshape(1, -1)
        distances = _distances_to_query(embedding_column, query_embedding)

    # Rank through an index array instead of writing a distance column into
    # the shared DataFrame, so simultaneous requests cannot overwrite each
    # other's results.
    nearest = np.argsort(distances, kind='stable')[:_TOP_K]
    top_patents = text_embeddings.iloc[nearest][['patent no', 'title', 'abstract']]
    return _format_patents(top_patents.itertuples(index=False, name=None))
|
|
|
|
|
# Explanatory text rendered beneath the search form. Kept at column 0 so the
# Markdown is never mistaken for an indented code block.
_DESCRIPTION_MARKDOWN = """
### Description

This demo app leverages several Sentence Transformer models to compute the semantic distance between user input and a small number of AI generated patent abstracts in the field of machine learning and AI.

- 'all-MiniLM-L6-v2': embedding size is 384. [More info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [here](https://sbert.net/).
- 'intfloat/e5-large-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 1024. [More info](https://huggingface.co/intfloat/e5-large-v2).
- 'intfloat/e5-small-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 384. [More info](https://huggingface.co/intfloat/e5-small-v2).
- 'thenlper/gte-large': General Text Embeddings (GTE) model, embedding size is 1024. [More info](https://huggingface.co/thenlper/gte-large) and [here](https://arxiv.org/abs/2308.03281).
- 'avsolatorio/GIST-large-Embedding-v0': Fine-tuned on top of the BAAI/bge-large-en-v1.5 using the MEDI dataset augmented with mined triplets from the MTEB Classification training dataset, embedding size is 1024. [More info](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0) and [here](https://arxiv.org/abs/2402.16829).

<strong>Please note: The patent data used in this demo are AI generated. This app is intended only for demonstration purposes.</strong>
"""

# Two-column page: search controls on the left, results on the right, and the
# description underneath.
with gr.Blocks() as demo:
    gr.Markdown("## Sentence-Transformer based AI-Generated-Patent-Abstract Search")
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(choices=list(model_options.keys()), label="Choose Sentence-Transformer")
            text_input = gr.Textbox(lines=2, placeholder="machine learning for drug dosing", label="Input Text (example: machine learning for drug dosing. Remark: This is only a small number of AI generated machine learning patents!)")
            submit_button = gr.Button("Search")

        with gr.Column():
            output = gr.HTML(label="Top 5 Patent Abstracts")

    gr.Markdown(_DESCRIPTION_MARKDOWN)

    # The same search runs when the button is clicked, when another model is
    # picked, and when the text box is submitted.
    search_wiring = {"inputs": [model_selector, text_input], "outputs": output}
    submit_button.click(find_similar_texts, **search_wiring)
    model_selector.change(find_similar_texts, **search_wiring)
    text_input.submit(find_similar_texts, **search_wiring)

# Start the server only when run as a script, so importing this module (for
# example from a test) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|