Spaces:

Zeitstaub
/

AI-Patents_searched_by_AI

Running

App Files Files Community

AI-Patents_searched_by_AI / app.py

Zeitstaub

Update app.py

471ebd4 verified over 1 year ago

raw

history blame contribute delete

5.25 kB

	import gradio as gr
	import numpy as np
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import euclidean_distances

	# Abstract table with pre-computed embeddings; the search function below reads
	# its 'embedding_<model id>', 'abstract', 'patent no' and 'title' columns.
	text_embeddings = pd.read_parquet('text_embeddings_abstract_generated_by_LLM.parquet')

	# Identifiers of the selectable sentence-transformers, in drop-down order.
	_MODEL_IDS = (
	    'all-MiniLM-L6-v2',
	    'intfloat/e5-large-v2',
	    'intfloat/e5-small-v2',
	    'thenlper/gte-large',
	    'avsolatorio/GIST-large-Embedding-v0',
	)

	# Drop-down label -> loaded model. Every model is instantiated exactly once,
	# in the order listed above.
	model_options = {model_id: SentenceTransformer(model_id) for model_id in _MODEL_IDS}

	# Individual handles kept under their established module-level names.
	model_all_Mini = model_options['all-MiniLM-L6-v2']
	model_e5_large_v2 = model_options['intfloat/e5-large-v2']
	model_e5_small_v2 = model_options['intfloat/e5-small-v2']
	model_gte_large = model_options['thenlper/gte-large']
	model_GIST_large = model_options['avsolatorio/GIST-large-Embedding-v0']

	# Largest all-MiniLM-L6-v2 Euclidean distance at which a stored abstract still
	# counts as related to the query; queries with no abstract this close are rejected.
	_RELEVANCE_THRESHOLD = 1.05


	# Main function for the Gradio interface
	def find_similar_texts(model_name, input_text):
	    """Return the five stored patent abstracts closest to ``input_text`` as HTML.

	    The query is first embedded with all-MiniLM-L6-v2 as a relevance gate: the
	    search only continues when at least one stored abstract lies within
	    ``_RELEVANCE_THRESHOLD`` of it. The ranking itself uses the embeddings of
	    the model selected in the drop-down.

	    The shared ``text_embeddings`` DataFrame is only read, never written, so
	    overlapping requests cannot overwrite each other's distances.

	    Args:
	        model_name: Key of ``model_options`` chosen in the drop-down; falsy
	            when nothing has been selected.
	        input_text: Free-text description to search for.

	    Returns:
	        An HTML fragment listing patent number, title and abstract of the five
	        nearest entries, or a plain message when no model is selected, the
	        input is blank, or no abstract is close enough.

	    Raises:
	        KeyError: If ``model_name`` is truthy but not a key of ``model_options``.
	    """
	    if not model_name:
	        return "You have forgotten to select a sentence-transformer."

	    # Blank input has no meaningful embedding; ask for text instead of searching.
	    if not input_text or not input_text.strip():
	        return "Please enter a description to search for."

	    # Relevance gate: is any abstract close to the query in MiniLM space?
	    gate_query = model_all_Mini.encode(input_text).reshape(1, -1)
	    gate_matrix = np.vstack(text_embeddings['embedding_all-MiniLM-L6-v2'])
	    gate_distances = euclidean_distances(gate_matrix, gate_query).flatten()
	    if not (gate_distances < _RELEVANCE_THRESHOLD).any():
	        return "It seems there is no patent abstract close to your description."

	    selected_model = model_options[model_name]
	    if selected_model is model_all_Mini:
	        # Same model as the gate: reuse its distances instead of re-encoding.
	        distances = gate_distances
	    else:
	        query = selected_model.encode(input_text).reshape(1, -1)
	        matrix = np.vstack(text_embeddings['embedding_' + model_name])
	        distances = euclidean_distances(matrix, query).flatten()

	    # Rank on a local array rather than storing distances in the shared frame.
	    nearest = np.argsort(distances, kind='stable')[:5]
	    top_five = text_embeddings.iloc[nearest][['abstract', 'patent no', 'title']]

	    return '<br><br>'.join(
	        f"<strong>Patent No:</strong> {row['patent no']}<br>"
	        f"<strong>Title:</strong> {row['title']}<br>"
	        f"<strong>Abstract:</strong> {row['abstract']}<br>"
	        for _, row in top_five.iterrows()
	    )

	# Create Gradio interface using Blocks: a two-column row (query controls on the
	# left, HTML results on the right) followed by a description of the models.
	with gr.Blocks() as demo:
	    gr.Markdown("## Sentence-Transformer based AI-Generated-Patent-Abstract Search")
	    with gr.Row():
	        # Query controls
	        with gr.Column():
	            model_selector = gr.Dropdown(choices=list(model_options.keys()), label="Choose Sentence-Transformer")
	            text_input = gr.Textbox(lines=2, placeholder="machine learning for drug dosing", label="Input Text (example: machine learning for drug dosing. Remark: This is only a small number of AI generated machine learning patents!)")
	            submit_button = gr.Button("Search")

	        # Search results, rendered as HTML
	        with gr.Column():
	            output = gr.HTML(label="Top 5 Patent Abstracts")

	    # Run the search when the button is pressed
	    submit_button.click(find_similar_texts, inputs=[model_selector, text_input], outputs=output)


	    gr.Markdown("""
	### Description
	This demo app leverages several Sentence Transformer models to compute the semantic distance between user input and a small number of AI generated patent abstracts in the field of machine learning and AI.

	- 'all-MiniLM-L6-v2': embedding size is 384. [More info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [here](https://sbert.net/).
	- 'intfloat/e5-large-v2'. Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 1024. [More info](https://huggingface.co/intfloat/e5-large-v2).
	- 'intfloat/e5-small-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 384. [More info](https://huggingface.co/intfloat/e5-small-v2).
	- 'thenlper/gte-large': General Text Embeddings (GTE) model, embedding size is 1024. [More info](https://huggingface.co/thenlper/gte-large) and [here](https://arxiv.org/abs/2308.03281).
	- 'avsolatorio/GIST-large-Embedding-v0': Fine-tuned on top of the BAAI/bge-large-en-v1.5 using the MEDI dataset augmented with mined triplets from the MTEB Classification training dataset, embedding size is 1024. [More info](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0) and [here](https://arxiv.org/abs/2402.16829).



	<strong>Please note: The patent data used in this demo are AI generated. This app is intended only for demonstration purposes.</strong>
	""")
	    # Also run the search when another model is picked or the text box is submitted
	    model_selector.change(find_similar_texts, inputs=[model_selector, text_input], outputs=output)
	    text_input.submit(find_similar_texts, inputs=[model_selector, text_input], outputs=output)

	demo.launch()

	# The patents can be viewed at [Espacenet](https://worldwide.espacenet.com/?locale=en_EP), the free online service of the European Patent Office.