import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the ERNIE 4.5 tokenizer and model; float32 keeps things simple on both CPU and GPU.
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,
).to(device)

# Only the input embedding table is used below; no transformer forward pass is needed.
embedding_layer = model.get_input_embeddings()


def get_sentence_embedding(text):
    """Mean-pool the model's input embeddings for a sentence into a single vector."""
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs["input_ids"])  # (1, seq_len, hidden_size)
        sentence_embedding = embeddings.mean(dim=1)        # (1, hidden_size)
    return sentence_embedding

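# Example (shapes only; actual token IDs depend on the ERNIE tokenizer):
#   get_sentence_embedding("I love cats.") tokenizes to a (1, seq_len) id tensor,
#   looks up a (1, seq_len, hidden_size) embedding tensor, and averages over
#   dim=1 to return one (1, hidden_size) sentence vector.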

def tokenize_sentence(sentence):
    """Return (token, token_id) pairs for display in the Gradio dataframes."""
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return list(zip(tokens, token_ids))

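# The result is a list of pairs such as [("I", 40), ("love", 1234), ...];
# the token strings and IDs shown here are illustrative, not taken from the real vocab.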

def plot_embeddings(sentence1, sentence2):
    """Project the two sentence embeddings to 2D with PCA and plot them."""
    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
    embeddings = np.vstack([emb1, emb2])  # shape (2, hidden_size)

    # With only two samples, PCA is a visual aid: the first component captures
    # essentially all of the variance and the second is close to zero.
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    fig, ax = plt.subplots()
    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
    ax.set_title("2D PCA of Sentence Embeddings")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    return fig

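# Cosine similarity between the mean-pooled vectors a and b:
#   cos(a, b) = (a · b) / (||a|| * ||b||), ranging from -1 to 1 (higher = more similar).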

def analyze_sentences(sentence1, sentence2):
    """Gradio callback: similarity score, token tables, and a PCA plot."""
    emb1 = get_sentence_embedding(sentence1)
    emb2 = get_sentence_embedding(sentence2)
    similarity = F.cosine_similarity(emb1, emb2).item()

    tokens1 = tokenize_sentence(sentence1)
    tokens2 = tokenize_sentence(sentence2)

    fig = plot_embeddings(sentence1, sentence2)

    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig

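# The four return values of analyze_sentences map, in order, to the four output
# components declared below (similarity textbox, two token dataframes, and the plot).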

demo = gr.Interface(
    fn=analyze_sentences,
    inputs=[
        gr.Textbox(label="Sentence 1", placeholder="I love cats."),
        gr.Textbox(label="Sentence 2", placeholder="I love dogs."),
    ],
    outputs=[
        gr.Textbox(label="Cosine Similarity Score"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
        gr.Plot(label="2D PCA Plot of Embeddings"),
    ],
    title="ERNIE 4.5 Embedding Visualization",
    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer: cosine similarity, token info, and a PCA plot.",
)

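# Tip: demo.launch(share=True) would additionally expose a temporary public URL.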

if __name__ == "__main__":
    demo.launch()