import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Load the ERNIE 4.5 tokenizer and model; float32 keeps things simple on both CPU and GPU.
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,
).to(device)

# Only the input embedding table is used below; no transformer forward pass is needed.
embedding_layer = model.get_input_embeddings()


def get_sentence_embedding(text):
    """Mean-pool the model's input embeddings for a sentence into a single vector."""
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs["input_ids"])  # (1, seq_len, hidden_size)
        sentence_embedding = embeddings.mean(dim=1)        # (1, hidden_size)
    return sentence_embedding

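# Example (shapes only; actual token IDs depend on the ERNIE tokenizer):
#   get_sentence_embedding("I love cats.") tokenizes to a (1, seq_len) id tensor,
#   looks up a (1, seq_len, hidden_size) embedding tensor, and averages over
#   dim=1 to return one (1, hidden_size) sentence vector.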

def tokenize_sentence(sentence):
    """Return (token, token_id) pairs for display in the Gradio dataframes."""
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return list(zip(tokens, token_ids))

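# The result is a list of pairs such as [("I", 40), ("love", 1234), ...];
# the token strings and IDs shown here are illustrative, not taken from the real vocab.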

def plot_embeddings(sentence1, sentence2):
    """Project the two sentence embeddings to 2D with PCA and plot them."""
    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
    embeddings = np.vstack([emb1, emb2])  # shape (2, hidden_size)

    # With only two samples, PCA is a visual aid: the first component captures
    # essentially all of the variance and the second is close to zero.
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    fig, ax = plt.subplots()
    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
    ax.set_title("2D PCA of Sentence Embeddings")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    return fig

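# Cosine similarity between the mean-pooled vectors a and b:
#   cos(a, b) = (a · b) / (||a|| * ||b||), ranging from -1 to 1 (higher = more similar).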

def analyze_sentences(sentence1, sentence2):
    """Gradio callback: similarity score, token tables, and a PCA plot."""
    emb1 = get_sentence_embedding(sentence1)
    emb2 = get_sentence_embedding(sentence2)
    similarity = F.cosine_similarity(emb1, emb2).item()

    tokens1 = tokenize_sentence(sentence1)
    tokens2 = tokenize_sentence(sentence2)

    fig = plot_embeddings(sentence1, sentence2)

    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig

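# The four return values of analyze_sentences map, in order, to the four output
# components declared below (similarity textbox, two token dataframes, and the plot).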

demo = gr.Interface(
    fn=analyze_sentences,
    inputs=[
        gr.Textbox(label="Sentence 1", placeholder="I love cats."),
        gr.Textbox(label="Sentence 2", placeholder="I love dogs."),
    ],
    outputs=[
        gr.Textbox(label="Cosine Similarity Score"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
        gr.Plot(label="2D PCA Plot of Embeddings"),
    ],
    title="ERNIE 4.5 Embedding Visualization",
    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer: cosine similarity, token info, and a PCA plot.",
)

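# Tip: demo.launch(share=True) would additionally expose a temporary public URL.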

if __name__ == "__main__":
    demo.launch()