import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Load model and tokenizer
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float32,
).to(device)
embedding_layer = model.get_input_embeddings()
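# Note: embedding_layer is the model's input token-embedding table (vocab_size x hidden_size).
# No transformer layers are run below, so all sentence vectors are static, context-free embeddings.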
# Get sentence embedding by averaging token embeddings
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
    with torch.no_grad():
        embeddings = embedding_layer(inputs["input_ids"])  # (1, seq_len, hidden_size)
    # Mean-pool over the token dimension to get a single sentence vector
    sentence_embedding = embeddings.mean(dim=1)  # (1, hidden_size)
    return sentence_embedding
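# Illustrative usage (shapes only; hidden_size depends on the checkpoint's config):
#   get_sentence_embedding("I love cats.")  ->  tensor of shape (1, hidden_size)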
# Show token list and token IDs
def tokenize_sentence(sentence):
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return list(zip(tokens, token_ids))
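# Illustrative output format (actual token strings and IDs depend on the ERNIE tokenizer):
#   tokenize_sentence("I love cats.")  ->  [(token_str, token_id), ...]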
# PCA plot of two sentence embeddings
def plot_embeddings(sentence1, sentence2):
    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
    embeddings = np.vstack([emb1, emb2])  # Shape: (2, hidden_size)
    # PCA to reduce to 2D
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)
    # Plot
    fig, ax = plt.subplots()
    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
    ax.set_title("2D PCA of Sentence Embeddings")
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    ax.grid(True)
    return fig
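# Note: with only two points, all variance lies along the line between them, so the second
# principal component is essentially degenerate; the plot mainly conveys separation along PCA 1.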
# Main function to run all outputs
def analyze_sentences(sentence1, sentence2):
    # Cosine similarity
    emb1 = get_sentence_embedding(sentence1)
    emb2 = get_sentence_embedding(sentence2)
    similarity = F.cosine_similarity(emb1, emb2).item()
    # Token info
    tokens1 = tokenize_sentence(sentence1)
    tokens2 = tokenize_sentence(sentence2)
    # Plot
    fig = plot_embeddings(sentence1, sentence2)
    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig
# Build Gradio interface
demo = gr.Interface(
    fn=analyze_sentences,
    inputs=[
        gr.Textbox(label="Sentence 1", placeholder="I love cats."),
        gr.Textbox(label="Sentence 2", placeholder="I love dogs."),
    ],
    outputs=[
        gr.Textbox(label="Cosine Similarity Score"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
        gr.Plot(label="2D PCA Plot of Embeddings"),
    ],
    title="ERNIE 4.5 Embedding Visualization",
    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer. Outputs the cosine similarity, per-sentence token breakdowns, and a 2D PCA plot.",
)
if __name__ == "__main__":
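    # Optional local sanity check before serving (hypothetical example sentences):
    # print(analyze_sentences("I love cats.", "I love dogs.")[0])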
    demo.launch()