Spaces:

AlexTransformer
/

ernie_embebbing_layer_example

Sleeping

App Files Files Community

AlexTransformer committed on Aug 5

Commit

f0fdc4b

verified ·

1 Parent(s): 9118d60

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -10

app.py CHANGED Viewed

@@ -2,11 +2,13 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 model_name = "baidu/ERNIE-4.5-0.3B-PT"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
@@ -16,6 +18,7 @@ model = AutoModelForCausalLM.from_pretrained(
 embedding_layer = model.get_input_embeddings()
 def get_sentence_embedding(text):
     inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
     with torch.no_grad():
@@ -23,21 +26,64 @@ def get_sentence_embedding(text):
         sentence_embedding = embeddings.mean(dim=1)
     return sentence_embedding
-def calculate_similarity(sentence1, sentence2):
     emb1 = get_sentence_embedding(sentence1)
     emb2 = get_sentence_embedding(sentence2)
     similarity = F.cosine_similarity(emb1, emb2).item()
-    return f"Similarity: {similarity:.4f}"
 demo = gr.Interface(
-    fn=calculate_similarity,
     inputs=[
-        gr.Textbox(label="Sentence 1", placeholder="我爱北京"),
-        gr.Textbox(label="Sentence 2", placeholder="我爱上海"),
     ],
-    outputs=gr.Textbox(label="Similarity"),
-    title="Calculate two sentences's similarity by ERNIE 4.5-0.3B's embedding layer",
-    description="This app uses the embedding layer of Baidu ERNIE-4.5-0.3B-PT model to compute the cosine similarity between two sentences.",
 )
 if __name__ == "__main__":

 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+# Load model and tokenizer
 model_name = "baidu/ERNIE-4.5-0.3B-PT"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
 embedding_layer = model.get_input_embeddings()
+# Get sentence embedding by averaging token embeddings
 def get_sentence_embedding(text):
     inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
     with torch.no_grad():
         sentence_embedding = embeddings.mean(dim=1)
     return sentence_embedding
+# Show token list and token IDs
+def tokenize_sentence(sentence):
+    tokens = tokenizer.tokenize(sentence)
+    token_ids = tokenizer.convert_tokens_to_ids(tokens)
+    return list(zip(tokens, token_ids))
+# PCA plot of two sentence embeddings
+def plot_embeddings(sentence1, sentence2):
+    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
+    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
+    embeddings = np.vstack([emb1, emb2])  # Shape: (2, hidden_size)
+    # PCA to reduce to 2D
+    pca = PCA(n_components=2)
+    reduced = pca.fit_transform(embeddings)
+    # Plot
+    fig, ax = plt.subplots()
+    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
+    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
+    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
+    ax.set_title("2D PCA of Sentence Embeddings")
+    ax.set_xlabel("PCA 1")
+    ax.set_ylabel("PCA 2")
+    ax.grid(True)
+    return fig
+# Main function to run all outputs: returns (similarity text, sentence 1 tokens, sentence 2 tokens, PCA figure)
+def analyze_sentences(sentence1, sentence2):
+    # Cosine similarity between the two sentence embeddings
     emb1 = get_sentence_embedding(sentence1)
     emb2 = get_sentence_embedding(sentence2)
     similarity = F.cosine_similarity(emb1, emb2).item()
+    # Token info: (token, token ID) pairs for each sentence
+    tokens1 = tokenize_sentence(sentence1)
+    tokens2 = tokenize_sentence(sentence2)
+    # Plot (plot_embeddings computes both embeddings again from the raw text)
+    fig = plot_embeddings(sentence1, sentence2)
+    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig  # score is formatted to 4 decimal places
+# Build Gradio interface: two text inputs feed analyze_sentences; its four return values fill the four outputs in order
 demo = gr.Interface(
+    fn=analyze_sentences,
     inputs=[
+        gr.Textbox(label="Sentence 1", placeholder="I love cat."),
+        gr.Textbox(label="Sentence 2", placeholder="I love dog."),
+    ],
+    outputs=[
+        gr.Textbox(label="Cosine Similarity Score"),  # formatted similarity string
+        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),  # (token, token ID) rows for sentence 1
+        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),  # (token, token ID) rows for sentence 2
+        gr.Plot(label="2D PCA Plot of Embeddings"),  # figure returned by plot_embeddings
     ],
+    title="ERNIE 4.5 Embedding Visualization",
+    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer. Outputs cosine similarity, token info, and PCA plot.",
 )
 if __name__ == "__main__":