AlexTransformer committed · Commit f0fdc4b (verified) · Parent: 9118d60

Update app.py

Files changed (1)
  1. app.py  +56 -10
app.py CHANGED
@@ -2,11 +2,13 @@ import torch
 import torch.nn.functional as F
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
 
+# Load model and tokenizer
 model_name = "baidu/ERNIE-4.5-0.3B-PT"
-
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
@@ -16,6 +18,7 @@ model = AutoModelForCausalLM.from_pretrained(
 
 embedding_layer = model.get_input_embeddings()
 
+# Get sentence embedding by averaging token embeddings
 def get_sentence_embedding(text):
     inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True).to(device)
     with torch.no_grad():
@@ -23,21 +26,64 @@ def get_sentence_embedding(text):
     sentence_embedding = embeddings.mean(dim=1)
     return sentence_embedding
 
-def calculate_similarity(sentence1, sentence2):
+# Show token list and token IDs
+def tokenize_sentence(sentence):
+    tokens = tokenizer.tokenize(sentence)
+    token_ids = tokenizer.convert_tokens_to_ids(tokens)
+    return list(zip(tokens, token_ids))
+
+# PCA plot of two sentence embeddings
+def plot_embeddings(sentence1, sentence2):
+    emb1 = get_sentence_embedding(sentence1).cpu().numpy()
+    emb2 = get_sentence_embedding(sentence2).cpu().numpy()
+    embeddings = np.vstack([emb1, emb2])  # Shape: (2, hidden_size)
+
+    # PCA to reduce to 2D
+    pca = PCA(n_components=2)
+    reduced = pca.fit_transform(embeddings)
+
+    # Plot
+    fig, ax = plt.subplots()
+    ax.scatter(reduced[:, 0], reduced[:, 1], color=["red", "blue"])
+    ax.annotate("Sentence 1", (reduced[0, 0], reduced[0, 1]), color="red")
+    ax.annotate("Sentence 2", (reduced[1, 0], reduced[1, 1]), color="blue")
+    ax.set_title("2D PCA of Sentence Embeddings")
+    ax.set_xlabel("PCA 1")
+    ax.set_ylabel("PCA 2")
+    ax.grid(True)
+    return fig
+
+# Main function to run all outputs
+def analyze_sentences(sentence1, sentence2):
+    # Cosine similarity
     emb1 = get_sentence_embedding(sentence1)
     emb2 = get_sentence_embedding(sentence2)
     similarity = F.cosine_similarity(emb1, emb2).item()
-    return f"Similarity: {similarity:.4f}"
 
+    # Token info
+    tokens1 = tokenize_sentence(sentence1)
+    tokens2 = tokenize_sentence(sentence2)
+
+    # Plot
+    fig = plot_embeddings(sentence1, sentence2)
+
+    return f"Similarity: {similarity:.4f}", tokens1, tokens2, fig
+
+# Build Gradio interface
 demo = gr.Interface(
-    fn=calculate_similarity,
+    fn=analyze_sentences,
     inputs=[
-        gr.Textbox(label="Sentence 1", placeholder="我爱北京"),
-        gr.Textbox(label="Sentence 2", placeholder="我爱上海"),
+        gr.Textbox(label="Sentence 1", placeholder="I love cat."),
+        gr.Textbox(label="Sentence 2", placeholder="I love dog."),
+    ],
+    outputs=[
+        gr.Textbox(label="Cosine Similarity Score"),
+        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 1 Tokens"),
+        gr.Dataframe(headers=["Token", "Token ID"], label="Sentence 2 Tokens"),
+        gr.Plot(label="2D PCA Plot of Embeddings"),
     ],
-    outputs=gr.Textbox(label="Similarity"),
-    title="Calculate two sentences's similarity by ERNIE 4.5-0.3B's embedding layer",
-    description="This app uses the embedding layer of Baidu ERNIE-4.5-0.3B-PT model to compute the cosine similarity between two sentences.",
+    title="ERNIE 4.5 Embedding Visualization",
+    description="Compare two sentences using ERNIE 4.5-0.3B's embedding layer. Outputs cosine similarity, token info, and PCA plot.",
 )
 
 if __name__ == "__main__":
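
The app's description says the similarity comes straight from the model's embedding layer, with no forward pass through the transformer. The lookup line inside get_sentence_embedding is unchanged context that this diff does not show, so the embedding_layer(input_ids) call below is an assumption from the surrounding code, and the minimal from_pretrained arguments are likewise a sketch rather than the committed ones:

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

# Self-contained sketch of the embedding-layer similarity idea.
model_name = "baidu/ERNIE-4.5-0.3B-PT"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
embedding_layer = model.get_input_embeddings()

def embed(text):
    # Token IDs -> static embedding vectors -> mean-pool over the sequence.
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        token_vecs = embedding_layer(input_ids)  # (1, seq_len, hidden_size)
    return token_vecs.mean(dim=1)                # (1, hidden_size)

sim = F.cosine_similarity(embed("I love cat."), embed("I love dog.")).item()
print(f"Similarity: {sim:.4f}")

Because only the static embedding table is consulted, the score largely reflects shared tokens rather than contextual meaning, which is the point of the demo.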
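
A note on reading the new PCA plot: with exactly two points, PCA centers them and the first component is the line between them, so the pair always lands symmetrically on the "PCA 1" axis and "PCA 2" carries essentially zero variance. A tiny sketch with random stand-in vectors (the feature width 16 is arbitrary; the app's real stack is (2, hidden_size)):

import numpy as np
from sklearn.decomposition import PCA

# Two random vectors standing in for the two sentence embeddings.
rng = np.random.default_rng(0)
pair = rng.normal(size=(2, 16))

reduced = PCA(n_components=2).fit_transform(pair)
print(reduced)                                    # roughly [[+r, 0], [-r, 0]]
print(np.allclose(reduced[:, 1], 0.0, atol=1e-5)) # second component is degenerate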
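
For a quick check of the new four-part return signature without launching the UI, something like the following should work; smoke_test.py is an illustrative name, and it assumes the updated file is saved as app.py with its dependencies (torch, transformers, gradio, numpy, matplotlib, scikit-learn) installed:

# smoke_test.py (hypothetical helper, not part of this commit)
from app import analyze_sentences  # importing app.py loads the model once

text, tokens1, tokens2, fig = analyze_sentences("I love cat.", "I love dog.")
print(text)                  # "Similarity: 0.xxxx" (exact value depends on the model)
print(tokens1)               # [(token, token_id), ...] pairs for sentence 1
fig.savefig("pca_plot.png")  # persist the figure that gr.Plot would display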