zhangxiyi.amos committed
Commit 312d284 · 1 Parent(s): c5064c3

fix: codet5 deployment

Files changed (1)
  1. app.py +24 -3
app.py CHANGED
@@ -1,8 +1,9 @@
 import spaces
 import gradio as gr
 from numpy.linalg import norm
-from transformers import AutoModel
+from transformers import AutoModel, AutoTokenizer, AutoConfig
 from sentence_transformers import SentenceTransformer
+import torch
 
 cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
 
@@ -11,7 +12,17 @@ model2 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", trust_re
 model3 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-zh", trust_remote_code=True)
 model4 = SentenceTransformer("aspire/acge_text_embedding")
 model5 = SentenceTransformer("intfloat/multilingual-e5-large")
-model6 = SentenceTransformer("Salesforce/codet5p-110m-embedding")
+
+# The Salesforce/codet5p-110m-embedding model needs special handling
+config = AutoConfig.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
+model6 = AutoModel.from_pretrained("Salesforce/codet5p-110m-embedding", config=config, trust_remote_code=True)
+
+# Define a simple mean-pooling function to obtain embeddings
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]  # first element of model_output holds the token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
 @spaces.GPU
 def generate(query1, query2, source_code):
@@ -24,11 +35,21 @@ def generate(query1, query2, source_code):
 
     results = []
     model_names = ["jinaai/jina-embeddings-v2-base-code", "jinaai/jina-embeddings-v2-base-en", "jinaai/jina-embeddings-v2-base-zh", "aspire/acge_text_embedding", "intfloat/multilingual-e5-large", "Salesforce/codet5p-110m-embedding"]
-    for model, name in zip([model1, model2, model3, model4, model5, model6], model_names):
+
+    for model, name in zip([model1, model2, model3, model4, model5], model_names[:-1]):
         embeddings = model.encode([query1, query2, source_code])
         score1 = cos_sim(embeddings[0], embeddings[2])
         score2 = cos_sim(embeddings[1], embeddings[2])
         results.append([name, float(score1), float(score2)])
+
+    # Special handling for the Salesforce/codet5p-110m-embedding model
+    inputs = tokenizer([query1, query2, source_code], padding=True, truncation=True, return_tensors="pt")
+    with torch.no_grad():
+        model_output = model6(**inputs)
+    embeddings = mean_pooling(model_output, inputs['attention_mask'])
+    score1 = cos_sim(embeddings[0], embeddings[2])
+    score2 = cos_sim(embeddings[1], embeddings[2])
+    results.append([model_names[-1], float(score1), float(score2)])
 
     return results
 
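Note: the model card for Salesforce/codet5p-110m-embedding suggests the remote-code model pools internally and its forward pass returns one 256-dimensional vector per input, rather than per-token hidden states; if that holds, the mean_pooling step in this commit would be applied to already-pooled vectors. A minimal sketch of the model-card style usage, under that assumption:

import torch
from transformers import AutoModel, AutoTokenizer

# Sketch only: assumes the remote code returns pooled embeddings directly,
# as the model card's example implies.
checkpoint = "Salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

inputs = tokenizer.encode("def print_hello_world():\tprint('Hello World!')", return_tensors="pt")
with torch.no_grad():
    embedding = model(inputs)[0]  # embedding of the first (only) input
print(embedding.size())  # expected: torch.Size([256]) if pooling happens inside the model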
 
 
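As a quick sanity check of the updated generate(), a hypothetical local smoke test (the queries and code snippet below are illustrative, not part of the commit) might look like:

# Hypothetical smoke test; assumes app.py's generate() is importable.
rows = generate(
    "sort a list of integers",                    # query1
    "return a list in ascending order",          # query2
    "def sort_list(xs):\n    return sorted(xs)",  # source_code
)
for name, s1, s2 in rows:
    # each row is [model_name, cos(query1, code), cos(query2, code)]
    print(f"{name}: {s1:.3f} / {s2:.3f}")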