import spaces
import gradio as gr
from numpy.linalg import norm
from transformers import AutoModel, AutoTokenizer, AutoConfig
from sentence_transformers import SentenceTransformer
import torch

# Cosine similarity between two 1-D embedding vectors
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

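# Embedding models under comparison (the Jina models load custom code via trust_remote_code and expose an encode() method)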
model1 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-code", trust_remote_code=True)
model2 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
model3 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-zh", trust_remote_code=True)
model4 = SentenceTransformer("aspire/acge_text_embedding")
model5 = SentenceTransformer("intfloat/multilingual-e5-large")

# The Salesforce/codet5p-110m-embedding model needs special handling
config = AutoConfig.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
model6 = AutoModel.from_pretrained("Salesforce/codet5p-110m-embedding", config=config, trust_remote_code=True)

# Mean pooling over token embeddings (ignoring padded positions) to obtain a single sentence embedding
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

@spaces.GPU
def generate(query1, query2, source_code):
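    # Fall back to the example inputs when a field is left empty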
    if len(query1) < 1:
        query1 = "How do I access the index while iterating over a sequence with a for loop?"
    if len(query2) < 1:
        query2 = "get a list of all the keys in a dictionary"
    if len(source_code) < 1:
        source_code = "# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)"

    results = []
    model_names = [
        "jinaai/jina-embeddings-v2-base-code",
        "jinaai/jina-embeddings-v2-base-en",
        "jinaai/jina-embeddings-v2-base-zh",
        "aspire/acge_text_embedding",
        "intfloat/multilingual-e5-large",
        "Salesforce/codet5p-110m-embedding",
    ]
    
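    # Models 1-5 all provide a SentenceTransformer-style encode(); compare each query against the code snippet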
    for model, name in zip([model1, model2, model3, model4, model5], model_names[:-1]):
        embeddings = model.encode([query1, query2, source_code])
        score1 = cos_sim(embeddings[0], embeddings[2])
        score2 = cos_sim(embeddings[1], embeddings[2])
        results.append([name, float(score1), float(score2)])
    
    # Special handling for the Salesforce/codet5p-110m-embedding model: per its model card,
    # the custom forward already returns pooled, L2-normalized sentence embeddings
    inputs = tokenizer([query1, query2, source_code], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        embeddings = model6(**inputs).cpu().numpy()
    score1 = cos_sim(embeddings[0], embeddings[2])
    score2 = cos_sim(embeddings[1], embeddings[2])
    results.append([model_names[-1], float(score1), float(score2)])

    return results

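# Gradio UI: two text queries and one code snippet in, a table of similarity scores out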
gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(label="query1", placeholder="How do I access the index while iterating over a sequence with a for loop?"),
        gr.Text(label="query2", placeholder="get a list of all the keys in a dictionary"),
        gr.Text(label="code", placeholder="# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)"),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Model", "Query1 Score", "Query2 Score"],
            label="Similarity Scores",
        )
    ],
).launch()