File size: 2,231 Bytes
19488e2
74c5a4a
be10d84
21edaab
 
74c5a4a
be10d84
21edaab
026c201
 
 
21edaab
 
c5064c3
0b2ff15
 
b8bd956
 
 
 
 
 
 
 
 
c5064c3
 
b8bd956
 
 
c0026d3
b8bd956
 
0b2ff15
 
 
6450f47
b8bd956
 
 
6450f47
026c201
b8bd956
c0026d3
b8bd956
 
026c201
0b2ff15
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import spaces
import gradio as gr
from numpy.linalg import norm
from transformers import AutoModel
from sentence_transformers import SentenceTransformer

def cos_sim(a, b):
    """Return the cosine similarity between two embedding vectors.

    The value is the dot product of ``a`` and ``b`` divided by the product
    of their norms as computed by ``numpy.linalg.norm``.

    No guard is applied for zero-norm inputs: an all-zero vector makes the
    denominator zero, exactly as in the plain formula.
    """
    return (a @ b.T) / (norm(a) * norm(b))

# Load every embedding model once at import time so requests reuse them.
# The three Jina checkpoints are loaded through AutoModel with
# trust_remote_code=True, which executes model code shipped in the hub repo.
model1, model2, model3 = (
    AutoModel.from_pretrained(repo_id, trust_remote_code=True)
    for repo_id in (
        "jinaai/jina-embeddings-v2-base-code",
        "jinaai/jina-embeddings-v2-base-en",
        "jinaai/jina-embeddings-v2-base-zh",
    )
)
# The remaining checkpoints are loaded through SentenceTransformer.
model4, model5, model6 = (
    SentenceTransformer(repo_id)
    for repo_id in (
        "aspire/acge_text_embedding",
        "intfloat/multilingual-e5-large",
        "Salesforce/codet5p-110m-embedding",
    )
)

@spaces.GPU
def generate(query1, query2, source_code):
    """Score two queries against a code snippet with every loaded model.

    Args:
        query1: First query text. A built-in example is used when it is
            empty or ``None``.
        query2: Second query text. A built-in example is used when it is
            empty or ``None``.
        source_code: Code snippet both queries are compared with. A built-in
            example is used when it is empty or ``None``.

    Returns:
        A list with one row per model, each row being
        ``[model name, query1-vs-code score, query2-vs-code score]`` with the
        scores converted to plain ``float``.
    """
    # Truthiness fallback instead of len(): behaves the same for strings and
    # additionally tolerates None rather than raising TypeError.
    query1 = query1 or "How do I access the index while iterating over a sequence with a for loop?"
    query2 = query2 or "get a list of all the keys in a dictionary"
    source_code = source_code or "# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)"

    # Each display name sits next to its model so the two cannot drift apart.
    named_models = (
        ("jinaai/jina-embeddings-v2-base-code", model1),
        ("jinaai/jina-embeddings-v2-base-en", model2),
        ("jinaai/jina-embeddings-v2-base-zh", model3),
        ("aspire/acge_text_embedding", model4),
        ("intfloat/multilingual-e5-large", model5),
        ("Salesforce/codet5p-110m-embedding", model6),
    )

    results = []
    for name, model in named_models:
        # One encode call per model; rows follow the input order.
        embeddings = model.encode([query1, query2, source_code])
        code_vec = embeddings[2]
        score1 = cos_sim(embeddings[0], code_vec)
        score2 = cos_sim(embeddings[1], code_vec)
        results.append([name, float(score1), float(score2)])

    return results

# Input widgets: two free-text queries and the code snippet to compare against.
# The placeholders show the example values used when a field is left empty.
_query1_box = gr.Text(
    label="query1",
    placeholder="How do I access the index while iterating over a sequence with a for loop?",
)
_query2_box = gr.Text(
    label="query2",
    placeholder="get a list of all the keys in a dictionary",
)
_code_box = gr.Text(
    label="code",
    placeholder="# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)",
)

# Output widget: one row per model with its two similarity scores.
_scores_table = gr.Dataframe(
    headers=["Model", "Query1 Score", "Query2 Score"],
    label="Similarity Scores",
)

# Wire the widgets to generate() and start serving the app.
demo = gr.Interface(
    fn=generate,
    inputs=[_query1_box, _query2_box, _code_box],
    outputs=[_scores_table],
)
demo.launch()