import spaces
import gradio as gr
from numpy.linalg import norm
from transformers import AutoModel, AutoTokenizer, AutoConfig
from sentence_transformers import SentenceTransformer
import torch
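
# Cosine similarity between two 1-D embedding vectors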
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
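
# Embedding models to compare; the Jina models ship custom modeling code, hence trust_remote_code=True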
model1 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-code", trust_remote_code=True)
model2 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", trust_remote_code=True)
model3 = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-base-zh", trust_remote_code=True)
model4 = SentenceTransformer("aspire/acge_text_embedding")
model5 = SentenceTransformer("intfloat/multilingual-e5-large")

# The Salesforce/codet5p-110m-embedding model needs special handling
config = AutoConfig.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-110m-embedding", trust_remote_code=True)
model6 = AutoModel.from_pretrained("Salesforce/codet5p-110m-embedding", config=config, trust_remote_code=True)

# A simple mean-pooling helper to turn token embeddings into a single embedding
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

@spaces.GPU
def generate(query1, query2, source_code):
    if len(query1) < 1:
        query1 = "How do I access the index while iterating over a sequence with a for loop?"
    if len(query2) < 1:
        query2 = "get a list of all the keys in a dictionary"
    if len(source_code) < 1:
        source_code = "# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)"
    results = []
    model_names = [
        "jinaai/jina-embeddings-v2-base-code",
        "jinaai/jina-embeddings-v2-base-en",
        "jinaai/jina-embeddings-v2-base-zh",
        "aspire/acge_text_embedding",
        "intfloat/multilingual-e5-large",
        "Salesforce/codet5p-110m-embedding",
    ]
    # The first five models all expose an encode() method that returns sentence embeddings
    for model, name in zip([model1, model2, model3, model4, model5], model_names[:-1]):
        embeddings = model.encode([query1, query2, source_code])
        score1 = cos_sim(embeddings[0], embeddings[2])
        score2 = cos_sim(embeddings[1], embeddings[2])
        results.append([name, float(score1), float(score2)])
    # Special handling for Salesforce/codet5p-110m-embedding: per its model card, the custom
    # forward already returns one pooled embedding per input, so mean pooling is only a fallback
    inputs = tokenizer([query1, query2, source_code], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        model_output = model6(**inputs)
    if torch.is_tensor(model_output) and model_output.dim() == 2:
        embeddings = model_output
    else:
        embeddings = mean_pooling(model_output, inputs["attention_mask"])
    score1 = cos_sim(embeddings[0], embeddings[2])
    score2 = cos_sim(embeddings[1], embeddings[2])
    results.append([model_names[-1], float(score1), float(score2)])
    return results
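
# Gradio UI: two natural-language queries plus one code snippet in, a table of cosine similarities out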
gr.Interface(
    fn=generate,
    inputs=[
        gr.Text(label="query1", placeholder="How do I access the index while iterating over a sequence with a for loop?"),
        gr.Text(label="query2", placeholder="get a list of all the keys in a dictionary"),
        gr.Text(label="code", placeholder="# Use the built-in enumerator\nfor idx, x in enumerate(xs):\n    print(idx, x)"),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Model", "Query1 Score", "Query2 Score"],
            label="Similarity Scores",
        )
    ],
).launch()