File size: 2,815 Bytes
cd2135c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
import sys
import time
import hashlib
import numpy as np
import requests
# Endpoint override and API key are read from the environment once, at import
# time; each falls back to the empty string when the variable is unset.
OPENAI_BASE_URL = os.getenv('OPENAI_BASE_URL', '')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')

# Directory that holds the on-disk embedding cache. It is created on import
# (including missing parents) so later reads/writes can assume it exists.
EMB_CACHE_DIR = os.getenv('EMB_CACHE_DIR', '.cache/embs')
os.makedirs(EMB_CACHE_DIR, exist_ok=True)
def uuid_for_text(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.

    The digest is a 32-character lowercase hex string; equal texts always
    map to the same value.
    """
    digest = hashlib.md5()
    digest.update(text.encode('utf8'))
    return digest.hexdigest()
def request_openai_emb(texts, model="text-embedding-3-large",
        base_url='https://api.openai.com', prefix_url='/v1/embeddings',
        timeout=4, retry=3, interval=2, caching=True):
    """Return embeddings for ``texts``, using the on-disk cache when enabled.

    Texts whose embedding is already cached are loaded from
    ``EMB_CACHE_DIR``; only the remaining texts are sent to the embeddings
    endpoint, and the two sets are merged back in the order of ``texts``.

    Args:
        texts: A single string or a sequence of at most 256 strings.
        model: Model name sent in the request payload.
        base_url: Endpoint root, used only when the ``OPENAI_BASE_URL``
            module constant is empty.
        prefix_url: Path appended to the endpoint root.
        timeout: Timeout passed to ``requests.post``.
        retry: Maximum number of request attempts.
        interval: Seconds to sleep between failed attempts.
        caching: When true, read embeddings from and write them to the
            cache directory.

    Returns:
        A list with one NumPy array per input text, in input order, or an
        empty list when the embeddings could not be obtained.

    Raises:
        AssertionError: If more than 256 texts are passed.
    """
    if isinstance(texts, str):
        texts = [texts]
    assert len(texts) <= 256

    if len(texts) == 0:
        return []

    # Slot i holds the embedding of texts[i]; None marks "not available yet".
    results = [None] * len(texts)
    cache_files = []
    if caching:
        # NOTE(review): the cache key is derived from the text alone, so
        # embeddings produced by different `model` values share cache files.
        cache_files = [f"{EMB_CACHE_DIR}/{uuid_for_text(text)}" for text in texts]
        for i, emb_file in enumerate(cache_files):
            if os.path.isfile(emb_file) and os.path.getsize(emb_file) > 0:
                results[i] = np.loadtxt(emb_file)

    missing = [i for i, emb in enumerate(results) if emb is None]
    if not missing:
        return results

    url = f"{OPENAI_BASE_URL}{prefix_url}" if OPENAI_BASE_URL else f"{base_url}{prefix_url}"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    # Request only what the cache could not provide. Previously a partial
    # cache hit skipped the request entirely and the call returned [].
    payload = {"input": [texts[i] for i in missing], "model": model}

    fetched = None
    remaining = retry
    while remaining > 0 and fetched is None:
        try:
            r = requests.post(url, headers=headers, json=payload,
                    timeout=timeout)
            res = r.json()
            # Order by the per-item index when the server reports one; the
            # sort is stable, so items without an index keep response order.
            items = sorted(res["data"], key=lambda item: item.get("index", 0))
            # Parse into a fresh list so a failure midway never leaves a
            # half-filled result behind.
            embs = [np.array(item["embedding"]) for item in items]
            if len(embs) != len(missing):
                raise ValueError(
                    f"expected {len(missing)} embeddings, got {len(embs)}")
            fetched = embs
        except Exception as e:
            # Deliberately broad: any transport or parsing failure is
            # reported and retried rather than propagated to the caller.
            print(f"request openai, retry {remaining}, error: {e}", file=sys.stderr)
            if remaining > 1:
                time.sleep(interval)
        remaining -= 1

    if fetched is None:
        return []

    for i, emb in zip(missing, fetched):
        results[i] = emb
        if caching:
            np.savetxt(cache_files[i], emb)

    return results
class OpenaiEmbModel:
    """Adapter exposing an ``encode`` method backed by ``request_openai_emb``."""

    def encode(self, sentences, batch_size=32, **kwargs):
        """Embed ``sentences`` batch by batch and return the embeddings.

        Args:
            sentences: Sliceable sequence of texts to embed.
            batch_size: Requested batch size; capped at 64 per request.
            **kwargs: Accepted and ignored.

        Returns:
            A flat list with one embedding per sentence, in input order.

        Raises:
            AssertionError: If a batch yields a different number of
                embeddings than texts.
        """
        step = min(64, batch_size)
        chunks = (
            sentences[start:start + step]
            for start in range(0, len(sentences), step)
        )

        collected = []
        for chunk in chunks:
            vectors = request_openai_emb(chunk,
                    caching=True, retry=3, interval=2)
            assert len(chunk) == len(vectors), "The batch of texts and embs DONT match!"
            collected += vectors
        return collected
# MTEB is used below but is not brought into scope by the file's import
# header, so it is imported here to keep this script section runnable.
from mteb import MTEB

model = OpenaiEmbModel()

# Name used for the results sub-folder. OpenaiEmbModel.encode does not pass a
# model to request_openai_emb, so that function's default model is the one
# being evaluated. Previously this name was undefined (NameError).
model_name = "text-embedding-3-large"

######
# Quick manual smoke test (uncomment to run)
#####
#embs = model.encode(['全国', '北京'])
#print(embs)

# Task types to evaluate.
task_list = ['Classification', 'Clustering', 'Reranking', 'Retrieval', 'STS', 'PairClassification']
# Languages to evaluate.
task_langs=["zh", "zh-CN"]

evaluation = MTEB(task_types=task_list, task_langs=task_langs)
# Only the last path component of the model name is used for the folder.
evaluation.run(model, output_folder=f"results/zh/{model_name.split('/')[-1]}")
|